From a1c00f78c064882b5712e0c0c0c0b827ac9e9938 Mon Sep 17 00:00:00 2001
From: John Meneghini
Date: Fri, 11 Apr 2025 14:00:52 -0400
Subject: [PATCH 1/2] nvme-core: measure and report namespace scan time

From: Maurizio Lombardi

Add instrumentation to report the total time elapsed during async
namespace scanning.
---
 drivers/nvme/host/core.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b502ac07483ba8..d943e61f10a604 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -4168,6 +4169,8 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 	int ret = 0, i;
 	ASYNC_DOMAIN(domain);
 	struct async_scan_info scan_info;
+	ktime_t start, end;
+	s64 elapsed_ms;
 
 	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 	if (!ns_list)
@@ -4175,6 +4178,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 
 	scan_info.ctrl = ctrl;
 	scan_info.ns_list = ns_list;
+	start = ktime_get();
 	for (;;) {
 		struct nvme_command cmd = {
 			.identify.opcode = nvme_admin_identify,
@@ -4208,6 +4212,9 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 free:
 	async_synchronize_full_domain(&domain);
 	kfree(ns_list);
+	end = ktime_get();
+	elapsed_ms = ktime_to_ms(ktime_sub(end, start));
+	dev_info(ctrl->device,"namespace scan time: %lld ms\n", elapsed_ms);
 	return ret;
 }

From e2bde8c7ee55dcc1a0c9ed186ed357f487df53f0 Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi
Date: Fri, 11 Apr 2025 12:28:44 +0200
Subject: [PATCH 2/2] nvme-core: allocate the namespaces in order

I've done some performance tests. The results are better than expected,
there are practically no significant differences between the fully async
mode in use in the upstream kernel and the "chained" tasks implemented
here.

Performance is measured by using this patch:
http://bsdbackstore.eu/misc/async_scan_perf_test.patch

High latency NVMe/TCP, ~100ms ping, 100 namespaces

Synchronous namespace scan (RHEL-9.6): 31175ms
Fully async namespace scan (6.15-rc1): 1563ms
Async namespace scan with dependency chain (6.15-rc1 + my patch): 1554ms

Low latency NVMe/TCP, ~0.4ms ping, 100 namespaces

Synchronous namespace scan (RHEL-9.6): 594ms
Fully async namespace scan (6.15-rc1): 156ms
Async namespace scan with dependency chain (6.15-rc1 + my patch): 160ms

NVMe-PCI, 2 namespaces:

Synchronous namespace scan (RHEL-9.6): 84ms
Fully async namespace scan (6.15-rc1): 121ms
Async namespace scan with dependency chain (6.15-rc1 + my patch): 123ms

Signed-off-by: Maurizio Lombardi
[jmeneghi: rebased to nvme-6.5 and fixed compile warning]
Co-developed-by: John Meneghini
Signed-off-by: John Meneghini
---
 drivers/nvme/host/core.c | 172 ++++++++++++++++++++++++++++-----------
 drivers/nvme/host/nvme.h |   2 +
 2 files changed, 125 insertions(+), 49 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d943e61f10a604..18116cc0ace77a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3877,16 +3877,11 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 	list_add(&ns->list, &ns->ctrl->namespaces);
 }
 
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
+static void nvme_init_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+		struct nvme_ns_info *info)
 {
 	struct queue_limits lim = { };
-	struct nvme_ns *ns;
 	struct gendisk *disk;
-	int node = ctrl->numa_node;
-
-	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
-	if (!ns)
-		return;
 
 	if (ctrl->opts && ctrl->opts->data_digest)
 		lim.features |= BLK_FEAT_STABLE_WRITES;
@@ -3902,11 +3897,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	ns->disk = disk;
 	ns->queue = disk->queue;
 
-	ns->ctrl = ctrl;
-	kref_init(&ns->kref);
-
-	if (nvme_init_ns_head(ns, info))
-		goto out_cleanup_disk;
 
 	/*
 	 * If multipathing is enabled, the device name for all disks and not
@@ -3979,7 +3969,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 		list_del_init(&ns->head->entry);
 	mutex_unlock(&ctrl->subsys->lock);
 	nvme_put_ns_head(ns->head);
- out_cleanup_disk:
 	put_disk(disk);
  out_free_ns:
 	kfree(ns);
@@ -4066,19 +4055,106 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
 		nvme_ns_remove(ns);
 }
 
-static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+/**
+ * struct async_scan_task - keeps track of controller & NSID to scan
+ * @ctrl: Controller on which namespaces are being scanned
+ * @nsid: The NSID to scan
+ * @prev_finished: Set when the namespace scan has been completed
+ * @head: Linked list of the scan asynchronous tasks
+ */
+struct async_scan_task {
+	struct nvme_ctrl *ctrl;
+	u32 nsid;
+	struct completion prev_finished;
+	struct list_head head;
+};
+
+static struct async_scan_task *async_scan_task_init(struct nvme_ctrl *ctrl,
+		u32 nsid)
+{
+	struct async_scan_task *task = kzalloc(sizeof(*task), GFP_KERNEL);
+	if (!task)
+		return NULL;
+
+	task->ctrl = ctrl;
+	task->nsid = nsid;
+	init_completion(&task->prev_finished);
+	INIT_LIST_HEAD(&task->head);
+
+	spin_lock(&ctrl->scan_list_lock);
+	list_add_tail(&task->head, &ctrl->scan_list);
+	if (list_is_first(&task->head, &ctrl->scan_list))
+		complete_all(&task->prev_finished);
+	spin_unlock(&ctrl->scan_list_lock);
+
+	return task;
+}
+
+static void async_scan_prev_task_wait(struct async_scan_task *task)
+{
+	if (!task)
+		return;
+
+	wait_for_completion(&task->prev_finished);
+}
+
+static void async_scan_task_complete(struct async_scan_task *task)
+{
+	struct nvme_ctrl *ctrl;
+
+	if (!task)
+		return;
+
+	ctrl = task->ctrl;
+	async_scan_prev_task_wait(task);
+
+	spin_lock(&ctrl->scan_list_lock);
+	list_del(&task->head);
+	if (!list_empty(&ctrl->scan_list)) {
+		struct async_scan_task *next = list_entry(ctrl->scan_list.next,
+				struct async_scan_task, head);
+		complete_all(&next->prev_finished);
+	}
+	spin_unlock(&ctrl->scan_list_lock);
+	kfree(task);
+}
+
+
+static struct nvme_ns *nvme_alloc_ns(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
+{
+	struct nvme_ns *ns;
+	int node = ctrl->numa_node;
+
+	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
+	if (!ns)
+		return NULL;
+
+	ns->ctrl = ctrl;
+	kref_init(&ns->kref);
+
+	if (nvme_init_ns_head(ns, info)) {
+		kfree(ns);
+		return NULL;
+	}
+
+	return ns;
+}
+
+static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+		struct async_scan_task *task)
 {
 	struct nvme_ns_info info = { .nsid = nsid };
 	struct nvme_ns *ns;
 	int ret = 1;
 
 	if (nvme_identify_ns_descs(ctrl, &info))
-		return;
+		goto exit;
 
 	if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
 		dev_warn(ctrl->device,
 			"command set not reported for nsid: %d\n", nsid);
-		return;
+		goto exit;
 	}
 
 	/*
@@ -4101,44 +4177,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	 * becomes ready and restart the scan.
 	 */
 	if (ret || !info.is_ready)
-		return;
+		goto exit;
 
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
+		async_scan_task_complete(task);
 		nvme_validate_ns(ns, &info);
 		nvme_put_ns(ns);
 	} else {
-		nvme_alloc_ns(ctrl, &info);
+		/*
+		 * wait for the previous async task to finish
+		 * before allocating the namespace.
+		 */
+		async_scan_prev_task_wait(task);
+		ns = nvme_alloc_ns(ctrl, &info);
+		async_scan_task_complete(task);
+		if (ns)
+			nvme_init_ns(ctrl, ns, &info);
 	}
-}
+	return;
 
-/**
- * struct async_scan_info - keeps track of controller & NSIDs to scan
- * @ctrl: Controller on which namespaces are being scanned
- * @next_nsid: Index of next NSID to scan in ns_list
- * @ns_list: Pointer to list of NSIDs to scan
- *
- * Note: There is a single async_scan_info structure shared by all instances
- * of nvme_scan_ns_async() scanning a given controller, so the atomic
- * operations on next_nsid are critical to ensure each instance scans a unique
- * NSID.
- */
-struct async_scan_info {
-	struct nvme_ctrl *ctrl;
-	atomic_t next_nsid;
-	__le32 *ns_list;
-};
+exit:
+	async_scan_task_complete(task);
+}
 
 static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
 {
-	struct async_scan_info *scan_info = data;
-	int idx;
-	u32 nsid;
+	struct async_scan_task *task = data;
 
-	idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
-	nsid = le32_to_cpu(scan_info->ns_list[idx]);
-
-	nvme_scan_ns(scan_info->ctrl, nsid);
+	nvme_scan_ns(task->ctrl, task->nsid, task);
 }
 
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -4168,7 +4235,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 	u32 prev = 0;
 	int ret = 0, i;
 	ASYNC_DOMAIN(domain);
-	struct async_scan_info scan_info;
+	struct async_scan_task *task;
 	ktime_t start, end;
 	s64 elapsed_ms;
 
@@ -4176,8 +4243,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 	if (!ns_list)
 		return -ENOMEM;
 
-	scan_info.ctrl = ctrl;
-	scan_info.ns_list = ns_list;
 	start = ktime_get();
 	for (;;) {
 		struct nvme_command cmd = {
@@ -4194,23 +4259,30 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 			goto free;
 		}
 
-		atomic_set(&scan_info.next_nsid, 0);
 		for (i = 0; i < nr_entries; i++) {
 			u32 nsid = le32_to_cpu(ns_list[i]);
 
 			if (!nsid)	/* end of the list? */
 				goto out;
-			async_schedule_domain(nvme_scan_ns_async, &scan_info,
+
+			task = async_scan_task_init(ctrl, nsid);
+			if (!task) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			async_schedule_domain(nvme_scan_ns_async, task,
 					&domain);
 			while (++prev < nsid)
 				nvme_ns_remove_by_nsid(ctrl, prev);
 		}
-		async_synchronize_full_domain(&domain);
 	}
 out:
+	async_synchronize_full_domain(&domain);
 	nvme_remove_invalid_namespaces(ctrl, prev);
 free:
 	async_synchronize_full_domain(&domain);
+	WARN_ON_ONCE(!list_empty(&ctrl->scan_list));
 	kfree(ns_list);
 	end = ktime_get();
 	elapsed_ms = ktime_to_ms(ktime_sub(end, start));
@@ -4229,7 +4301,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
 	kfree(id);
 
 	for (i = 1; i <= nn; i++)
-		nvme_scan_ns(ctrl, i);
+		nvme_scan_ns(ctrl, i, NULL);
 
 	nvme_remove_invalid_namespaces(ctrl, nn);
 }
@@ -4849,6 +4921,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
+	INIT_LIST_HEAD(&ctrl->scan_list);
+	spin_lock_init(&ctrl->scan_list_lock);
 	xa_init(&ctrl->cels);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 51e07864212710..5ec40d96649eac 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -294,6 +294,8 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
+	struct list_head scan_list;
+	spinlock_t scan_list_lock;
 	struct mutex namespaces_lock;
 	struct srcu_struct srcu;
 	struct device ctrl_device;