aboutsummaryrefslogtreecommitdiff
path: root/drivers/nvme/host/pci.c
diff options
context:
space:
mode:
authorGravatar Linus Torvalds <torvalds@linux-foundation.org> 2023-02-20 14:27:21 -0800
committerGravatar Linus Torvalds <torvalds@linux-foundation.org> 2023-02-20 14:27:21 -0800
commit5b0ed5964928b0aaf0d644c17c886c7f5ea4bb3f (patch)
tree02df7848b8c28552039bf463e0034f5d5518b2a9 /drivers/nvme/host/pci.c
parentMerge tag 'for-6.3/dio-2023-02-16' of git://git.kernel.dk/linux (diff)
parentbrd: use radix_tree_maybe_preload instead of radix_tree_preload (diff)
downloadlinux-5b0ed5964928b0aaf0d644c17c886c7f5ea4bb3f.tar.gz
linux-5b0ed5964928b0aaf0d644c17c886c7f5ea4bb3f.tar.bz2
linux-5b0ed5964928b0aaf0d644c17c886c7f5ea4bb3f.zip
Merge tag 'for-6.3/block-2023-02-16' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - NVMe updates via Christoph: - Small improvements to the logging functionality (Amit Engel) - Authentication cleanups (Hannes Reinecke) - Cleanup and optimize the DMA mapping cod in the PCIe driver (Keith Busch) - Work around the command effects for Format NVM (Keith Busch) - Misc cleanups (Keith Busch, Christoph Hellwig) - Fix and cleanup freeing single sgl (Keith Busch) - MD updates via Song: - Fix a rare crash during the takeover process - Don't update recovery_cp when curr_resync is ACTIVE - Free writes_pending in md_stop - Change active_io to percpu - Updates to drbd, inching us closer to unifying the out-of-tree driver with the in-tree one (Andreas, Christoph, Lars, Robert) - BFQ update adding support for multi-actuator drives (Paolo, Federico, Davide) - Make brd compliant with REQ_NOWAIT (me) - Fix for IOPOLL and queue entering, fixing stalled IO waiting on timeouts (me) - Fix for REQ_NOWAIT with multiple bios (me) - Fix memory leak in blktrace cleanup (Greg) - Clean up sbitmap and fix a potential hang (Kemeng) - Clean up some bits in BFQ, and fix a bug in the request injection (Kemeng) - Clean up the request allocation and issue code, and fix some bugs related to that (Kemeng) - ublk updates and fixes: - Add support for unprivileged ublk (Ming) - Improve device deletion handling (Ming) - Misc (Liu, Ziyang) - s390 dasd fixes (Alexander, Qiheng) - Improve utility of request caching and fixes (Anuj, Xiao) - zoned cleanups (Pankaj) - More constification for kobjs (Thomas) - blk-iocost cleanups (Yu) - Remove bio splitting from drivers that don't need it (Christoph) - Switch blk-cgroups to use struct gendisk. Some of this is now incomplete as select late reverts were done. (Christoph) - Add bvec initialization helpers, and convert callers to use that rather than open-coding it (Christoph) - Misc fixes and cleanups (Jinke, Keith, Arnd, Bart, Li, Martin, Matthew, Ulf, Zhong) * tag 'for-6.3/block-2023-02-16' of git://git.kernel.dk/linux: (169 commits) brd: use radix_tree_maybe_preload instead of radix_tree_preload block: use proper return value from bio_failfast() block: bio-integrity: Copy flags when bio_integrity_payload is cloned block: Fix io statistics for cgroup in throttle path brd: mark as nowait compatible brd: check for REQ_NOWAIT and set correct page allocation mask brd: return 0/-error from brd_insert_page() block: sync mixed merged request's failfast with 1st bio's Revert "blk-cgroup: pin the gendisk in struct blkcg_gq" Revert "blk-cgroup: pass a gendisk to blkg_lookup" Revert "blk-cgroup: delay blk-cgroup initialization until add_disk" Revert "blk-cgroup: delay calling blkcg_exit_disk until disk_release" Revert "blk-cgroup: move the cgroup information to struct gendisk" nvme-pci: remove iod use_sgls nvme-pci: fix freeing single sgl block: ublk: check IO buffer based on flag need_get_data s390/dasd: Fix potential memleak in dasd_eckd_init() s390/dasd: sort out physical vs virtual pointers usage block: Remove the ALLOC_CACHE_SLACK constant block: make kobj_type structures constant ...
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r--drivers/nvme/host/pci.c104
1 files changed, 26 insertions, 78 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c11e0cfeef0f..5b95c94ee40f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -42,8 +42,9 @@
* These can be higher, but we need to ensure that any command doesn't
* require an sg allocation that needs more than a page of data.
*/
-#define NVME_MAX_KB_SZ 4096
-#define NVME_MAX_SEGS 127
+#define NVME_MAX_KB_SZ 8192
+#define NVME_MAX_SEGS 128
+#define NVME_MAX_NR_ALLOCATIONS 5
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@@ -216,6 +217,11 @@ struct nvme_queue {
struct completion delete_done;
};
+union nvme_descriptor {
+ struct nvme_sgl_desc *sg_list;
+ __le64 *prp_list;
+};
+
/*
* The nvme_iod describes the data in an I/O.
*
@@ -225,7 +231,6 @@ struct nvme_queue {
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
- bool use_sgl;
bool aborted;
s8 nr_allocations; /* PRP list pool allocations. 0 means small
pool in use */
@@ -233,6 +238,7 @@ struct nvme_iod {
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
+ union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -387,16 +393,6 @@ static int nvme_pci_npages_prp(void)
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
}
-/*
- * Calculates the number of pages needed for the SGL segments. For example a 4k
- * page can accommodate 256 SGL descriptors.
- */
-static int nvme_pci_npages_sgl(void)
-{
- return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
- NVME_CTRL_PAGE_SIZE);
-}
-
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
@@ -510,16 +506,10 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
-static void **nvme_pci_iod_list(struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req));
-}
-
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
+ int nseg)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- int nseg = blk_rq_nr_phys_segments(req);
unsigned int avg_seg_size;
avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
@@ -541,7 +531,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
int i;
for (i = 0; i < iod->nr_allocations; i++) {
- __le64 *prp_list = nvme_pci_iod_list(req)[i];
+ __le64 *prp_list = iod->list[i].prp_list;
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
@@ -549,22 +539,6 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
}
}
-static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
-{
- const int last_sg = SGES_PER_PAGE - 1;
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
- int i;
-
- for (i = 0; i < iod->nr_allocations; i++) {
- struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
- dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
-
- dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
- dma_addr = next_dma_addr;
- }
-}
-
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -580,10 +554,11 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
if (iod->nr_allocations == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+ dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
+ iod->first_dma);
+ else if (iod->nr_allocations == 1)
+ dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
iod->first_dma);
- else if (iod->use_sgl)
- nvme_free_sgls(dev, req);
else
nvme_free_prps(dev, req);
mempool_free(iod->sgt.sgl, dev->iod_mempool);
@@ -614,7 +589,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
u64 dma_addr = sg_dma_address(sg);
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
- void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
int nprps, i;
@@ -652,7 +626,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
iod->nr_allocations = -1;
return BLK_STS_RESOURCE;
}
- list[0] = prp_list;
+ iod->list[0].prp_list = prp_list;
iod->first_dma = prp_dma;
i = 0;
for (;;) {
@@ -661,7 +635,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
goto free_prps;
- list[iod->nr_allocations++] = prp_list;
+ iod->list[iod->nr_allocations++].prp_list = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
i = 1;
@@ -706,13 +680,8 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
dma_addr_t dma_addr, int entries)
{
sge->addr = cpu_to_le64(dma_addr);
- if (entries < SGES_PER_PAGE) {
- sge->length = cpu_to_le32(entries * sizeof(*sge));
- sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
- } else {
- sge->length = cpu_to_le32(NVME_CTRL_PAGE_SIZE);
- sge->type = NVME_SGL_FMT_SEG_DESC << 4;
- }
+ sge->length = cpu_to_le32(entries * sizeof(*sge));
+ sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
@@ -748,34 +717,16 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_RESOURCE;
}
- nvme_pci_iod_list(req)[0] = sg_list;
+ iod->list[0].sg_list = sg_list;
iod->first_dma = sgl_dma;
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
-
do {
- if (i == SGES_PER_PAGE) {
- struct nvme_sgl_desc *old_sg_desc = sg_list;
- struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
-
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list)
- goto free_sgls;
-
- i = 0;
- nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list;
- sg_list[i++] = *link;
- nvme_pci_sgl_set_seg(link, sgl_dma, entries);
- }
-
nvme_pci_sgl_set_data(&sg_list[i++], sg);
sg = sg_next(sg);
} while (--entries > 0);
return BLK_STS_OK;
-free_sgls:
- nvme_free_sgls(dev, req);
- return BLK_STS_RESOURCE;
}
static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
@@ -857,8 +808,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
goto out_free_sg;
}
- iod->use_sgl = nvme_pci_use_sgls(dev, req);
- if (iod->use_sgl)
+ if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
@@ -2706,11 +2656,8 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
- size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
- size_t alloc_size = sizeof(__le64 *) * npages +
- sizeof(struct scatterlist) * NVME_MAX_SEGS;
+ size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
- WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->iod_mempool = mempool_create_node(1,
mempool_kmalloc, mempool_kfree,
(void *)alloc_size, GFP_KERNEL,
@@ -3538,8 +3485,9 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) >
- S8_MAX);
+ BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
+ BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
+ BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
return pci_register_driver(&nvme_driver);
}