From 62a584fe05eef1f80ed49a286a29328f1a224fb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 May 2016 14:34:46 -0400 Subject: writeback: use higher precision calculation in domain_dirty_limits() As vm.dirty_[background_]bytes can't be applied verbatim to multiple cgroup writeback domains, they get converted to percentages in domain_dirty_limits() and applied the same way as vm.dirty_[background]ratio. However, if the specified bytes is lower than 1% of available memory, the calculated ratios become zero and the writeback domain gets throttled constantly. Fix it by using per-PAGE_SIZE instead of percentage for ratio calculations. Also, the updated DIV_ROUND_UP() usages now should yield 1/4096 (0.0244%) as the minimum ratio as long as the specified bytes are above zero. Signed-off-by: Tejun Heo Reported-by: Miao Xie Link: http://lkml.kernel.org/g/57333E75.3080309@huawei.com Cc: stable@vger.kernel.org # v4.2+ Fixes: 9fc3a43e1757 ("writeback: separate out domain_dirty_limits()") Reviewed-by: Jan Kara Adjusted comment based on Jan's suggestion. Signed-off-by: Jens Axboe --- mm/page-writeback.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b9956fdee8f5..e2481949494c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -373,8 +373,9 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; - unsigned long ratio = vm_dirty_ratio; - unsigned long bg_ratio = dirty_background_ratio; + /* convert ratios to per-PAGE_SIZE for higher precision */ + unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; + unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; @@ -386,26 +387,28 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against - * globally available memory. + * globally available memory. As the ratios are in + * per-PAGE_SIZE, they can be obtained by dividing bytes by + * number of pages. */ if (bytes) - ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / - global_avail, 100UL); + ratio = min(DIV_ROUND_UP(bytes, global_avail), + PAGE_SIZE); if (bg_bytes) - bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 / - global_avail, 100UL); + bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), + PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - thresh = (ratio * available_memory) / 100; + thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - bg_thresh = (bg_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; if (bg_thresh >= thresh) bg_thresh = thresh / 2; -- cgit v1.2.3 From 87c279e613f848c691111b29d49de8df3f4f56da Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 1 Jun 2016 22:18:48 -0700 Subject: blk-mq: really fix plug list flushing for nomerge queues Commit 0809e3ac6231 ("block: fix plug list flushing for nomerge queues") updated blk_mq_make_request() to set request_count even when blk_queue_nomerges() returns true. However, blk_mq_make_request() only does limited plugging and doesn't use request_count; blk_sq_make_request() is the one that should have been fixed. Do that and get rid of the unnecessary work in the mq version. Fixes: 0809e3ac6231 ("block: fix plug list flushing for nomerge queues") Signed-off-by: Omar Sandoval Reviewed-by: Ming Lei Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 29cbc1b5fbdb..f9b9049b1284 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1262,12 +1262,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q)) { - if (blk_attempt_plug_merge(q, bio, &request_count, - &same_queue_rq)) - return BLK_QC_T_NONE; - } else - request_count = blk_plug_queued_count(q); + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return BLK_QC_T_NONE; rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) @@ -1358,9 +1355,11 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return BLK_QC_T_NONE; + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) -- cgit v1.2.3 From 05bd92dddc595d74ea645e793c1f3bd4b1fc251a Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 7 Jun 2016 11:32:13 -0500 Subject: block: missing bio_put following submit_bio_wait submit_bio_wait() gives the caller an opportunity to examine struct bio and so expects the caller to issue the put_bio() This fixes a memory leak reported by a few people in 4.7-rc2 kmemleak report after 9082e87bfbf8 ("block: remove struct bio_batch") Signed-off-by: Shaun Tancheff Tested-by: Catalin Marinas Tested-by: Larry Finger@lwfinger.net Tested-by: David Drysdale Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-lib.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 23d7f301a196..9e29dc351695 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -113,6 +113,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, ret = submit_bio_wait(type, bio); if (ret == -EOPNOTSUPP) ret = 0; + bio_put(bio); } blk_finish_plug(&plug); @@ -165,8 +166,10 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, } } - if (bio) + if (bio) { ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio); + bio_put(bio); + } return ret != -EOPNOTSUPP ? ret : 0; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -206,8 +209,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, } } - if (bio) - return submit_bio_wait(WRITE, bio); + if (bio) { + ret = submit_bio_wait(WRITE, bio); + bio_put(bio); + return ret; + } return 0; } -- cgit v1.2.3 From d366a0ff1cf73f93796f2377e7b0361a94c41c35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 Jun 2016 10:32:10 -0400 Subject: nbd: pass the nbd pointer for flags debugfs We were passing in &nbd for the private data in debugfs_create_file() for the flags entry. We expect it to just be nbd, fix this so we get proper output from this debugfs entry. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 31e73a7a40f2..6a48ed41963f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -941,7 +941,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); - debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); + debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); return 0; } -- cgit v1.2.3 From efd1535270c1deb0487527bf0c3c827301a69c93 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 7 Jun 2016 10:43:15 -0400 Subject: xen-blkfront: don't call talk_to_blkback when already connected to blkback Sometimes blkfront may twice receive blkback_changed() notification (XenbusStateConnected) after migration, which will cause talk_to_blkback() to be called twice too and confuse xen-blkback. The flow is as follow: blkfront blkback blkfront_resume() > talk_to_blkback() > Set blkfront to XenbusStateInitialised front changed() > Connect() > Set blkback to XenbusStateConnected blkback_changed() > Skip talk_to_blkback() because frontstate == XenbusStateInitialised > blkfront_connect() > Set blkfront to XenbusStateConnected ----- And here we get another XenbusStateConnected notification leading to: ----- blkback_changed() > because now frontstate != XenbusStateInitialised talk_to_blkback() is also called again > blkfront state changed from XenbusStateConnected to XenbusStateInitialised (Which is not correct!) front_changed(): > Do nothing because blkback already in XenbusStateConnected Now blkback is in XenbusStateConnected but blkfront is still in XenbusStateInitialised - leading to no disks. Poking of the XenbusStateConnected state is allowed (to deal with block disk change) and has to be dealt with. The most likely cause of this bug are custom udev scripts hooking up the disks and then validating the size. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ca13df854639..6ba8891e8efe 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2485,10 +2485,23 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: - if (dev->state != XenbusStateInitialised) { + /* + * talk_to_blkback sets state to XenbusStateInitialised + * and blkfront_connect sets it to XenbusStateConnected + * (if connection went OK). + * + * If the backend (or toolstack) decides to poke at backend + * state (and re-trigger the watch by setting the state repeatedly + * to XenbusStateConnected (4)) we need to deal with this. + * This is allowed as this is used to communicate to the guest + * that the size of disk has changed! + */ + if ((dev->state != XenbusStateInitialised) && + (dev->state != XenbusStateConnected)) { if (talk_to_blkback(dev, info)) break; } + blkfront_connect(info); break; -- cgit v1.2.3 From 2a6f71ad99cabe436e70c3f5fcf58072cb3bc07f Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 31 May 2016 16:59:17 +0800 Subject: xen-blkfront: fix resume issues after a migration After a migrate to another host (which may not have multiqueue support), the number of rings (block hardware queues) may be changed and the ring info structure will also be reallocated. This patch fixes two related bugs: * call blk_mq_update_nr_hw_queues() to make blk-core know the number of hardware queues have been changed. * Don't store rinfo pointer to hctx->driver_data, because rinfo may be reallocated so use hctx->queue_num to get the rinfo structure instead. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6ba8891e8efe..2e6d1e9c3345 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -874,8 +874,12 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { unsigned long flags; - struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; + int qid = hctx->queue_num; + struct blkfront_info *info = hctx->queue->queuedata; + struct blkfront_ring_info *rinfo = NULL; + BUG_ON(info->nr_rings <= qid); + rinfo = &info->rinfo[qid]; blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); if (RING_FULL(&rinfo->ring)) @@ -901,20 +905,9 @@ out_busy: return BLK_MQ_RQ_QUEUE_BUSY; } -static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int index) -{ - struct blkfront_info *info = (struct blkfront_info *)data; - - BUG_ON(info->nr_rings <= index); - hctx->driver_data = &info->rinfo[index]; - return 0; -} - static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, - .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -950,6 +943,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return PTR_ERR(rq); } + rq->queuedata = info; queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); if (info->feature_discard) { @@ -2149,6 +2143,8 @@ static int blkfront_resume(struct xenbus_device *dev) return err; err = talk_to_blkback(dev, info); + if (!err) + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); /* * We have to wait for the backend to switch to -- cgit v1.2.3 From edb50a5403d2e2d2b2b63a8365c4378c9c300ed6 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 10 May 2016 15:14:28 +0200 Subject: NVMe: Only release requested regions The NVMe driver only requests the PCIe device's memory regions but releases all possible regions (including eventual I/O regions). This leads to a stale warning entry in dmesg about freeing non existent resources. Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 78dca3193ca4..befac5b19490 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1679,9 +1679,14 @@ static int nvme_pci_enable(struct nvme_dev *dev) static void nvme_dev_unmap(struct nvme_dev *dev) { + struct pci_dev *pdev = to_pci_dev(dev->dev); + int bars; + if (dev->bar) iounmap(dev->bar); - pci_release_regions(to_pci_dev(dev->dev)); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + pci_release_selected_regions(pdev, bars); } static void nvme_pci_disable(struct nvme_dev *dev) @@ -1924,7 +1929,7 @@ static int nvme_dev_map(struct nvme_dev *dev) return 0; release: - pci_release_regions(pdev); + pci_release_selected_regions(pdev, bars); return -ENODEV; } -- cgit v1.2.3