From b41fdc4a7bf9045e4871c5b15905ea732ffd044f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 11 Mar 2019 15:38:10 +0000 Subject: irqchip/gic: Drop support for secondary GIC in non-DT systems We do not have any in-tree platform with this pathological setup, and only a single system (Cavium's cns3xxx) isn't DT aware. Let's drop the secondary GIC support for now, until we remove the above horror altogether. Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 626179077bb0..0f049b384ccd 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -158,8 +158,7 @@ int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq); * Legacy platforms not converted to DT yet must use this to init * their GIC */ -void gic_init(unsigned int nr, int start, - void __iomem *dist , void __iomem *cpu); +void gic_init(void __iomem *dist , void __iomem *cpu); int gicv2m_init(struct fwnode_handle *parent_handle, struct irq_domain *parent); -- cgit v1.2.3-59-g8ed1b From 8e44fc85060ec997e9c6f3c49a04274db6621d26 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Mar 2019 16:44:30 +0200 Subject: auxdisplay: charlcd: Introduce charlcd_free() helper The charlcd_free() is a counterpart to charlcd_alloc() and should be called symmetrically on tear down. Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/charlcd.c | 6 ++++++ include/misc/charlcd.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index 407acd22efa8..a351a9400054 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -818,6 +818,12 @@ struct charlcd *charlcd_alloc(unsigned int drvdata_size) } EXPORT_SYMBOL_GPL(charlcd_alloc); +void charlcd_free(struct charlcd *lcd) +{ + kfree(charlcd_to_priv(lcd)); +} +EXPORT_SYMBOL_GPL(charlcd_free); + static int panel_notify_sys(struct notifier_block *this, unsigned long code, void *unused) { diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h index 23f61850f363..1832402324ce 100644 --- a/include/misc/charlcd.h +++ b/include/misc/charlcd.h @@ -35,6 +35,7 @@ struct charlcd_ops { }; struct charlcd *charlcd_alloc(unsigned int drvdata_size); +void charlcd_free(struct charlcd *lcd); int charlcd_register(struct charlcd *lcd); int charlcd_unregister(struct charlcd *lcd); -- cgit v1.2.3-59-g8ed1b From c5ae1954c47d3fd8815bd5a592aba18702c93f33 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 6 Mar 2019 19:21:42 +0200 Subject: IB/mlx5: Use mlx5 core to create/destroy a DEVX DCT To prevent a hardware memory leak when a DEVX DCT object is destroyed without calling DRAIN DCT before, (e.g. under cleanup flow), need to manage its creation and destruction via mlx5 core. In that case the DRAIN DCT command will be called and only once that it will be completed the DESTROY DCT command will be called. Otherwise, the DESTROY DCT may fail and a hardware leak may occur. As of that change the DRAIN DCT command should not be exposed any more from DEVX, it's managed internally by the driver to work as expected by the device specification. Fixes: 7efce3691d33 ("IB/mlx5: Add obj create and destroy functionality") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 34 +++++++++++++++++++++------- drivers/infiniband/hw/mlx5/qp.c | 4 +++- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 6 ++--- include/linux/mlx5/qp.h | 3 ++- 4 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index eaa055007f28..9e08df7914aa 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -20,6 +20,7 @@ enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, + DEVX_OBJ_FLAGS_DCT = 1 << 1, }; struct devx_async_data { @@ -39,7 +40,10 @@ struct devx_obj { u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; u32 flags; - struct mlx5_ib_devx_mr devx_mr; + union { + struct mlx5_ib_devx_mr devx_mr; + struct mlx5_core_dct core_dct; + }; }; struct devx_umem { @@ -347,7 +351,6 @@ static u64 devx_get_obj_id(const void *in) obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, MLX5_GET(arm_rq_in, in, srq_number)); break; - case MLX5_CMD_OP_DRAIN_DCT: case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, MLX5_GET(drain_dct_in, in, dctn)); @@ -618,7 +621,6 @@ static bool devx_is_obj_modify_cmd(const void *in) case MLX5_CMD_OP_2RST_QP: case MLX5_CMD_OP_ARM_XRC_SRQ: case MLX5_CMD_OP_ARM_RQ: - case MLX5_CMD_OP_DRAIN_DCT: case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: case MLX5_CMD_OP_ARM_XRQ: case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY: @@ -1124,7 +1126,11 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); - ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (obj->flags & DEVX_OBJ_FLAGS_DCT) + ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + else + ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, + sizeof(out)); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; @@ -1185,9 +1191,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( devx_set_umem_valid(cmd_in); } - err = mlx5_cmd_exec(dev->mdev, cmd_in, - cmd_in_len, - cmd_out, cmd_out_len); + if (opcode == MLX5_CMD_OP_CREATE_DCT) { + obj->flags |= DEVX_OBJ_FLAGS_DCT; + err = mlx5_core_create_dct(dev->mdev, &obj->core_dct, + cmd_in, cmd_in_len, + cmd_out, cmd_out_len); + } else { + err = mlx5_cmd_exec(dev->mdev, cmd_in, + cmd_in_len, + cmd_out, cmd_out_len); + } + if (err) goto obj_free; @@ -1214,7 +1228,11 @@ err_copy: if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); obj_destroy: - mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (obj->flags & DEVX_OBJ_FLAGS_DCT) + mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + else + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, + sizeof(out)); obj_free: kfree(obj); return err; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6b1f0e76900b..7cd006da1dae 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3729,6 +3729,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; u32 min_resp_len = offsetof(typeof(resp), dctn) + sizeof(resp.dctn); @@ -3747,7 +3748,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, - MLX5_ST_SZ_BYTES(create_dct_in)); + MLX5_ST_SZ_BYTES(create_dct_in), out, + sizeof(out)); if (err) return err; resp.dctn = qp->dct.mdct.mqp.qpn; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index c7c2920c05c4..b8ba74de9555 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -263,16 +263,16 @@ destroy: int mlx5_core_create_dct(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, - u32 *in, int inlen) + u32 *in, int inlen, + u32 *out, int outlen) { - u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; struct mlx5_core_qp *qp = &dct->mqp; int err; init_completion(&dct->drained); MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, outlen); if (err) { mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); return err; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index b26ea9077384..0343c81d4c5f 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -557,7 +557,8 @@ static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, int mlx5_core_create_dct(struct mlx5_core_dev *dev, struct mlx5_core_dct *qp, - u32 *in, int inlen); + u32 *in, int inlen, + u32 *out, int outlen); int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, -- cgit v1.2.3-59-g8ed1b From 875f1d0769cdcfe1596ff0ca609b453359e42ec9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Feb 2019 13:05:25 -0700 Subject: iov_iter: add ITER_BVEC_FLAG_NO_REF flag For ITER_BVEC, if we're holding on to kernel pages, the caller doesn't need to grab a reference to the bvec pages, and drop that same reference on IO completion. This is essentially safe for any ITER_BVEC, but some use cases end up reusing pages and uncondtionally dropping a page reference on completion. And example of that is sendfile(2), that ends up being a splice_in + splice_out on the pipe pages. Add a flag that tells us it's fine to not grab a page reference to the bvec pages, since that caller knows not to drop a reference when it's done with the pages. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ include/linux/uio.h | 24 +++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c6a5e60ddbe..c592a0933b0d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -855,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); if (offset) iov_iter_advance(iter, offset); + + /* don't drop a reference to these pages */ + iter->type |= ITER_BVEC_FLAG_NO_REF; return 0; } diff --git a/include/linux/uio.h b/include/linux/uio.h index ecf584f6b82d..4e926641fa80 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -23,14 +23,23 @@ struct kvec { }; enum iter_type { - ITER_IOVEC = 0, - ITER_KVEC = 2, - ITER_BVEC = 4, - ITER_PIPE = 8, - ITER_DISCARD = 16, + /* set if ITER_BVEC doesn't hold a bv_page ref */ + ITER_BVEC_FLAG_NO_REF = 2, + + /* iter types */ + ITER_IOVEC = 4, + ITER_KVEC = 8, + ITER_BVEC = 16, + ITER_PIPE = 32, + ITER_DISCARD = 64, }; struct iov_iter { + /* + * Bit 0 is the read/write bit, set if we're writing. + * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and + * the caller isn't expecting to drop a page reference when done. + */ unsigned int type; size_t iov_offset; size_t count; @@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i) return i->type & (READ | WRITE); } +static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i) +{ + return (i->type & ITER_BVEC_FLAG_NO_REF) != 0; +} + /* * Total number of bytes covered by an iovec. * -- cgit v1.2.3-59-g8ed1b From 399254aaf4892113c806816f7e64cf40c804d46d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Feb 2019 13:13:23 -0700 Subject: block: add BIO_NO_PAGE_REF flag If bio_iov_iter_get_pages() is called on an iov_iter that is flagged with NO_REF, then we don't need to add a page reference for the pages that we add. Add BIO_NO_PAGE_REF to track this in the bio, so IO completion knows not to drop a reference to these pages. Signed-off-by: Jens Axboe --- block/bio.c | 43 ++++++++++++++++++++++++------------------- fs/block_dev.c | 12 +++++++----- fs/iomap.c | 12 +++++++----- include/linux/blk_types.h | 1 + 4 files changed, 39 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 71a78d9fb8b7..b64cedc7f87c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) size = bio_add_page(bio, bv->bv_page, len, bv->bv_offset + iter->iov_offset); if (size == len) { - struct page *page; - int i; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct page *page; + int i; + + mp_bvec_for_each_page(page, bv, i) + get_page(page); + } - /* - * For the normal O_DIRECT case, we could skip grabbing this - * reference and then not have to put them again when IO - * completes. But this breaks some in-kernel users, like - * splicing to/from a loop device, where we release the pipe - * pages unconditionally. If we can fix that case, we can - * get rid of the get here and the need to call - * bio_release_pages() at IO completion time. - */ - mp_bvec_for_each_page(page, bv, i) - get_page(page); iov_iter_advance(iter, size); return 0; } @@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. For now, when adding kernel pages, we still grab a reference to the - * page. This isn't strictly needed for the common case, but some call paths - * end up releasing pages from eg a pipe and we can't easily control these. - * See comment in __bio_iov_bvec_add_pages(). + * pages. If we're adding kernel pages, and the caller told us it's safe to + * do so, we just have to add the pages to the bio directly. We don't grab an + * extra reference to those pages (the user should already have that), and we + * don't put the page on IO completion. The caller needs to check if the bio is + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be + * released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in *iter, whatever is smaller. If @@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) const bool is_bvec = iov_iter_is_bvec(iter); unsigned short orig_vcnt = bio->bi_vcnt; + /* + * If this is a BVEC iter, then the pages are kernel pages. Don't + * release them on IO completion, if the caller asked us to. + */ + if (is_bvec && iov_iter_bvec_no_ref(iter)) + bio_set_flag(bio, BIO_NO_PAGE_REF); + do { int ret; @@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work) next = bio->bi_private; bio_set_pages_dirty(bio); - bio_release_pages(bio); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + bio_release_pages(bio); bio_put(bio); } } @@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio) goto defer; } - bio_release_pages(bio); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + bio_release_pages(bio); bio_put(bio); return; defer: diff --git a/fs/block_dev.c b/fs/block_dev.c index e9faa52bb489..78d3257435c0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } diff --git a/fs/iomap.c b/fs/iomap.c index 97cb9d486a7d..abdd18e404f8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d66bf5f32610..791fee35df88 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -215,6 +215,7 @@ struct bio { /* * bio flags */ +#define BIO_NO_PAGE_REF 0 /* don't put release vec pages */ #define BIO_SEG_VALID 1 /* bi_phys_segments valid */ #define BIO_CLONED 2 /* doesn't own data */ #define BIO_BOUNCED 3 /* bio is a bounce bio */ -- cgit v1.2.3-59-g8ed1b From 9496c015ed39ddfce971d63a1442e6d258504a7d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 19 Mar 2019 23:05:18 +0800 Subject: blk-mq: remove unused 'nr_expired' from blk_mq_hw_ctx There is no usage of 'nr_expired'. The 'nr_expired' was introduced by commit 1d9bd5161ba3 ("blk-mq: replace timeout synchronization with a RCU and generation based scheme"). Its usage was removed since commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"). Signed-off-by: Dongli Zhang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b0c814bcc7e3..35359697318b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -57,7 +57,6 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; - unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; -- cgit v1.2.3-59-g8ed1b From bb229bbb3bf63d23128e851a1f3b85c083178fa1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Mar 2019 09:46:58 +0100 Subject: libceph: wait for latest osdmap in ceph_monc_blacklist_add() Because map updates are distributed lazily, an OSD may not know about the new blacklist for quite some time after "osd blacklist add" command is completed. This makes it possible for a blacklisted but still alive client to overwrite a post-blacklist update, resulting in data corruption. Waiting for latest osdmap in ceph_monc_blacklist_add() and thus using the post-blacklist epoch for all post-blacklist requests ensures that all such requests "wait" for the blacklist to come into force on their respective OSDs. Cc: stable@vger.kernel.org Fixes: 6305a3b41515 ("libceph: support for blacklisting clients") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- include/linux/ceph/libceph.h | 2 ++ net/ceph/ceph_common.c | 18 +++++++++++++++++- net/ceph/mon_client.c | 9 +++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index a420c07904bc..337d5049ff93 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -294,6 +294,8 @@ extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout); /* pagevec.c */ extern void ceph_release_page_vector(struct page **pages, int num_pages); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 9cab80207ced..79eac465ec65 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -738,7 +738,6 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) } EXPORT_SYMBOL(__ceph_open_session); - int ceph_open_session(struct ceph_client *client) { int ret; @@ -754,6 +753,23 @@ int ceph_open_session(struct ceph_client *client) } EXPORT_SYMBOL(ceph_open_session); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout) +{ + u64 newest_epoch; + int ret; + + ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); + if (ret) + return ret; + + if (client->osdc.osdmap->epoch >= newest_epoch) + return 0; + + ceph_osdc_maybe_request_map(&client->osdc); + return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout); +} +EXPORT_SYMBOL(ceph_wait_for_latest_osdmap); static int __init init_ceph_lib(void) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 18deb3d889c4..a53e4fbb6319 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -922,6 +922,15 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc, mutex_unlock(&monc->mutex); ret = wait_generic_request(req); + if (!ret) + /* + * Make sure we have the osdmap that includes the blacklist + * entry. This is needed to ensure that the OSDs pick up the + * new blacklist before processing any future requests from + * this client. + */ + ret = ceph_wait_for_latest_osdmap(monc->client, 0); + out: put_generic_request(req); return ret; -- cgit v1.2.3-59-g8ed1b From 29ece8b4354f8c5eaee798a3d8a1b356efee426f Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Mon, 18 Mar 2019 22:44:41 +0800 Subject: block: add BLK_MQ_POLL_CLASSIC for hybrid poll and return EINVAL for unexpected value For q->poll_nsec == -1, means doing classic poll, not hybrid poll. We introduce a new flag BLK_MQ_POLL_CLASSIC to replace -1, which may make code much easier to read. Additionally, since val is an int obtained with kstrtoint(), val can be a negative value other than -1, so return -EINVAL for that case. Thanks to Damien Le Moal for some good suggestion. Reviewed-by: Damien Le Moal Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- block/blk-sysfs.c | 12 +++++++----- include/linux/blkdev.h | 3 +++ 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index ea01c23b58a3..76a3f78c566a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2856,7 +2856,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* * Default to classic polling */ - q->poll_nsec = -1; + q->poll_nsec = BLK_MQ_POLL_CLASSIC; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); @@ -3391,7 +3391,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, { struct request *rq; - if (q->poll_nsec == -1) + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) return false; if (!blk_qc_t_is_internal(cookie)) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 59685918167e..422327089e0f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -360,8 +360,8 @@ static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { int val; - if (q->poll_nsec == -1) - val = -1; + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) + val = BLK_MQ_POLL_CLASSIC; else val = q->poll_nsec / 1000; @@ -380,10 +380,12 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, if (err < 0) return err; - if (val == -1) - q->poll_nsec = -1; - else + if (val == BLK_MQ_POLL_CLASSIC) + q->poll_nsec = BLK_MQ_POLL_CLASSIC; + else if (val >= 0) q->poll_nsec = val * 1000; + else + return -EINVAL; return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0de92b29f589..5c58a3b2bf00 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -50,6 +50,9 @@ struct blk_stat_callback; /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 +/* Doing classic polling */ +#define BLK_MQ_POLL_CLASSIC -1 + /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. -- cgit v1.2.3-59-g8ed1b From e6c987120e24cb913cb7bd4e675129a30fa49e0d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 20 Mar 2019 13:14:37 -0700 Subject: block: Unexport blk_mq_add_to_requeue_list() This function is not used outside the block layer core. Hence unexport it. Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 -- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index 76a3f78c566a..70b210a308c4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -782,7 +782,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, if (kick_requeue_list) blk_mq_kick_requeue_list(q); } -EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { diff --git a/block/blk-mq.h b/block/blk-mq.h index c11353a3749d..0ed8e5a8729f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -41,6 +41,8 @@ void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 35359697318b..cb2aa7ecafff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -299,8 +299,6 @@ void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); -- cgit v1.2.3-59-g8ed1b From 551417af91b163bd697eb50b3601adae2177c28a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 18 Mar 2019 14:51:23 +0800 Subject: genirq: Fix typo in comment of IRQD_MOVE_PCNTXT Signed-off-by: Peter Xu Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Dou Liyang Cc: Julien Thierry Link: https://lkml.kernel.org/r/20190318065123.11862-1-peterx@redhat.com --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index d6160d479b14..7ae8de5ad0f2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -195,7 +195,7 @@ struct irq_data { * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend - * IRDQ_MOVE_PCNTXT - Interrupt can be moved in process + * IRQD_MOVE_PCNTXT - Interrupt can be moved in process * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt -- cgit v1.2.3-59-g8ed1b From 1e4471e74c75acb3f89959ffa02a241227937ae2 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Sat, 16 Mar 2019 16:24:37 +0800 Subject: sbitmap: trivial - update comment for sbitmap_deferred_clear_bit "sbitmap_batch_clear" should be "sbitmap_deferred_clear" Acked-by: Omar Sandoval Signed-off-by: Shenghui Wang Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 14d558146aea..20f3e3f029b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -330,7 +330,7 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with - * the caller doing sbitmap_batch_clear() if a given index is full, which + * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.3-59-g8ed1b From ffc8599aa9763f39f6736a79da4d1575e7006f9a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 8 Mar 2019 11:05:08 +0800 Subject: x86/gart: Exclude GART aperture from kcore On machines where the GART aperture is mapped over physical RAM, /proc/kcore contains the GART aperture range. Accessing the GART range via /proc/kcore results in a kernel crash. vmcore used to have the same issue, until it was fixed with commit 2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")', leveraging existing hook infrastructure in vmcore to let /proc/vmcore return zeroes when attempting to read the aperture region, and so it won't read from the actual memory. Apply the same workaround for kcore. First implement the same hook infrastructure for kcore, then reuse the hook functions introduced in the previous vmcore fix. Just with some minor adjustment, rename some functions for more general usage, and simplify the hook infrastructure a bit as there is no module usage yet. Suggested-by: Baoquan He Signed-off-by: Kairui Song Signed-off-by: Thomas Gleixner Reviewed-by: Jiri Bohac Acked-by: Baoquan He Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Omar Sandoval Cc: Dave Young Link: https://lkml.kernel.org/r/20190308030508.13548-1-kasong@redhat.com --- arch/x86/kernel/aperture_64.c | 20 +++++++++++++------- fs/proc/kcore.c | 27 +++++++++++++++++++++++++++ include/linux/kcore.h | 2 ++ 3 files changed, 42 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 58176b56354e..294ed4392a0e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) "AGP: " fmt #include +#include #include #include #include @@ -57,7 +58,7 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -#ifdef CONFIG_PROC_VMCORE +#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE) /* * If the first kernel maps the aperture over e820 RAM, the kdump kernel will * use the same range because it will remain configured in the northbridge. @@ -66,20 +67,25 @@ int fix_aperture __initdata = 1; */ static unsigned long aperture_pfn_start, aperture_page_count; -static int gart_oldmem_pfn_is_ram(unsigned long pfn) +static int gart_mem_pfn_is_ram(unsigned long pfn) { return likely((pfn < aperture_pfn_start) || (pfn >= aperture_pfn_start + aperture_page_count)); } -static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; - WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); +#ifdef CONFIG_PROC_VMCORE + WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif +#ifdef CONFIG_PROC_KCORE + WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif } #else -static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +static void exclude_from_core(u64 aper_base, u32 aper_order) { } #endif @@ -474,7 +480,7 @@ out: * may have allocated the range over its e820 RAM * and fixed up the northbridge */ - exclude_from_vmcore(last_aper_base, last_aper_order); + exclude_from_core(last_aper_base, last_aper_order); return 1; } @@ -520,7 +526,7 @@ out: * overlap with the first kernel's memory. We can't access the * range through vmcore even though it should be part of the dump. */ - exclude_from_vmcore(aper_alloc, aper_order); + exclude_from_core(aper_alloc, aper_order); /* Fix up the north bridges */ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index bbcc185062bb..d29d869abec1 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + /* This doesn't grab kclist_lock, so it should only be used at init time. */ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, int type) @@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ + } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8c3f8c14eeaa..c843f4a9c512 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -44,6 +44,8 @@ void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz) m->vaddr = (unsigned long)vaddr; kclist_add(m, addr, sz, KCORE_REMAP); } + +extern int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)); #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) -- cgit v1.2.3-59-g8ed1b