author		Linus Torvalds <torvalds@linux-foundation.org>	2019-09-17 16:57:47 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-17 16:57:47 -0700
commit		7ad67ca5534ee7c958559c4ad610f05c4578e361 (patch)
tree		dc6b6a8a6b70b5f25b07bcdc06d8e77e705f6822 /include
parent		Merge tag 'for-5.4/libata-2019-09-15' of git://git.kernel.dk/linux-block (diff)
parent		null_blk: format pr_* logs with pr_fmt (diff)
Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:

 - Two NVMe pull requests:
     - ana log parse fix from Anton
     - nvme quirks support for Apple devices from Ben
     - fix missing bio completion tracing for multipath stack devices
       from Hannes and Mikhail
     - IP TOS settings for nvme rdma and tcp transports from Israel
     - rq_dma_dir cleanups from Israel
     - tracing for Get LBA Status command from Minwoo
     - Some nvme-tcp cleanups from Minwoo, Potnuri and myself
     - Some consolidation between the fabrics transports for handling
       the CAP register
     - reset race with ns scanning fix for fabrics (move fabrics
       commands to a dedicated request queue with a different lifetime
       from the admin request queue)
     - controller reset and namespace scan races fixes
     - nvme discovery log change uevent support
     - naming improvements from Keith
     - multiple discovery controllers reject fix from James
     - some regular cleanups from various people

 - Series fixing (and re-fixing) null_blk debug printing and nr_devices
   checks (André)

 - A few pull requests from Song, with fixes from Andy, Guoqing,
   Guilherme, Neil, Nigel, and Yufen.

 - REQ_OP_ZONE_RESET_ALL support (Chaitanya)

 - Bio merge handling unification (Christoph)

 - Pick default elevator correctly for devices with special needs
   (Damien)

 - Block stats fixes (Hou)

 - Timeout and support devices nbd fixes (Mike)

 - Series fixing races around elevator switching and device add/remove
   (Ming)

 - sed-opal cleanups (Revanth)

 - Per device weight support for BFQ (Fam)

 - Support for blk-iocost, a new model that can properly account cost
   of IO workloads (Tejun)

 - blk-cgroup writeback fixes (Tejun)

 - paride queue init fixes (zhengbin)

 - blk_set_runtime_active() cleanup (Stanley)

 - Block segment mapping optimizations (Bart)

 - lightnvm fixes (Hans/Minwoo/YueHaibing)

 - Various little fixes and cleanups

* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
  null_blk: format pr_* logs with pr_fmt
  null_blk: match the type of parameter nr_devices
  null_blk: do not fail the module load with zero devices
  block: also check RQF_STATS in blk_mq_need_time_stamp()
  block: make rq sector size accessible for block stats
  bfq: Fix bfq linkage error
  raid5: use bio_end_sector in r5_next_bio
  raid5: remove STRIPE_OPS_REQ_PENDING
  md: add feature flag MD_FEATURE_RAID0_LAYOUT
  md/raid0: avoid RAID0 data corruption due to layout confusion.
  raid5: don't set STRIPE_HANDLE to stripe which is in batch list
  raid5: don't increment read_errors on EILSEQ return
  nvmet: fix a wrong error status returned in error log page
  nvme: send discovery log page change events to userspace
  nvme: add uevent variables for controller devices
  nvme: enable aen regardless of the presence of I/O queues
  nvme-fabrics: allow discovery subsystems accept a kato
  nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
  nvme: Remove redundant assignment of cq vector
  nvme: Assign subsys instance from first ctrl
  ...
Diffstat (limited to 'include')
-rw-r--r--	include/linux/backing-dev-defs.h	23
-rw-r--r--	include/linux/backing-dev.h	5
-rw-r--r--	include/linux/blk-cgroup.h	6
-rw-r--r--	include/linux/blk-mq.h	20
-rw-r--r--	include/linux/blk_types.h	6
-rw-r--r--	include/linux/blkdev.h	73
-rw-r--r--	include/linux/elevator.h	8
-rw-r--r--	include/linux/lightnvm.h	8
-rw-r--r--	include/linux/memcontrol.h	39
-rw-r--r--	include/linux/nvme.h	5
-rw-r--r--	include/linux/writeback.h	2
-rw-r--r--	include/trace/events/iocost.h	178
-rw-r--r--	include/trace/events/writeback.h	126
-rw-r--r--	include/uapi/linux/raid/md_p.h	2
14 files changed, 466 insertions, 35 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 6a1a8a314d85..4fc87dee005a 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -63,10 +63,31 @@ enum wb_reason {
* so it has a mismatch name.
*/
WB_REASON_FORKER_THREAD,
+ WB_REASON_FOREIGN_FLUSH,
WB_REASON_MAX,
};
+struct wb_completion {
+ atomic_t cnt;
+ wait_queue_head_t *waitq;
+};
+
+#define __WB_COMPLETION_INIT(_waitq) \
+ (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }
+
+/*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro. Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion(). Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq)
+
+#define DEFINE_WB_COMPLETION(cmpl, bdi) \
+ struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)
+
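As a usage sketch of the completion machinery added above (the function and
the work-issuing step are hypothetical; wb_queue_work() itself is private to
fs/fs-writeback.c, so only DEFINE_WB_COMPLETION() and
wb_wait_for_completion() are the interfaces introduced here):

	#include <linux/backing-dev-defs.h>
	#include <linux/backing-dev.h>

	/* Hypothetical: fan out writeback works on @bdi, then wait for all. */
	static void example_sync_bdi(struct backing_dev_info *bdi)
	{
		DEFINE_WB_COMPLETION(done, bdi);	/* done.cnt starts at 1 */

		/*
		 * For each queued wb_writeback_work, set work->done = &done;
		 * each finished work drops done.cnt and wakes bdi->wb_waitq.
		 */

		wb_wait_for_completion(&done);	/* drops the initial count, sleeps */
	}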
/*
* For cgroup writeback, multiple wb's may map to the same blkcg. Those
* wb's can operate mostly independently but should share the congested
@@ -165,6 +186,8 @@ struct bdi_writeback {
};
struct backing_dev_info {
+ u64 id;
+ struct rb_node rb_node; /* keyed by ->id */
struct list_head bdi_list;
unsigned long ra_pages; /* max readahead in PAGE_SIZE units */
unsigned long io_pages; /* max allowed IO size */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 35b31d176f74..97967ce06de3 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -24,6 +24,7 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
return bdi;
}
+struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);
__printf(2, 3)
@@ -44,6 +45,8 @@ void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);
void wb_wakeup_delayed(struct bdi_writeback *wb);
+void wb_wait_for_completion(struct wb_completion *done);
+
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
@@ -227,6 +230,8 @@ static inline int bdi_sched_wait(void *word)
struct bdi_writeback_congested *
wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 12811091fd50..bed9e43f9426 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -149,7 +149,8 @@ typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
-typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
+ struct request_queue *q, struct blkcg *blkcg);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
@@ -233,6 +234,7 @@ struct blkg_conf_ctx {
char *body;
};
+struct gendisk *blkcg_conf_get_disk(char **inputp);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
@@ -375,7 +377,7 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair. This function should be called
- * under RCU read loc.
+ * under RCU read lock.
*/
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
struct request_queue *q)
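Because blkg_lookup() returns an RCU-protected pointer, a minimal caller
(a hypothetical sketch) brackets it with the read lock the corrected
comment refers to:

	#include <linux/blk-cgroup.h>

	/* Sketch: peek at the blkg for a blkcg/queue pair under RCU. */
	static void example_peek_blkg(struct blkcg *blkcg, struct request_queue *q)
	{
		struct blkcg_gq *blkg;

		rcu_read_lock();
		blkg = blkg_lookup(blkcg, q);	/* valid only inside this section */
		if (blkg) {
			/* read per-(cgroup, queue) state here; do not sleep */
		}
		rcu_read_unlock();
	}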
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3fa1fa59f9b2..0bf056de5cc3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,7 @@ typedef int (poll_fn)(struct blk_mq_hw_ctx *);
typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
typedef bool (busy_fn)(struct request_queue *);
typedef void (complete_fn)(struct request *);
+typedef void (cleanup_rq_fn)(struct request *);
struct blk_mq_ops {
@@ -201,6 +202,12 @@ struct blk_mq_ops {
void (*initialize_rq_fn)(struct request *rq);
/*
+ * Called before freeing a request that isn't completed yet;
+ * usually used to free the driver's private data
+ */
+ cleanup_rq_fn *cleanup_rq;
+
+ /*
* If set, returns whether or not this queue currently is busy
*/
busy_fn *busy;
@@ -241,12 +248,12 @@ enum {
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q);
+ struct request_queue *q,
+ bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops,
unsigned int queue_depth,
unsigned int set_flags);
-int blk_mq_register_dev(struct device *, struct request_queue *);
void blk_mq_unregister_dev(struct device *, struct request_queue *);
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
@@ -296,6 +303,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
int blk_mq_request_started(struct request *rq);
+int blk_mq_request_completed(struct request *rq);
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
@@ -304,7 +312,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
bool blk_mq_complete_request(struct request *rq);
-void blk_mq_complete_request_sync(struct request *rq);
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs);
bool blk_mq_queue_stopped(struct request_queue *q);
@@ -321,6 +328,7 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
+void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
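blk_mq_request_completed() and blk_mq_tagset_wait_completed_request() pair up
in teardown paths: a driver first force-completes outstanding requests, then
waits for those completions to actually finish before freeing resources. A
sketch of that pattern (function names are hypothetical; the NVMe drivers in
this series follow the same sequence):

	#include <linux/blk-mq.h>

	static bool example_cancel_rq(struct request *rq, void *data, bool reserved)
	{
		if (blk_mq_request_started(rq))
			blk_mq_complete_request(rq);	/* force-complete in-flight rq */
		return true;	/* keep iterating the tagset */
	}

	static void example_teardown(struct blk_mq_tag_set *set)
	{
		blk_mq_tagset_busy_iter(set, example_cancel_rq, NULL);
		/* returns only after every completed request's ->complete() ran */
		blk_mq_tagset_wait_completed_request(set);
	}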
@@ -366,4 +374,10 @@ static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
BLK_QC_T_INTERNAL;
}
+static inline void blk_mq_cleanup_rq(struct request *rq)
+{
+ if (rq->q->mq_ops->cleanup_rq)
+ rq->q->mq_ops->cleanup_rq(rq);
+}
+
#endif
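A driver opts into the new hook by filling in .cleanup_rq; the core then
calls blk_mq_cleanup_rq() (the inline above) before freeing a request that
was never completed. A hypothetical wiring:

	#include <linux/blk-mq.h>

	static void example_cleanup_rq(struct request *rq)
	{
		/* release per-request driver data set up at ->queue_rq() time */
	}

	static const struct blk_mq_ops example_mq_ops = {
		/* .queue_rq = example_queue_rq, ... */
		.cleanup_rq	= example_cleanup_rq,
	};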
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1b1fa1557e68..d688b96d1d63 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -169,6 +169,9 @@ struct bio {
*/
struct blkcg_gq *bi_blkg;
struct bio_issue bi_issue;
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ u64 bi_iocost_cost;
+#endif
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -209,6 +212,7 @@ enum {
BIO_BOUNCED, /* bio is a bounce bio */
BIO_USER_MAPPED, /* contains user pages */
BIO_NULL_MAPPED, /* contains invalid user pages */
+ BIO_WORKINGSET, /* contains userspace workingset pages */
BIO_QUIET, /* Make BIO Quiet */
BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
BIO_REFFED, /* bio has elevated ->bi_cnt */
@@ -282,6 +286,8 @@ enum req_opf {
REQ_OP_ZONE_RESET = 6,
/* write the same sector many times */
REQ_OP_WRITE_SAME = 7,
+ /* reset all the zones present on the device */
+ REQ_OP_ZONE_RESET_ALL = 8,
/* write the zero filled sector many times */
REQ_OP_WRITE_ZEROES = 9,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ef375dafb1c..3094f2d513b2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -194,7 +194,11 @@ struct request {
struct gendisk *rq_disk;
struct hd_struct *part;
- /* Time that I/O was submitted to the kernel. */
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+ /* Time that the first bio started allocating this request. */
+ u64 alloc_time_ns;
+#endif
+ /* Time that this request was allocated for this IO. */
u64 start_time_ns;
/* Time that I/O was submitted to the device. */
u64 io_start_time_ns;
@@ -202,9 +206,12 @@ struct request {
#ifdef CONFIG_BLK_WBT
unsigned short wbt_flags;
#endif
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- unsigned short throtl_size;
-#endif
+ /*
+ * rq sectors used for blk stats. It has the same value as
+ * blk_rq_sectors(rq), except that it is never zeroed by
+ * completion.
+ */
+ unsigned short stats_sectors;
/*
* Number of scatter-gather DMA addr+len pairs after
@@ -391,10 +398,6 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
#endif /* CONFIG_BLK_DEV_ZONED */
struct request_queue {
- /*
- * Together with queue_head for cacheline sharing
- */
- struct list_head queue_head;
struct request *last_merge;
struct elevator_queue *elevator;
@@ -496,6 +499,8 @@ struct request_queue {
struct queue_limits limits;
+ unsigned int required_elevator_features;
+
#ifdef CONFIG_BLK_DEV_ZONED
/*
* Zoned block device information for request dispatch control.
@@ -539,6 +544,7 @@ struct request_queue {
struct delayed_work requeue_work;
struct mutex sysfs_lock;
+ struct mutex sysfs_dir_lock;
/*
* for reusing dead hctx instance in case of updating
@@ -611,6 +617,8 @@ struct request_queue {
#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
+#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */
+#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP))
@@ -630,6 +638,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
+#define blk_queue_zone_resetall(q) \
+ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
#define blk_queue_secure_erase(q) \
(test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
@@ -637,6 +647,12 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
#define blk_queue_pci_p2pdma(q) \
test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+#define blk_queue_rq_alloc_time(q) \
+ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
+#else
+#define blk_queue_rq_alloc_time(q) false
+#endif
#define blk_noretry_request(rq) \
((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
REQ_FAILFAST_DRIVER))
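A submitter can use the new queue flag to issue a single Zone Reset All
command instead of looping over zones. A hedged sketch (the helper is
hypothetical; the in-tree user of the operation is blkdev_reset_zones() in
block/blk-zoned.c):

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	static int example_reset_all_zones(struct block_device *bdev)
	{
		struct bio *bio;
		int ret;

		if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
			return -EOPNOTSUPP;	/* loop REQ_OP_ZONE_RESET per zone instead */

		bio = bio_alloc(GFP_KERNEL, 0);	/* no payload: op covers all zones */
		if (!bio)
			return -ENOMEM;
		bio_set_dev(bio, bdev);
		bio->bi_opf = REQ_OP_ZONE_RESET_ALL;
		ret = submit_bio_wait(bio);
		bio_put(bio);
		return ret;
	}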
@@ -644,6 +660,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only)
#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
+#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -903,6 +920,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
* blk_rq_err_bytes() : bytes left till the next error boundary
* blk_rq_sectors() : sectors left in the entire request
* blk_rq_cur_sectors() : sectors left in the current segment
+ * blk_rq_stats_sectors() : sectors of the entire request used for stats
*/
static inline sector_t blk_rq_pos(const struct request *rq)
{
@@ -931,6 +949,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}
+static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
+{
+ return rq->stats_sectors;
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
@@ -1085,6 +1108,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
+extern void blk_queue_required_elevator_features(struct request_queue *q,
+ unsigned int features);
/*
* Number of physical segments as sent to the device.
@@ -1232,42 +1257,42 @@ enum blk_default_limits {
BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
};
-static inline unsigned long queue_segment_boundary(struct request_queue *q)
+static inline unsigned long queue_segment_boundary(const struct request_queue *q)
{
return q->limits.seg_boundary_mask;
}
-static inline unsigned long queue_virt_boundary(struct request_queue *q)
+static inline unsigned long queue_virt_boundary(const struct request_queue *q)
{
return q->limits.virt_boundary_mask;
}
-static inline unsigned int queue_max_sectors(struct request_queue *q)
+static inline unsigned int queue_max_sectors(const struct request_queue *q)
{
return q->limits.max_sectors;
}
-static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
+static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
{
return q->limits.max_hw_sectors;
}
-static inline unsigned short queue_max_segments(struct request_queue *q)
+static inline unsigned short queue_max_segments(const struct request_queue *q)
{
return q->limits.max_segments;
}
-static inline unsigned short queue_max_discard_segments(struct request_queue *q)
+static inline unsigned short queue_max_discard_segments(const struct request_queue *q)
{
return q->limits.max_discard_segments;
}
-static inline unsigned int queue_max_segment_size(struct request_queue *q)
+static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
return q->limits.max_segment_size;
}
-static inline unsigned short queue_logical_block_size(struct request_queue *q)
+static inline unsigned short queue_logical_block_size(const struct request_queue *q)
{
int retval = 512;
@@ -1282,7 +1307,7 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
return queue_logical_block_size(bdev_get_queue(bdev));
}
-static inline unsigned int queue_physical_block_size(struct request_queue *q)
+static inline unsigned int queue_physical_block_size(const struct request_queue *q)
{
return q->limits.physical_block_size;
}
@@ -1292,7 +1317,7 @@ static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
return queue_physical_block_size(bdev_get_queue(bdev));
}
-static inline unsigned int queue_io_min(struct request_queue *q)
+static inline unsigned int queue_io_min(const struct request_queue *q)
{
return q->limits.io_min;
}
@@ -1302,7 +1327,7 @@ static inline int bdev_io_min(struct block_device *bdev)
return queue_io_min(bdev_get_queue(bdev));
}
-static inline unsigned int queue_io_opt(struct request_queue *q)
+static inline unsigned int queue_io_opt(const struct request_queue *q)
{
return q->limits.io_opt;
}
@@ -1312,7 +1337,7 @@ static inline int bdev_io_opt(struct block_device *bdev)
return queue_io_opt(bdev_get_queue(bdev));
}
-static inline int queue_alignment_offset(struct request_queue *q)
+static inline int queue_alignment_offset(const struct request_queue *q)
{
if (q->limits.misaligned)
return -1;
@@ -1342,7 +1367,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
return q->limits.alignment_offset;
}
-static inline int queue_discard_alignment(struct request_queue *q)
+static inline int queue_discard_alignment(const struct request_queue *q)
{
if (q->limits.discard_misaligned)
return -1;
@@ -1432,7 +1457,7 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
return 0;
}
-static inline int queue_dma_alignment(struct request_queue *q)
+static inline int queue_dma_alignment(const struct request_queue *q)
{
return q ? q->dma_alignment : 511;
}
@@ -1543,7 +1568,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q,
}
static inline unsigned short
-queue_max_integrity_segments(struct request_queue *q)
+queue_max_integrity_segments(const struct request_queue *q)
{
return q->limits.max_integrity_segments;
}
@@ -1626,7 +1651,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q,
unsigned int segs)
{
}
-static inline unsigned short queue_max_integrity_segments(struct request_queue *q)
+static inline unsigned short queue_max_integrity_segments(const struct request_queue *q)
{
return 0;
}
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1dd014c9c87b..901bda352dcb 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -76,6 +76,7 @@ struct elevator_type
struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
+ const unsigned int elevator_features;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
@@ -165,5 +166,12 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
+/*
+ * Elevator features.
+ */
+
+/* Supports zoned block devices sequential write constraint */
+#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
+
#endif /* CONFIG_BLOCK */
#endif
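The feature mask connects both ends of elevator selection: a scheduler
advertises what it supports, and a driver states what its device requires,
so the default elevator picked at initialization is guaranteed to match. A
sketch of the two sides (names other than the ELEVATOR_F_* and blk_queue_*
symbols are hypothetical; in this series mq-deadline advertises
ELEVATOR_F_ZBD_SEQ_WRITE):

	#include <linux/blkdev.h>
	#include <linux/elevator.h>

	/* Scheduler side: declare support for zoned sequential writes. */
	static struct elevator_type example_sched = {
		/* .ops, .elevator_name, ... */
		.elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
	};

	/* Driver side: a zoned device demands the feature at queue setup. */
	static void example_setup_queue(struct request_queue *q)
	{
		blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
	}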
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 4d0d5655c7b2..ee8ec2e68055 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -88,8 +88,7 @@ typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int,
struct nvm_chk_meta *);
-typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
+typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *);
typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int);
typedef void (nvm_destroy_dma_pool_fn)(void *);
typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@ -104,7 +103,6 @@ struct nvm_dev_ops {
nvm_get_chk_meta_fn *get_chk_meta;
nvm_submit_io_fn *submit_io;
- nvm_submit_io_sync_fn *submit_io_sync;
nvm_create_dma_pool_fn *create_dma_pool;
nvm_destroy_dma_pool_fn *destroy_dma_pool;
@@ -682,8 +680,8 @@ extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr,
int, struct nvm_chk_meta *);
extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *,
int, int);
-extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
+extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *);
+extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *);
extern void nvm_end_io(struct nvm_rq *);
#else /* CONFIG_NVM */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2cd4359cb38c..ad8f1a397ae4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -184,6 +184,23 @@ struct memcg_padding {
#endif
/*
+ * Remember four most recent foreign writebacks with dirty pages in this
+ * cgroup. Inode sharing is expected to be uncommon and, even if we miss
+ * one in a given round, we're likely to catch it later if it keeps
+ * foreign-dirtying, so a fairly low count should be enough.
+ *
+ * See mem_cgroup_track_foreign_dirty_slowpath() for details.
+ */
+#define MEMCG_CGWB_FRN_CNT 4
+
+struct memcg_cgwb_frn {
+ u64 bdi_id; /* bdi->id of the foreign inode */
+ int memcg_id; /* memcg->css.id of foreign inode */
+ u64 at; /* jiffies_64 at the time of dirtying */
+ struct wb_completion done; /* tracks in-flight foreign writebacks */
+};
+
+/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -307,6 +324,7 @@ struct mem_cgroup {
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
struct wb_domain cgwb_domain;
+ struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif
/* List of events which userspace want to receive */
@@ -1237,6 +1255,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
unsigned long *pheadroom, unsigned long *pdirty,
unsigned long *pwriteback);
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb);
+
+static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+ struct bdi_writeback *wb)
+{
+ if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
+ mem_cgroup_track_foreign_dirty_slowpath(page, wb);
+}
+
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
@@ -1252,6 +1282,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
{
}
+static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+ struct bdi_writeback *wb)
+{
+}
+
+static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
struct sock;
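The fast path above is meant to be called on every page dirtying; only a
page whose memcg differs from the writing wb's takes the slowpath. A sketch
of such a call site (the function here is hypothetical; the series wires
the check into the dirty accounting path):

	#include <linux/memcontrol.h>

	static void example_account_dirtied(struct page *page,
					    struct bdi_writeback *wb)
	{
		/* ... existing per-wb dirty statistics ... */
		mem_cgroup_track_foreign_dirty(page, wb);	/* cheap inline check */
	}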
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 01aa6a6c241d..f61d6906e59d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -140,6 +140,7 @@ enum {
* Submission and Completion Queue Entry Sizes for the NVM command set.
* (In bytes and specified as a power of two (2^n)).
*/
+#define NVME_ADM_SQES 6
#define NVME_NVM_IOSQES 6
#define NVME_NVM_IOCQES 4
@@ -814,6 +815,7 @@ enum nvme_admin_opcode {
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
nvme_admin_sanitize_nvm = 0x84,
+ nvme_admin_get_lba_status = 0x86,
};
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
@@ -840,7 +842,8 @@ enum nvme_admin_opcode {
nvme_admin_opcode_name(nvme_admin_format_nvm), \
nvme_admin_opcode_name(nvme_admin_security_send), \
nvme_admin_opcode_name(nvme_admin_security_recv), \
- nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
+ nvme_admin_opcode_name(nvme_admin_sanitize_nvm), \
+ nvme_admin_opcode_name(nvme_admin_get_lba_status))
enum {
NVME_QUEUE_PHYS_CONTIG = (1 << 0),
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8945aac31392..a19d845dd7eb 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -217,6 +217,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
size_t bytes);
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
+ enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(void);
/**
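cgroup_writeback_by_id() lets a caller start writeback for an explicit
(bdi, memcg) pair, which is what the foreign-flush machinery above needs. A
fire-and-forget sketch (hypothetical; done == NULL is assumed to be
tolerated here, while the series' own caller, mem_cgroup_flush_foreign(),
passes a long-lived wb_completion):

	#include <linux/writeback.h>

	static void example_kick_foreign_flush(u64 bdi_id, int memcg_id)
	{
		cgroup_writeback_by_id(bdi_id, memcg_id, 1024 /* nr_pages */,
				       WB_REASON_FOREIGN_FLUSH, NULL);
	}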
diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h
new file mode 100644
index 000000000000..7ecaa65b7106
--- /dev/null
+++ b/include/trace/events/iocost.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iocost
+
+struct ioc;
+struct ioc_now;
+struct ioc_gq;
+
+#if !defined(_TRACE_BLK_IOCOST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLK_IOCOST_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(iocost_iocg_activate,
+
+ TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+ u64 last_period, u64 cur_period, u64 vtime),
+
+ TP_ARGS(iocg, path, now, last_period, cur_period, vtime),
+
+ TP_STRUCT__entry (
+ __string(devname, ioc_name(iocg->ioc))
+ __string(cgroup, path)
+ __field(u64, now)
+ __field(u64, vnow)
+ __field(u64, vrate)
+ __field(u64, last_period)
+ __field(u64, cur_period)
+ __field(u64, last_vtime)
+ __field(u64, vtime)
+ __field(u32, weight)
+ __field(u32, inuse)
+ __field(u64, hweight_active)
+ __field(u64, hweight_inuse)
+ ),
+
+ TP_fast_assign(
+ __assign_str(devname, ioc_name(iocg->ioc));
+ __assign_str(cgroup, path);
+ __entry->now = now->now;
+ __entry->vnow = now->vnow;
+ __entry->vrate = now->vrate;
+ __entry->last_period = last_period;
+ __entry->cur_period = cur_period;
+ __entry->last_vtime = iocg->last_vtime;
+ __entry->vtime = vtime;
+ __entry->weight = iocg->weight;
+ __entry->inuse = iocg->inuse;
+ __entry->hweight_active = iocg->hweight_active;
+ __entry->hweight_inuse = iocg->hweight_inuse;
+ ),
+
+ TP_printk("[%s:%s] now=%llu:%llu vrate=%llu "
+ "period=%llu->%llu vtime=%llu->%llu "
+ "weight=%u/%u hweight=%llu/%llu",
+ __get_str(devname), __get_str(cgroup),
+ __entry->now, __entry->vnow, __entry->vrate,
+ __entry->last_period, __entry->cur_period,
+ __entry->last_vtime, __entry->vtime,
+ __entry->inuse, __entry->weight,
+ __entry->hweight_inuse, __entry->hweight_active
+ )
+);
+
+DECLARE_EVENT_CLASS(iocg_inuse_update,
+
+ TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+ u32 old_inuse, u32 new_inuse,
+ u64 old_hw_inuse, u64 new_hw_inuse),
+
+ TP_ARGS(iocg, path, now, old_inuse, new_inuse,
+ old_hw_inuse, new_hw_inuse),
+
+ TP_STRUCT__entry (
+ __string(devname, ioc_name(iocg->ioc))
+ __string(cgroup, path)
+ __field(u64, now)
+ __field(u32, old_inuse)
+ __field(u32, new_inuse)
+ __field(u64, old_hweight_inuse)
+ __field(u64, new_hweight_inuse)
+ ),
+
+ TP_fast_assign(
+ __assign_str(devname, ioc_name(iocg->ioc));
+ __assign_str(cgroup, path);
+ __entry->now = now->now;
+ __entry->old_inuse = old_inuse;
+ __entry->new_inuse = new_inuse;
+ __entry->old_hweight_inuse = old_hw_inuse;
+ __entry->new_hweight_inuse = new_hw_inuse;
+ ),
+
+ TP_printk("[%s:%s] now=%llu inuse=%u->%u hw_inuse=%llu->%llu",
+ __get_str(devname), __get_str(cgroup), __entry->now,
+ __entry->old_inuse, __entry->new_inuse,
+ __entry->old_hweight_inuse, __entry->new_hweight_inuse
+ )
+);
+
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback,
+
+ TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+ u32 old_inuse, u32 new_inuse,
+ u64 old_hw_inuse, u64 new_hw_inuse),
+
+ TP_ARGS(iocg, path, now, old_inuse, new_inuse,
+ old_hw_inuse, new_hw_inuse)
+);
+
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway,
+
+ TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+ u32 old_inuse, u32 new_inuse,
+ u64 old_hw_inuse, u64 new_hw_inuse),
+
+ TP_ARGS(iocg, path, now, old_inuse, new_inuse,
+ old_hw_inuse, new_hw_inuse)
+);
+
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset,
+
+ TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+ u32 old_inuse, u32 new_inuse,
+ u64 old_hw_inuse, u64 new_hw_inuse),
+
+ TP_ARGS(iocg, path, now, old_inuse, new_inuse,
+ old_hw_inuse, new_hw_inuse)
+);
+
+TRACE_EVENT(iocost_ioc_vrate_adj,
+
+ TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 (*missed_ppm)[2],
+ u32 rq_wait_pct, int nr_lagging, int nr_shortages,
+ int nr_surpluses),
+
+ TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages,
+ nr_surpluses),
+
+ TP_STRUCT__entry (
+ __string(devname, ioc_name(ioc))
+ __field(u64, old_vrate)
+ __field(u64, new_vrate)
+ __field(int, busy_level)
+ __field(u32, read_missed_ppm)
+ __field(u32, write_missed_ppm)
+ __field(u32, rq_wait_pct)
+ __field(int, nr_lagging)
+ __field(int, nr_shortages)
+ __field(int, nr_surpluses)
+ ),
+
+ TP_fast_assign(
+ __assign_str(devname, ioc_name(ioc));
+ __entry->old_vrate = atomic64_read(&ioc->vtime_rate);
+ __entry->new_vrate = new_vrate;
+ __entry->busy_level = ioc->busy_level;
+ __entry->read_missed_ppm = (*missed_ppm)[READ];
+ __entry->write_missed_ppm = (*missed_ppm)[WRITE];
+ __entry->rq_wait_pct = rq_wait_pct;
+ __entry->nr_lagging = nr_lagging;
+ __entry->nr_shortages = nr_shortages;
+ __entry->nr_surpluses = nr_surpluses;
+ ),
+
+ TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d",
+ __get_str(devname), __entry->old_vrate, __entry->new_vrate,
+ __entry->busy_level,
+ __entry->read_missed_ppm, __entry->write_missed_ppm,
+ __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages,
+ __entry->nr_surpluses
+ )
+);
+
+#endif /* _TRACE_BLK_IOCOST_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index aa7f3aeac740..3a27335fce2c 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -176,6 +176,132 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
#endif /* CONFIG_CGROUP_WRITEBACK */
#endif /* CREATE_TRACE_POINTS */
+#ifdef CONFIG_CGROUP_WRITEBACK
+TRACE_EVENT(inode_foreign_history,
+
+ TP_PROTO(struct inode *inode, struct writeback_control *wbc,
+ unsigned int history),
+
+ TP_ARGS(inode, wbc, history),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned long, ino)
+ __field(unsigned int, cgroup_ino)
+ __field(unsigned int, history)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(inode_to_bdi(inode)->dev), 32);
+ __entry->ino = inode->i_ino;
+ __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc);
+ __entry->history = history;
+ ),
+
+ TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x",
+ __entry->name,
+ __entry->ino,
+ __entry->cgroup_ino,
+ __entry->history
+ )
+);
+
+TRACE_EVENT(inode_switch_wbs,
+
+ TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
+ struct bdi_writeback *new_wb),
+
+ TP_ARGS(inode, old_wb, new_wb),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned long, ino)
+ __field(unsigned int, old_cgroup_ino)
+ __field(unsigned int, new_cgroup_ino)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(old_wb->bdi->dev), 32);
+ __entry->ino = inode->i_ino;
+ __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
+ __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
+ ),
+
+ TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u",
+ __entry->name,
+ __entry->ino,
+ __entry->old_cgroup_ino,
+ __entry->new_cgroup_ino
+ )
+);
+
+TRACE_EVENT(track_foreign_dirty,
+
+ TP_PROTO(struct page *page, struct bdi_writeback *wb),
+
+ TP_ARGS(page, wb),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(u64, bdi_id)
+ __field(unsigned long, ino)
+ __field(unsigned int, memcg_id)
+ __field(unsigned int, cgroup_ino)
+ __field(unsigned int, page_cgroup_ino)
+ ),
+
+ TP_fast_assign(
+ struct address_space *mapping = page_mapping(page);
+ struct inode *inode = mapping ? mapping->host : NULL;
+
+ strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ __entry->bdi_id = wb->bdi->id;
+ __entry->ino = inode ? inode->i_ino : 0;
+ __entry->memcg_id = wb->memcg_css->id;
+ __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
+ __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino;
+ ),
+
+ TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u",
+ __entry->name,
+ __entry->bdi_id,
+ __entry->ino,
+ __entry->memcg_id,
+ __entry->cgroup_ino,
+ __entry->page_cgroup_ino
+ )
+);
+
+TRACE_EVENT(flush_foreign,
+
+ TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
+ unsigned int frn_memcg_id),
+
+ TP_ARGS(wb, frn_bdi_id, frn_memcg_id),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(unsigned int, cgroup_ino)
+ __field(unsigned int, frn_bdi_id)
+ __field(unsigned int, frn_memcg_id)
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
+ __entry->frn_bdi_id = frn_bdi_id;
+ __entry->frn_memcg_id = frn_memcg_id;
+ ),
+
+ TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u",
+ __entry->name,
+ __entry->cgroup_ino,
+ __entry->frn_bdi_id,
+ __entry->frn_memcg_id
+ )
+);
+#endif
+
DECLARE_EVENT_CLASS(writeback_write_inode_template,
TP_PROTO(struct inode *inode, struct writeback_control *wbc),
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index b0d15c73f6d7..1f2d8c81f0e0 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -329,6 +329,7 @@ struct mdp_superblock_1 {
#define MD_FEATURE_JOURNAL 512 /* support write cache */
#define MD_FEATURE_PPL 1024 /* support PPL */
#define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */
+#define MD_FEATURE_RAID0_LAYOUT 4096 /* layout is meaningful for RAID0 */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -341,6 +342,7 @@ struct mdp_superblock_1 {
|MD_FEATURE_JOURNAL \
|MD_FEATURE_PPL \
|MD_FEATURE_MULTIPLE_PPLS \
+ |MD_FEATURE_RAID0_LAYOUT \
)
struct r5l_payload_header {