diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/Kconfig | 3 | ||||
-rw-r--r-- | drivers/block/brd.c | 48 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 27 | ||||
-rw-r--r-- | drivers/block/loop.c | 5 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 490 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.h | 10 | ||||
-rw-r--r-- | drivers/block/nbd.c | 14 | ||||
-rw-r--r-- | drivers/block/rbd.c | 366 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 10 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 8 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.c | 577 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.h | 6 |
13 files changed, 738 insertions, 828 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 19df4918e37e..8ddc98279c8f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -219,7 +219,7 @@ config BLK_DEV_LOOP To use the loop device, you need the losetup utility, found in the util-linux package, see - <ftp://ftp.kernel.org/pub/linux/utils/util-linux/>. + <https://www.kernel.org/pub/linux/utils/util-linux/>. The loop device driver can also be used to "hide" a file system in a disk partition, floppy, or regular file, either using encryption @@ -323,6 +323,7 @@ config BLK_DEV_SX8 config BLK_DEV_RAM tristate "RAM block device support" + select DAX if BLK_DEV_RAM_DAX ---help--- Saying Y here will allow you to use a portion of your RAM memory as a block device, so that you can make file systems on it, read and diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 4ec84d504780..57b574f2f66a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #ifdef CONFIG_BLK_DEV_RAM_DAX #include <linux/pfn_t.h> +#include <linux/dax.h> #endif #include <linux/uaccess.h> @@ -41,6 +42,9 @@ struct brd_device { struct request_queue *brd_queue; struct gendisk *brd_disk; +#ifdef CONFIG_BLK_DEV_RAM_DAX + struct dax_device *dax_dev; +#endif struct list_head brd_list; /* @@ -326,30 +330,38 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, } #ifdef CONFIG_BLK_DEV_RAM_DAX -static long brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; if (!brd) return -ENODEV; - page = brd_insert_page(brd, sector); + page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); if (!page) return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn_t(page); - return PAGE_SIZE; + return 1; } -#else -#define brd_direct_access NULL + +static long brd_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct brd_device *brd = dax_get_private(dax_dev); + + return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); +} + +static const struct dax_operations brd_dax_ops = { + .direct_access = brd_dax_direct_access, +}; #endif static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, - .direct_access = brd_direct_access, }; /* @@ -415,9 +427,6 @@ static struct brd_device *brd_alloc(int i) * is harmless) */ blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); -#ifdef CONFIG_BLK_DEV_RAM_DAX - queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); -#endif disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; @@ -430,8 +439,21 @@ static struct brd_device *brd_alloc(int i) sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); +#ifdef CONFIG_BLK_DEV_RAM_DAX + queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); + brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops); + if (!brd->dax_dev) + goto out_free_inode; +#endif + + return brd; +#ifdef CONFIG_BLK_DEV_RAM_DAX +out_free_inode: + kill_dax(brd->dax_dev); + put_dax(brd->dax_dev); +#endif out_free_queue: blk_cleanup_queue(brd->brd_queue); out_free_dev: @@ -471,6 +493,10 @@ out: static void brd_del_one(struct brd_device *brd) { list_del(&brd->brd_list); +#ifdef CONFIG_BLK_DEV_RAM_DAX + kill_dax(brd->dax_dev); + put_dax(brd->dax_dev); +#endif del_gendisk(brd->brd_disk); brd_free(brd); } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index dece26f119d4..a804a4107fbc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -409,7 +409,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, + GFP_NOIO | __GFP_ZERO, PAGE_KERNEL); if (!new_pages) return NULL; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index b5730e17b455..656624314f0d 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -315,24 +315,32 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) } /* still holds resource->req_lock */ -static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) +static void drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) { struct drbd_device *device = req->device; D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED)); + if (!put) + return; + if (!atomic_sub_and_test(put, &req->completion_ref)) - return 0; + return; drbd_req_complete(req, m); + /* local completion may still come in later, + * we need to keep the req object around. */ + if (req->rq_state & RQ_LOCAL_ABORTED) + return; + if (req->rq_state & RQ_POSTPONED) { /* don't destroy the req object just yet, * but queue it for retry */ drbd_restart_request(req); - return 0; + return; } - return 1; + kref_put(&req->kref, drbd_req_destroy); } static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) @@ -519,12 +527,8 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, if (req->i.waiting) wake_up(&device->misc_wait); - if (c_put) { - if (drbd_req_put_completion_ref(req, m, c_put)) - kref_put(&req->kref, drbd_req_destroy); - } else { - kref_put(&req->kref, drbd_req_destroy); - } + drbd_req_put_completion_ref(req, m, c_put); + kref_put(&req->kref, drbd_req_destroy); } static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) @@ -1366,8 +1370,7 @@ nodata: } out: - if (drbd_req_put_completion_ref(req, &m, 1)) - kref_put(&req->kref, drbd_req_destroy); + drbd_req_put_completion_ref(req, &m, 1); spin_unlock_irq(&resource->req_lock); /* Even though above is a kref_put(), this is safe. diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 994403efee19..28d932906f24 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1697,9 +1697,8 @@ static void loop_queue_work(struct kthread_work *work) loop_handle_cmd(cmd); } -static int loop_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 02804cc79d82..3a779a4f5653 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -195,7 +195,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) if (mtip_check_surprise_removal(dd->pdev)) return NULL; - rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED); + rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); if (IS_ERR(rq)) return NULL; @@ -205,66 +205,12 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) return blk_mq_rq_to_pdu(rq); } -static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd) -{ - blk_put_request(blk_mq_rq_from_pdu(cmd)); -} - -/* - * Once we add support for one hctx per mtip group, this will change a bit - */ -static struct request *mtip_rq_from_tag(struct driver_data *dd, - unsigned int tag) -{ - struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; - - return blk_mq_tag_to_rq(hctx->tags, tag); -} - static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, unsigned int tag) { - struct request *rq = mtip_rq_from_tag(dd, tag); - - return blk_mq_rq_to_pdu(rq); -} - -/* - * IO completion function. - * - * This completion function is called by the driver ISR when a - * command that was issued by the kernel completes. It first calls the - * asynchronous completion function which normally calls back into the block - * layer passing the asynchronous callback data, then unmaps the - * scatter list associated with the completed command, and finally - * clears the allocated bit associated with the completed command. - * - * @port Pointer to the port data structure. - * @tag Tag of the command. - * @data Pointer to driver_data. - * @status Completion status. - * - * return value - * None - */ -static void mtip_async_complete(struct mtip_port *port, - int tag, struct mtip_cmd *cmd, int status) -{ - struct driver_data *dd = port->dd; - struct request *rq; - - if (unlikely(!dd) || unlikely(!port)) - return; - - if (unlikely(status == PORT_IRQ_TF_ERR)) { - dev_warn(&port->dd->pdev->dev, - "Command tag %d failed due to TFE\n", tag); - } - - rq = mtip_rq_from_tag(dd, tag); + struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; - cmd->status = status; - blk_mq_complete_request(rq); + return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(hctx->tags, tag)); } /* @@ -581,43 +527,19 @@ static void print_tags(struct driver_data *dd, "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); } -/* - * Internal command completion callback function. - * - * This function is normally called by the driver ISR when an internal - * command completed. This function signals the command completion by - * calling complete(). - * - * @port Pointer to the port data structure. - * @tag Tag of the command that has completed. - * @data Pointer to a completion structure. - * @status Completion status. - * - * return value - * None - */ -static void mtip_completion(struct mtip_port *port, - int tag, struct mtip_cmd *command, int status) -{ - struct completion *waiting = command->comp_data; - if (unlikely(status == PORT_IRQ_TF_ERR)) - dev_warn(&port->dd->pdev->dev, - "Internal command %d completed with TFE\n", tag); - - command->comp_func = NULL; - command->comp_data = NULL; - complete(waiting); -} - -static void mtip_null_completion(struct mtip_port *port, - int tag, struct mtip_cmd *command, int status) -{ -} - static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, dma_addr_t buffer_dma, unsigned int sectors); static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, struct smart_attr *attrib); + +static void mtip_complete_command(struct mtip_cmd *cmd, int status) +{ + struct request *req = blk_mq_rq_from_pdu(cmd); + + cmd->status = status; + blk_mq_complete_request(req); +} + /* * Handle an error. * @@ -646,11 +568,7 @@ static void mtip_handle_tfe(struct driver_data *dd) if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); - - if (cmd->comp_data && cmd->comp_func) { - cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd, PORT_IRQ_TF_ERR); - } + mtip_complete_command(cmd, -EIO); return; } @@ -677,19 +595,9 @@ static void mtip_handle_tfe(struct driver_data *dd) continue; cmd = mtip_cmd_from_tag(dd, tag); - if (likely(cmd->comp_func)) { - set_bit(tag, tagaccum); - cmd_cnt++; - cmd->comp_func(port, tag, cmd, 0); - } else { - dev_err(&port->dd->pdev->dev, - "Missing completion func for tag %d", - tag); - if (mtip_check_surprise_removal(dd->pdev)) { - /* don't proceed further */ - return; - } - } + mtip_complete_command(cmd, 0); + set_bit(tag, tagaccum); + cmd_cnt++; } } @@ -759,10 +667,7 @@ static void mtip_handle_tfe(struct driver_data *dd) tag, fail_reason != NULL ? fail_reason : "unknown"); - if (cmd->comp_func) { - cmd->comp_func(port, tag, - cmd, -ENODATA); - } + mtip_complete_command(cmd, -ENODATA); continue; } } @@ -785,12 +690,7 @@ static void mtip_handle_tfe(struct driver_data *dd) dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - if (cmd->comp_func) - cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR); - else - dev_warn(&port->dd->pdev->dev, - "Bad completion for tag %d\n", - tag); + mtip_complete_command(cmd, -EIO); } } print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); @@ -823,18 +723,7 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, continue; command = mtip_cmd_from_tag(dd, tag); - if (likely(command->comp_func)) - command->comp_func(port, tag, command, 0); - else { - dev_dbg(&dd->pdev->dev, - "Null completion for tag %d", - tag); - - if (mtip_check_surprise_removal( - dd->pdev)) { - return; - } - } + mtip_complete_command(command, 0); } completed >>= 1; } @@ -852,16 +741,13 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) struct mtip_port *port = dd->port; struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); - if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && - (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) - & (1 << MTIP_TAG_INTERNAL))) { - if (cmd->comp_func) { - cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0); - return; - } - } + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && cmd) { + int group = MTIP_TAG_INDEX(MTIP_TAG_INTERNAL); + int status = readl(port->cmd_issue[group]); - return; + if (!(status & (1 << MTIP_TAG_BIT(MTIP_TAG_INTERNAL)))) + mtip_complete_command(cmd, 0); + } } /* @@ -869,7 +755,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) */ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) { - if (unlikely(port_stat & PORT_IRQ_CONNECT)) { dev_warn(&dd->pdev->dev, "Clearing PxSERR.DIAG.x\n"); @@ -996,8 +881,7 @@ static irqreturn_t mtip_irq_handler(int irq, void *instance) static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) { - writel(1 << MTIP_TAG_BIT(tag), - port->cmd_issue[MTIP_TAG_INDEX(tag)]); + writel(1 << MTIP_TAG_BIT(tag), port->cmd_issue[MTIP_TAG_INDEX(tag)]); } static bool mtip_pause_ncq(struct mtip_port *port, @@ -1035,53 +919,53 @@ static bool mtip_pause_ncq(struct mtip_port *port, return false; } +static bool mtip_commands_active(struct mtip_port *port) +{ + unsigned int active; + unsigned int n; + + /* + * Ignore s_active bit 0 of array element 0. + * This bit will always be set + */ + active = readl(port->s_active[0]) & 0xFFFFFFFE; + for (n = 1; n < port->dd->slot_groups; n++) + active |= readl(port->s_active[n]); + + return active != 0; +} + /* * Wait for port to quiesce * * @port Pointer to port data structure * @timeout Max duration to wait (ms) - * @atomic gfp_t flag to indicate blockable context or not * * return value * 0 Success * -EBUSY Commands still active */ -static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout, - gfp_t atomic) +static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) { unsigned long to; - unsigned int n; - unsigned int active = 1; + bool active = true; blk_mq_stop_hw_queues(port->dd->queue); to = jiffies + msecs_to_jiffies(timeout); do { if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && - test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags) && - atomic == GFP_KERNEL) { + test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { msleep(20); continue; /* svc thd is actively issuing commands */ } - if (atomic == GFP_KERNEL) - msleep(100); - else { - cpu_relax(); - udelay(100); - } + msleep(100); if (mtip_check_surprise_removal(port->dd->pdev)) goto err_fault; - /* - * Ignore s_active bit 0 of array element 0. - * This bit will always be set - */ - active = readl(port->s_active[0]) & 0xFFFFFFFE; - for (n = 1; n < port->dd->slot_groups; n++) - active |= readl(port->s_active[n]); - + active = mtip_commands_active(port); if (!active) break; } while (time_before(jiffies, to)); @@ -1093,6 +977,13 @@ err_fault: return -EFAULT; } +struct mtip_int_cmd { + int fis_len; + dma_addr_t buffer; + int buf_len; + u32 opts; +}; + /* * Execute an internal command and wait for the completion. * @@ -1117,13 +1008,17 @@ static int mtip_exec_internal_command(struct mtip_port *port, dma_addr_t buffer, int buf_len, u32 opts, - gfp_t atomic, unsigned long timeout) { - struct mtip_cmd_sg *command_sg; - DECLARE_COMPLETION_ONSTACK(wait); struct mtip_cmd *int_cmd; struct driver_data *dd = port->dd; + struct request *rq; + struct mtip_int_cmd icmd = { + .fis_len = fis_len, + .buffer = buffer, + .buf_len = buf_len, + .opts = opts + }; int rv = 0; unsigned long start; @@ -1138,6 +1033,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n"); return -EFAULT; } + rq = blk_mq_rq_from_pdu(int_cmd); + rq->special = &icmd; set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); @@ -1146,135 +1043,60 @@ static int mtip_exec_internal_command(struct mtip_port *port, clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); - if (atomic == GFP_KERNEL) { - if (fis->command != ATA_CMD_STANDBYNOW1) { - /* wait for io to complete if non atomic */ - if (mtip_quiesce_io(port, - MTIP_QUIESCE_IO_TIMEOUT_MS, atomic) < 0) { - dev_warn(&dd->pdev->dev, - "Failed to quiesce IO\n"); - mtip_put_int_command(dd, int_cmd); - clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); - return -EBUSY; - } + if (fis->command != ATA_CMD_STANDBYNOW1) { + /* wait for io to complete if non atomic */ + if (mtip_quiesce_io(port, MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) { + dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n"); + blk_mq_free_request(rq); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + return -EBUSY; } - - /* Set the completion function and data for the command. */ - int_cmd->comp_data = &wait; - int_cmd->comp_func = mtip_completion; - - } else { - /* Clear completion - we're going to poll */ - int_cmd->comp_data = NULL; - int_cmd->comp_func = mtip_null_completion; } /* Copy the command to the command table */ memcpy(int_cmd->command, fis, fis_len*4); - /* Populate the SG list */ - int_cmd->command_header->opts = - __force_bit2int cpu_to_le32(opts | fis_len); - if (buf_len) { - command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ; - - command_sg->info = - __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF); - command_sg->dba = - __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF); - command_sg->dba_upper = - __force_bit2int cpu_to_le32((buffer >> 16) >> 16); - - int_cmd->command_header->opts |= - __force_bit2int cpu_to_le32((1 << 16)); - } - - /* Populate the command header */ - int_cmd->command_header->byte_count = 0; - start = jiffies; + rq->timeout = timeout; - /* Issue the command to the hardware */ - mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL); - - if (atomic == GFP_KERNEL) { - /* Wait for the command to complete or timeout. */ - if ((rv = wait_for_completion_interruptible_timeout( - &wait, - msecs_to_jiffies(timeout))) <= 0) { - - if (rv == -ERESTARTSYS) { /* interrupted */ - dev_err(&dd->pdev->dev, - "Internal command [%02X] was interrupted after %u ms\n", - fis->command, - jiffies_to_msecs(jiffies - start)); - rv = -EINTR; - goto exec_ic_exit; - } else if (rv == 0) /* timeout */ - dev_err(&dd->pdev->dev, - "Internal command did not complete [%02X] within timeout of %lu ms\n", - fis->command, timeout); - else - dev_err(&dd->pdev->dev, - "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n", - fis->command, rv, timeout); - - if (mtip_check_surprise_removal(dd->pdev) || - test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &dd->dd_flag)) { - dev_err(&dd->pdev->dev, - "Internal command [%02X] wait returned due to SR\n", - fis->command); - rv = -ENXIO; - goto exec_ic_exit; - } - mtip_device_reset(dd); /* recover from timeout issue */ - rv = -EAGAIN; + /* insert request and run queue */ + blk_execute_rq(rq->q, NULL, rq, true); + + rv = int_cmd->status; + if (rv < 0) { + if (rv == -ERESTARTSYS) { /* interrupted */ + dev_err(&dd->pdev->dev, + "Internal command [%02X] was interrupted after %u ms\n", + fis->command, + jiffies_to_msecs(jiffies - start)); + rv = -EINTR; goto exec_ic_exit; - } - } else { - u32 hba_stat, port_stat; - - /* Spin for <timeout> checking if command still outstanding */ - timeout = jiffies + msecs_to_jiffies(timeout); - while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL]) - & (1 << MTIP_TAG_INTERNAL)) - && time_before(jiffies, timeout)) { - if (mtip_check_surprise_removal(dd->pdev)) { - rv = -ENXIO; - goto exec_ic_exit; - } - if ((fis->command != ATA_CMD_STANDBYNOW1) && - test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &dd->dd_flag)) { - rv = -ENXIO; - goto exec_ic_exit; - } - port_stat = readl(port->mmio + PORT_IRQ_STAT); - if (!port_stat) - continue; + } else if (rv == 0) /* timeout */ + dev_err(&dd->pdev->dev, + "Internal command did not complete [%02X] within timeout of %lu ms\n", + fis->command, timeout); + else + dev_err(&dd->pdev->dev, + "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n", + fis->command, rv, timeout); - if (port_stat & PORT_IRQ_ERR) { - dev_err(&dd->pdev->dev, - "Internal command [%02X] failed\n", - fis->command); - mtip_device_reset(dd); - rv = -EIO; - goto exec_ic_exit; - } else { - writel(port_stat, port->mmio + PORT_IRQ_STAT); - hba_stat = readl(dd->mmio + HOST_IRQ_STAT); - if (hba_stat) - writel(hba_stat, - dd->mmio + HOST_IRQ_STAT); - } - break; + if (mtip_check_surprise_removal(dd->pdev) || + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag)) { + dev_err(&dd->pdev->dev, + "Internal command [%02X] wait returned due to SR\n", + fis->command); + rv = -ENXIO; + goto exec_ic_exit; } + mtip_device_reset(dd); /* recover from timeout issue */ + rv = -EAGAIN; + goto exec_ic_exit; } - if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) - & (1 << MTIP_TAG_INTERNAL)) { + if (readl(port->cmd_issue[MTIP_TAG_INDEX(MTIP_TAG_INTERNAL)]) + & (1 << MTIP_TAG_BIT(MTIP_TAG_INTERNAL))) { rv = -ENXIO; if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { mtip_device_reset(dd); @@ -1283,7 +1105,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, } exec_ic_exit: /* Clear the allocated and active bits for the internal command. */ - mtip_put_int_command(dd, int_cmd); + blk_mq_free_request(rq); clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); if (rv >= 0 && mtip_pause_ncq(port, fis)) { /* NCQ paused */ @@ -1391,7 +1213,6 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) port->identify_dma, sizeof(u16) * ATA_ID_WORDS, 0, - GFP_KERNEL, MTIP_INT_CMD_TIMEOUT_MS) < 0) { rv = -1; @@ -1477,7 +1298,6 @@ static int mtip_standby_immediate(struct mtip_port *port) 0, 0, 0, - GFP_ATOMIC, timeout); dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", jiffies_to_msecs(jiffies - start)); @@ -1523,7 +1343,6 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, buffer_dma, sectors * ATA_SECT_SIZE, 0, - GFP_ATOMIC, MTIP_INT_CMD_TIMEOUT_MS); } @@ -1558,7 +1377,6 @@ static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer, buffer_dma, ATA_SECT_SIZE, 0, - GFP_ATOMIC, 15000); } @@ -1686,7 +1504,6 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba, dma_addr, ATA_SECT_SIZE, 0, - GFP_KERNEL, MTIP_TRIM_TIMEOUT_MS) < 0) rv = -EIO; @@ -1850,7 +1667,6 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) 0, 0, 0, - GFP_KERNEL, to) < 0) { return -1; } @@ -1946,7 +1762,6 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, (xfer_sz ? dma_addr : 0), (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), 0, - GFP_KERNEL, to) < 0) { rv = -EFAULT; @@ -2189,7 +2004,6 @@ static int exec_drive_taskfile(struct driver_data *dd, dma_buffer, transfer_size, 0, - GFP_KERNEL, timeout) < 0) { err = -EIO; goto abort; @@ -2446,12 +2260,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, (nents << 16) | 5 | AHCI_CMD_PREFETCH); command->command_header->byte_count = 0; - /* - * Set the completion function and data for the command - * within this layer. - */ - command->comp_data = dd; - command->comp_func = mtip_async_complete; command->direction = dma_dir; /* @@ -3825,6 +3633,42 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } +static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_int_cmd *icmd = rq->special; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct mtip_cmd_sg *command_sg; + + if (mtip_commands_active(dd->port)) + return BLK_MQ_RQ_QUEUE_BUSY; + + /* Populate the SG list */ + cmd->command_header->opts = + __force_bit2int cpu_to_le32(icmd->opts | icmd->fis_len); + if (icmd->buf_len) { + command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ; + + command_sg->info = + __force_bit2int cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF); + command_sg->dba = + __force_bit2int cpu_to_le32(icmd->buffer & 0xFFFFFFFF); + command_sg->dba_upper = + __force_bit2int cpu_to_le32((icmd->buffer >> 16) >> 16); + + cmd->command_header->opts |= + __force_bit2int cpu_to_le32((1 << 16)); + } + + /* Populate the command header */ + cmd->command_header->byte_count = 0; + + blk_mq_start_request(rq); + mtip_issue_non_ncq_command(dd->port, rq->tag); + return BLK_MQ_RQ_QUEUE_OK; +} + static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -3833,6 +3677,9 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, mtip_init_cmd_header(rq); + if (blk_rq_is_passthrough(rq)) + return mtip_issue_reserved_cmd(hctx, rq); + if (unlikely(mtip_check_unal_depth(hctx, rq))) return BLK_MQ_RQ_QUEUE_BUSY; @@ -3845,10 +3692,10 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_ERROR; } -static void mtip_free_cmd(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx) +static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) { - struct driver_data *dd = data; + struct driver_data *dd = set->driver_data; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); if (!cmd->command) @@ -3858,20 +3705,12 @@ static void mtip_free_cmd(void *data, struct request *rq, cmd->command, cmd->command_dma); } -static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, - unsigned int request_idx, unsigned int numa_node) +static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct driver_data *dd = data; + struct driver_data *dd = set->driver_data; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - /* - * For flush requests, request_idx starts at the end of the - * tag space. Since we don't support FLUSH/FUA, simply return - * 0 as there's nothing to be done. - */ - if (request_idx >= MTIP_MAX_COMMAND_SLOTS) - return 0; - cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, &cmd->command_dma, GFP_KERNEL); if (!cmd->command) @@ -3888,8 +3727,12 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, { struct driver_data *dd = req->q->queuedata; - if (reserved) - goto exit_handler; + if (reserved) { + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); + + cmd->status = -ETIME; + return BLK_EH_HANDLED; + } if (test_bit(req->tag, dd->port->cmds_to_issue)) goto exit_handler; @@ -3982,7 +3825,7 @@ static int mtip_block_initialize(struct driver_data *dd) dd->tags.reserved_tags = 1; dd->tags.cmd_size = sizeof(struct mtip_cmd); dd->tags.numa_node = dd->numa_node; - dd->tags.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_SCHED; + dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; dd->tags.driver_data = dd; dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS; @@ -4116,20 +3959,10 @@ protocol_init_error: static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) { - struct driver_data *dd = (struct driver_data *)data; - struct mtip_cmd *cmd; - - if (likely(!reserv)) { - cmd = blk_mq_rq_to_pdu(rq); - cmd->status = -ENODEV; - blk_mq_complete_request(rq); - } else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) { + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); - if (cmd->comp_func) - cmd->comp_func(dd->port, MTIP_TAG_INTERNAL, - cmd, -ENODEV); - } + cmd->status = -ENODEV; + blk_mq_complete_request(rq); } /* @@ -4168,8 +4001,7 @@ static int mtip_block_remove(struct driver_data *dd) * Explicitly wait here for IOs to quiesce, * as mtip_standby_drive usually won't wait for IOs. */ - if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS, - GFP_KERNEL)) + if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS)) mtip_standby_drive(dd); } else diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 57b41528a824..37b8e3e0bb78 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -333,16 +333,6 @@ struct mtip_cmd { dma_addr_t command_dma; /* corresponding physical address */ - void *comp_data; /* data passed to completion function comp_func() */ - /* - * Completion function called by the ISR upon completion of - * a command. - */ - void (*comp_func)(struct mtip_port *port, - int tag, - struct mtip_cmd *cmd, - int status); - int scatter_ents; /* Number of scatter list entries used */ int unaligned; /* command is unaligned on 4k boundary */ diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 9b482baa869e..9a7bb2c29447 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/fs.h> #include <linux/bio.h> #include <linux/stat.h> @@ -347,7 +348,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct socket *sock = config->socks[index]->sock; int result; struct msghdr msg; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; if (unlikely(!sock)) { dev_err_ratelimited(disk_to_dev(nbd->disk), @@ -358,7 +359,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, msg.msg_iter = *iter; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; msg.msg_name = NULL; @@ -381,7 +382,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, *sent += result; } while (msg_data_left(&msg)); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return result; } @@ -1396,12 +1397,11 @@ static void nbd_dbg_close(void) #endif -static int nbd_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd->nbd = data; + cmd->nbd = set->driver_data; return 0; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 089ac4179919..454bf9c34882 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -387,6 +387,7 @@ struct rbd_device { struct rw_semaphore lock_rwsem; enum rbd_lock_state lock_state; + char lock_cookie[32]; struct rbd_client_id owner_cid; struct work_struct acquired_lock_work; struct work_struct released_lock_work; @@ -477,13 +478,6 @@ static int minor_to_rbd_dev_id(int minor) return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; } -static bool rbd_is_lock_supported(struct rbd_device *rbd_dev) -{ - return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && - rbd_dev->spec->snap_id == CEPH_NOSNAP && - !rbd_dev->mapping.read_only; -} - static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) { return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || @@ -731,7 +725,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); + rbdc->client = ceph_create_client(ceph_opts, rbdc); if (IS_ERR(rbdc->client)) goto out_rbdc; ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ @@ -804,6 +798,7 @@ enum { Opt_read_only, Opt_read_write, Opt_lock_on_read, + Opt_exclusive, Opt_err }; @@ -816,6 +811,7 @@ static match_table_t rbd_opts_tokens = { {Opt_read_write, "read_write"}, {Opt_read_write, "rw"}, /* Alternate spelling */ {Opt_lock_on_read, "lock_on_read"}, + {Opt_exclusive, "exclusive"}, {Opt_err, NULL} }; @@ -823,11 +819,13 @@ struct rbd_options { int queue_depth; bool read_only; bool lock_on_read; + bool exclusive; }; #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ #define RBD_READ_ONLY_DEFAULT false #define RBD_LOCK_ON_READ_DEFAULT false +#define RBD_EXCLUSIVE_DEFAULT false static int parse_rbd_opts_token(char *c, void *private) { @@ -866,6 +864,9 @@ static int parse_rbd_opts_token(char *c, void *private) case Opt_lock_on_read: rbd_opts->lock_on_read = true; break; + case Opt_exclusive: + rbd_opts->exclusive = true; + break; default: /* libceph prints "bad option" msg */ return -EINVAL; @@ -1922,7 +1923,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; - osd_req->r_mtime = CURRENT_TIME; + ktime_get_real_ts(&osd_req->r_mtime); osd_req->r_data_offset = obj_request->offset; } @@ -3079,7 +3080,8 @@ static int rbd_lock(struct rbd_device *rbd_dev) char cookie[32]; int ret; - WARN_ON(__rbd_is_lock_owner(rbd_dev)); + WARN_ON(__rbd_is_lock_owner(rbd_dev) || + rbd_dev->lock_cookie[0] != '\0'); format_lock_cookie(rbd_dev, cookie); ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, @@ -3089,6 +3091,7 @@ static int rbd_lock(struct rbd_device *rbd_dev) return ret; rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; + strcpy(rbd_dev->lock_cookie, cookie); rbd_set_owner_cid(rbd_dev, &cid); queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); return 0; @@ -3097,27 +3100,24 @@ static int rbd_lock(struct rbd_device *rbd_dev) /* * lock_rwsem must be held for write */ -static int rbd_unlock(struct rbd_device *rbd_dev) +static void rbd_unlock(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - char cookie[32]; int ret; - WARN_ON(!__rbd_is_lock_owner(rbd_dev)); - - rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; + WARN_ON(!__rbd_is_lock_owner(rbd_dev) || + rbd_dev->lock_cookie[0] == '\0'); - format_lock_cookie(rbd_dev, cookie); ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, - RBD_LOCK_NAME, cookie); - if (ret && ret != -ENOENT) { - rbd_warn(rbd_dev, "cls_unlock failed: %d", ret); - return ret; - } + RBD_LOCK_NAME, rbd_dev->lock_cookie); + if (ret && ret != -ENOENT) + rbd_warn(rbd_dev, "failed to unlock: %d", ret); + /* treat errors as the image is unlocked */ + rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; + rbd_dev->lock_cookie[0] = '\0'; rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); - return 0; } static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, @@ -3447,6 +3447,18 @@ again: ret = rbd_request_lock(rbd_dev); if (ret == -ETIMEDOUT) { goto again; /* treat this as a dead client */ + } else if (ret == -EROFS) { + rbd_warn(rbd_dev, "peer will not release lock"); + /* + * If this is rbd_add_acquire_lock(), we want to fail + * immediately -- reuse BLACKLISTED flag. Otherwise we + * want to block. + */ + if (!(rbd_dev->disk->flags & GENHD_FL_UP)) { + set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); + /* wake "rbd map --exclusive" process */ + wake_requests(rbd_dev, false); + } } else if (ret < 0) { rbd_warn(rbd_dev, "error requesting lock: %d", ret); mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, @@ -3490,16 +3502,15 @@ static bool rbd_release_lock(struct rbd_device *rbd_dev) if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) return false; - if (!rbd_unlock(rbd_dev)) - /* - * Give others a chance to grab the lock - we would re-acquire - * almost immediately if we got new IO during ceph_osdc_sync() - * otherwise. We need to ack our own notifications, so this - * lock_dwork will be requeued from rbd_wait_state_locked() - * after wake_requests() in rbd_handle_released_lock(). - */ - cancel_delayed_work(&rbd_dev->lock_dwork); - + rbd_unlock(rbd_dev); + /* + * Give others a chance to grab the lock - we would re-acquire + * almost immediately if we got new IO during ceph_osdc_sync() + * otherwise. We need to ack our own notifications, so this + * lock_dwork will be requeued from rbd_wait_state_locked() + * after wake_requests() in rbd_handle_released_lock(). + */ + cancel_delayed_work(&rbd_dev->lock_dwork); return true; } @@ -3580,12 +3591,16 @@ static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, up_read(&rbd_dev->lock_rwsem); } -static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, - void **p) +/* + * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no + * ResponseMessage is needed. + */ +static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, + void **p) { struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); struct rbd_client_id cid = { 0 }; - bool need_to_send; + int result = 1; if (struct_v >= 2) { cid.gid = ceph_decode_64(p); @@ -3595,19 +3610,36 @@ static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, cid.handle); if (rbd_cid_equal(&cid, &my_cid)) - return false; + return result; down_read(&rbd_dev->lock_rwsem); - need_to_send = __rbd_is_lock_owner(rbd_dev); - if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { - if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) { - dout("%s rbd_dev %p queueing unlock_work\n", __func__, - rbd_dev); - queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work); + if (__rbd_is_lock_owner(rbd_dev)) { + if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED && + rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) + goto out_unlock; + + /* + * encode ResponseMessage(0) so the peer can detect + * a missing owner + */ + result = 0; + + if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { + if (!rbd_dev->opts->exclusive) { + dout("%s rbd_dev %p queueing unlock_work\n", + __func__, rbd_dev); + queue_work(rbd_dev->task_wq, + &rbd_dev->unlock_work); + } else { + /* refuse to release the lock */ + result = -EROFS; + } } } + +out_unlock: up_read(&rbd_dev->lock_rwsem); - return need_to_send; + return result; } static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, @@ -3690,13 +3722,10 @@ static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, rbd_acknowledge_notify(rbd_dev, notify_id, cookie); break; case RBD_NOTIFY_OP_REQUEST_LOCK: - if (rbd_handle_request_lock(rbd_dev, struct_v, &p)) - /* - * send ResponseMessage(0) back so the client - * can detect a missing owner - */ + ret = rbd_handle_request_lock(rbd_dev, struct_v, &p); + if (ret <= 0) rbd_acknowledge_notify_result(rbd_dev, notify_id, - cookie, 0); + cookie, ret); else rbd_acknowledge_notify(rbd_dev, notify_id, cookie); break; @@ -3821,24 +3850,51 @@ static void rbd_unregister_watch(struct rbd_device *rbd_dev) ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); } +/* + * lock_rwsem must be held for write + */ +static void rbd_reacquire_lock(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + char cookie[32]; + int ret; + + WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); + + format_lock_cookie(rbd_dev, cookie); + ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, RBD_LOCK_NAME, + CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, + RBD_LOCK_TAG, cookie); + if (ret) { + if (ret != -EOPNOTSUPP) + rbd_warn(rbd_dev, "failed to update lock cookie: %d", + ret); + + /* + * Lock cookie cannot be updated on older OSDs, so do + * a manual release and queue an acquire. + */ + if (rbd_release_lock(rbd_dev)) + queue_delayed_work(rbd_dev->task_wq, + &rbd_dev->lock_dwork, 0); + } else { + strcpy(rbd_dev->lock_cookie, cookie); + } +} + static void rbd_reregister_watch(struct work_struct *work) { struct rbd_device *rbd_dev = container_of(to_delayed_work(work), struct rbd_device, watch_dwork); - bool was_lock_owner = false; - bool need_to_wake = false; int ret; dout("%s rbd_dev %p\n", __func__, rbd_dev); - down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) - was_lock_owner = rbd_release_lock(rbd_dev); - mutex_lock(&rbd_dev->watch_mutex); if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { mutex_unlock(&rbd_dev->watch_mutex); - goto out; + return; } ret = __rbd_register_watch(rbd_dev); @@ -3846,36 +3902,28 @@ static void rbd_reregister_watch(struct work_struct *work) rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); if (ret == -EBLACKLISTED || ret == -ENOENT) { set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); - need_to_wake = true; + wake_requests(rbd_dev, true); } else { queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, RBD_RETRY_DELAY); } mutex_unlock(&rbd_dev->watch_mutex); - goto out; + return; } - need_to_wake = true; rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; mutex_unlock(&rbd_dev->watch_mutex); + down_write(&rbd_dev->lock_rwsem); + if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) + rbd_reacquire_lock(rbd_dev); + up_write(&rbd_dev->lock_rwsem); + ret = rbd_dev_refresh(rbd_dev); if (ret) rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); - - if (was_lock_owner) { - ret = rbd_try_lock(rbd_dev); - if (ret) - rbd_warn(rbd_dev, "reregisteration lock failed: %d", - ret); - } - -out: - up_write(&rbd_dev->lock_rwsem); - if (need_to_wake) - wake_requests(rbd_dev, true); } /* @@ -4034,10 +4082,6 @@ static void rbd_queue_workfn(struct work_struct *work) if (op_type != OBJ_OP_READ) { snapc = rbd_dev->header.snapc; ceph_get_snap_context(snapc); - must_be_locked = rbd_is_lock_supported(rbd_dev); - } else { - must_be_locked = rbd_dev->opts->lock_on_read && - rbd_is_lock_supported(rbd_dev); } up_read(&rbd_dev->header_rwsem); @@ -4048,14 +4092,20 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } + must_be_locked = + (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && + (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read); if (must_be_locked) { down_read(&rbd_dev->lock_rwsem); if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && - !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) + !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { + if (rbd_dev->opts->exclusive) { + rbd_warn(rbd_dev, "exclusive lock required"); + result = -EROFS; + goto err_unlock; + } rbd_wait_state_locked(rbd_dev); - - WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^ - !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); + } if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { result = -EBLACKLISTED; goto err_unlock; @@ -4114,19 +4164,10 @@ static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, static void rbd_free_disk(struct rbd_device *rbd_dev) { - struct gendisk *disk = rbd_dev->disk; - - if (!disk) - return; - + blk_cleanup_queue(rbd_dev->disk->queue); + blk_mq_free_tag_set(&rbd_dev->tag_set); + put_disk(rbd_dev->disk); rbd_dev->disk = NULL; - if (disk->flags & GENHD_FL_UP) { - del_gendisk(disk); - if (disk->queue) - blk_cleanup_queue(disk->queue); - blk_mq_free_tag_set(&rbd_dev->tag_set); - } - put_disk(disk); } static int rbd_obj_read_sync(struct rbd_device *rbd_dev, @@ -4307,9 +4348,8 @@ out: return ret; } -static int rbd_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { struct work_struct *work = blk_mq_rq_to_pdu(rq); @@ -4384,8 +4424,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; + /* + * disk_release() expects a queue ref from add_disk() and will + * put it. Hold an extra ref until add_disk() is called. + */ + WARN_ON(!blk_get_queue(q)); disk->queue = q; - q->queuedata = rbd_dev; rbd_dev->disk = disk; @@ -5625,6 +5669,7 @@ static int rbd_add_parse_args(const char *buf, rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; + rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT; copts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, @@ -5683,6 +5728,33 @@ again: return ret; } +static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) +{ + down_write(&rbd_dev->lock_rwsem); + if (__rbd_is_lock_owner(rbd_dev)) + rbd_unlock(rbd_dev); + up_write(&rbd_dev->lock_rwsem); +} + +static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) +{ + if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { + rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); + return -EINVAL; + } + + /* FIXME: "rbd map --exclusive" should be in interruptible */ + down_read(&rbd_dev->lock_rwsem); + rbd_wait_state_locked(rbd_dev); + up_read(&rbd_dev->lock_rwsem); + if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { + rbd_warn(rbd_dev, "failed to acquire exclusive lock"); + return -EROFS; + } + + return 0; +} + /* * An rbd format 2 image has a unique identifier, distinct from the * name given to it by the user. Internally, that identifier is @@ -5874,6 +5946,15 @@ out_err: return ret; } +static void rbd_dev_device_release(struct rbd_device *rbd_dev) +{ + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + rbd_dev_mapping_clear(rbd_dev); + rbd_free_disk(rbd_dev); + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); +} + /* * rbd_dev->header_rwsem must be locked for write and will be unlocked * upon return. @@ -5909,26 +5990,13 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); - dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); - ret = device_add(&rbd_dev->dev); + ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); if (ret) goto err_out_mapping; - /* Everything's ready. Announce the disk to the world. */ - set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); up_write(&rbd_dev->header_rwsem); - - spin_lock(&rbd_dev_list_lock); - list_add_tail(&rbd_dev->node, &rbd_dev_list); - spin_unlock(&rbd_dev_list_lock); - - add_disk(rbd_dev->disk); - pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, - (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, - rbd_dev->header.features); - - return ret; + return 0; err_out_mapping: rbd_dev_mapping_clear(rbd_dev); @@ -5963,11 +6031,11 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { rbd_dev_unprobe(rbd_dev); + if (rbd_dev->opts) + rbd_unregister_watch(rbd_dev); rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; - - rbd_dev_destroy(rbd_dev); } /* @@ -6127,22 +6195,43 @@ static ssize_t do_rbd_add(struct bus_type *bus, rbd_dev->mapping.read_only = read_only; rc = rbd_dev_device_setup(rbd_dev); - if (rc) { - /* - * rbd_unregister_watch() can't be moved into - * rbd_dev_image_release() without refactoring, see - * commit 1f3ef78861ac. - */ - rbd_unregister_watch(rbd_dev); - rbd_dev_image_release(rbd_dev); - goto out; + if (rc) + goto err_out_image_probe; + + if (rbd_dev->opts->exclusive) { + rc = rbd_add_acquire_lock(rbd_dev); + if (rc) + goto err_out_device_setup; } + /* Everything's ready. Announce the disk to the world. */ + + rc = device_add(&rbd_dev->dev); + if (rc) + goto err_out_image_lock; + + add_disk(rbd_dev->disk); + /* see rbd_init_disk() */ + blk_put_queue(rbd_dev->disk->queue); + + spin_lock(&rbd_dev_list_lock); + list_add_tail(&rbd_dev->node, &rbd_dev_list); + spin_unlock(&rbd_dev_list_lock); + + pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, + (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, + rbd_dev->header.features); rc = count; out: module_put(THIS_MODULE); return rc; +err_out_image_lock: + rbd_dev_image_unlock(rbd_dev); +err_out_device_setup: + rbd_dev_device_release(rbd_dev); +err_out_image_probe: + rbd_dev_image_release(rbd_dev); err_out_rbd_dev: rbd_dev_destroy(rbd_dev); err_out_client: @@ -6170,21 +6259,6 @@ static ssize_t rbd_add_single_major(struct bus_type *bus, return do_rbd_add(bus, buf, count); } -static void rbd_dev_device_release(struct rbd_device *rbd_dev) -{ - rbd_free_disk(rbd_dev); - - spin_lock(&rbd_dev_list_lock); - list_del_init(&rbd_dev->node); - spin_unlock(&rbd_dev_list_lock); - - clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - device_del(&rbd_dev->dev); - rbd_dev_mapping_clear(rbd_dev); - if (!single_major) - unregister_blkdev(rbd_dev->major, rbd_dev->name); -} - static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) { while (rbd_dev->parent) { @@ -6202,6 +6276,7 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) } rbd_assert(second); rbd_dev_image_release(second); + rbd_dev_destroy(second); first->parent = NULL; first->parent_overlap = 0; @@ -6270,21 +6345,16 @@ static ssize_t do_rbd_remove(struct bus_type *bus, blk_set_queue_dying(rbd_dev->disk->queue); } - down_write(&rbd_dev->lock_rwsem); - if (__rbd_is_lock_owner(rbd_dev)) - rbd_unlock(rbd_dev); - up_write(&rbd_dev->lock_rwsem); - rbd_unregister_watch(rbd_dev); + del_gendisk(rbd_dev->disk); + spin_lock(&rbd_dev_list_lock); + list_del_init(&rbd_dev->node); + spin_unlock(&rbd_dev_list_lock); + device_del(&rbd_dev->dev); - /* - * Don't free anything from rbd_dev->disk until after all - * notifies are completely processed. Otherwise - * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting - * in a potential use after free of rbd_dev->disk or rbd_dev. - */ + rbd_dev_image_unlock(rbd_dev); rbd_dev_device_release(rbd_dev); rbd_dev_image_release(rbd_dev); - + rbd_dev_destroy(rbd_dev); return count; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f94614257462..553cc4c542b4 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -452,8 +452,7 @@ static int init_vq(struct virtio_blk *vblk) } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names, - &desc); + err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; @@ -573,11 +572,10 @@ static const struct device_attribute dev_attr_cache_type_rw = __ATTR(cache_type, S_IRUGO|S_IWUSR, virtblk_cache_type_show, virtblk_cache_type_store); -static int virtblk_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct virtio_blk *vblk = data; + struct virtio_blk *vblk = set->driver_data; struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); #ifdef CONFIG_VIRTIO_BLK_SCSI diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 8fe61b5dc5a6..1f3dfaa54d87 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -504,11 +504,13 @@ static int xen_blkbk_remove(struct xenbus_device *dev) dev_set_drvdata(&dev->dev, NULL); - if (be->blkif) + if (be->blkif) { xen_blkif_disconnect(be->blkif); - /* Put the reference we set in xen_blkif_alloc(). */ - xen_blkif_put(be->blkif); + /* Put the reference we set in xen_blkif_alloc(). */ + xen_blkif_put(be->blkif); + } + kfree(be->mode); kfree(be); return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6fac5fedd610..debee952dcc1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,6 +45,8 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static void zram_free_page(struct zram *zram, size_t index); + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -55,53 +57,70 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } +static unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram_meta *meta, u32 index, +static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - return meta->table[index].value & BIT(flag); + return zram->table[index].value & BIT(flag); } -static void zram_set_flag(struct zram_meta *meta, u32 index, +static void zram_set_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value |= BIT(flag); + zram->table[index].value |= BIT(flag); } -static void zram_clear_flag(struct zram_meta *meta, u32 index, +static void zram_clear_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value &= ~BIT(flag); + zram->table[index].value &= ~BIT(flag); } -static inline void zram_set_element(struct zram_meta *meta, u32 index, +static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { - meta->table[index].element = element; + zram->table[index].element = element; } -static inline void zram_clear_element(struct zram_meta *meta, u32 index) +static unsigned long zram_get_element(struct zram *zram, u32 index) { - meta->table[index].element = 0; + return zram->table[index].element; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static size_t zram_get_obj_size(struct zram *zram, u32 index) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static void zram_set_obj_size(struct zram_meta *meta, +static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } +#if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } +#else +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return false; +} +#endif static void zram_revalidate_disk(struct zram *zram) { @@ -137,8 +156,7 @@ static inline bool valid_io_request(struct zram *zram, static void update_position(u32 *index, int *offset, struct bio_vec *bvec) { - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; + *index += (*offset + bvec->bv_len) / PAGE_SIZE; *offset = (*offset + bvec->bv_len) % PAGE_SIZE; } @@ -177,31 +195,21 @@ static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; + unsigned long val; page = (unsigned long *)ptr; + val = page[0]; - for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { - if (page[pos] != page[pos + 1]) + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[pos]) return false; } - *element = page[pos]; + *element = val; return true; } -static void handle_same_page(struct bio_vec *bvec, unsigned long element) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -254,9 +262,8 @@ static ssize_t mem_used_max_store(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - struct zram_meta *meta = zram->meta; atomic_long_set(&zram->stats.max_used_pages, - zs_get_total_pages(meta->mem_pool)); + zs_get_total_pages(zram->mem_pool)); } up_read(&zram->init_lock); @@ -329,7 +336,6 @@ static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; down_read(&zram->init_lock); if (!init_done(zram)) { @@ -337,8 +343,7 @@ static ssize_t compact_store(struct device *dev, return -EINVAL; } - meta = zram->meta; - zs_compact(meta->mem_pool); + zs_compact(zram->mem_pool); up_read(&zram->init_lock); return len; @@ -375,8 +380,8 @@ static ssize_t mm_stat_show(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - mem_used = zs_get_total_pages(zram->meta->mem_pool); - zs_pool_stats(zram->meta->mem_pool, &pool_stats); + mem_used = zs_get_total_pages(zram->mem_pool); + zs_pool_stats(zram->mem_pool, &pool_stats); } orig_size = atomic64_read(&zram->stats.pages_stored); @@ -417,56 +422,89 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static void zram_meta_free(struct zram_meta *meta, u64 disksize) +static void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); +} + +static bool zram_same_page_read(struct zram *zram, u32 index, + struct page *page, + unsigned int offset, unsigned int len) +{ + zram_slot_lock(zram, index); + if (unlikely(!zram_get_handle(zram, index) || + zram_test_flag(zram, index, ZRAM_SAME))) { + void *mem; + + zram_slot_unlock(zram, index); + mem = kmap_atomic(page); + zram_fill_page(mem + offset, len, + zram_get_element(zram, index)); + kunmap_atomic(mem); + return true; + } + zram_slot_unlock(zram, index); + + return false; +} + +static bool zram_same_page_write(struct zram *zram, u32 index, + struct page *page) +{ + unsigned long element; + void *mem = kmap_atomic(page); + + if (page_same_filled(mem, &element)) { + kunmap_atomic(mem); + /* Free memory associated with this sector now. */ + zram_slot_lock(zram, index); + zram_free_page(zram, index); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_element(zram, index, element); + zram_slot_unlock(zram, index); + + atomic64_inc(&zram->stats.same_pages); + return true; + } + kunmap_atomic(mem); + + return false; +} + +static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; size_t index; /* Free all pages that are still in this zram device */ - for (index = 0; index < num_pages; index++) { - unsigned long handle = meta->table[index].handle; - /* - * No memory is allocated for same element filled pages. - * Simply clear same page flag. - */ - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) - continue; - - zs_free(meta->mem_pool, handle); - } + for (index = 0; index < num_pages; index++) + zram_free_page(zram, index); - zs_destroy_pool(meta->mem_pool); - vfree(meta->table); - kfree(meta); + zs_destroy_pool(zram->mem_pool); + vfree(zram->table); } -static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +static bool zram_meta_alloc(struct zram *zram, u64 disksize) { size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - - if (!meta) - return NULL; num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto out_error; - } + zram->table = vzalloc(num_pages * sizeof(*zram->table)); + if (!zram->table) + return false; - meta->mem_pool = zs_create_pool(pool_name); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto out_error; + zram->mem_pool = zs_create_pool(zram->disk->disk_name); + if (!zram->mem_pool) { + vfree(zram->table); + return false; } - return meta; - -out_error: - vfree(meta->table); - kfree(meta); - return NULL; + return true; } /* @@ -476,16 +514,15 @@ out_error: */ static void zram_free_page(struct zram *zram, size_t index) { - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. * Simply clear same page flag. */ - if (zram_test_flag(meta, index, ZRAM_SAME)) { - zram_clear_flag(meta, index, ZRAM_SAME); - zram_clear_element(meta, index); + if (zram_test_flag(zram, index, ZRAM_SAME)) { + zram_clear_flag(zram, index, ZRAM_SAME); + zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); return; } @@ -493,179 +530,111 @@ static void zram_free_page(struct zram *zram, size_t index) if (!handle) return; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); - atomic64_sub(zram_get_obj_size(meta, index), + atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - meta->table[index].handle = 0; - zram_set_obj_size(meta, index, 0); + zram_set_handle(zram, index, 0); + zram_set_obj_size(zram, index, 0); } -static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) { - int ret = 0; - unsigned char *cmem; - struct zram_meta *meta = zram->meta; + int ret; unsigned long handle; unsigned int size; + void *src, *dst; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - handle = meta->table[index].handle; - size = zram_get_obj_size(meta, index); - - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - zram_fill_page(mem, PAGE_SIZE, meta->table[index].element); + if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; - } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + zram_slot_lock(zram, index); + handle = zram_get_handle(zram, index); + size = zram_get_obj_size(zram, index); + + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { - memcpy(mem, cmem, PAGE_SIZE); + dst = kmap_atomic(page); + memcpy(dst, src, PAGE_SIZE); + kunmap_atomic(dst); + ret = 0; } else { struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_decompress(zstrm, cmem, size, mem); + dst = kmap_atomic(page); + ret = zcomp_decompress(zstrm, src, size, dst); + kunmap_atomic(dst); zcomp_stream_put(zram->comp); } - zs_unmap_object(meta->mem_pool, handle); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zs_unmap_object(zram->mem_pool, handle); + zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) { + if (unlikely(ret)) pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - return ret; - } - return 0; + return ret; } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset) + u32 index, int offset) { int ret; struct page *page; - unsigned char *user_mem, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - page = bvec->bv_page; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - handle_same_page(bvec, meta->table[index].element); - return 0; + page = bvec->bv_page; + if (is_partial_io(bvec)) { + /* Use a temporary buffer to decompress the page */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - if (is_partial_io(bvec)) - /* Use a temporary buffer to decompress the page */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + ret = zram_decompress_page(zram, page, index); + if (unlikely(ret)) + goto out; - user_mem = kmap_atomic(page); - if (!is_partial_io(bvec)) - uncmem = user_mem; + if (is_partial_io(bvec)) { + void *dst = kmap_atomic(bvec->bv_page); + void *src = kmap_atomic(page); - if (!uncmem) { - pr_err("Unable to allocate temp memory\n"); - ret = -ENOMEM; - goto out_cleanup; + memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + kunmap_atomic(src); + kunmap_atomic(dst); } - - ret = zram_decompress_page(zram, uncmem, index); - /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) - goto out_cleanup; - +out: if (is_partial_io(bvec)) - memcpy(user_mem + bvec->bv_offset, uncmem + offset, - bvec->bv_len); + __free_page(page); - flush_dcache_page(page); - ret = 0; -out_cleanup: - kunmap_atomic(user_mem); - if (is_partial_io(bvec)) - kfree(uncmem); return ret; } -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset) +static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, + struct page *page, + unsigned long *out_handle, unsigned int *out_comp_len) { - int ret = 0; - unsigned int clen; - unsigned long handle = 0; - struct page *page; - unsigned char *user_mem, *cmem, *src, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - struct zcomp_strm *zstrm = NULL; + int ret; + unsigned int comp_len; + void *src; unsigned long alloced_pages; - unsigned long element; - - page = bvec->bv_page; - if (is_partial_io(bvec)) { - /* - * This is a partial IO. We need to read the full page - * before to write the changes. - */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - if (!uncmem) { - ret = -ENOMEM; - goto out; - } - ret = zram_decompress_page(zram, uncmem, index); - if (ret) - goto out; - } + unsigned long handle = 0; compress_again: - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { - memcpy(uncmem + offset, user_mem + bvec->bv_offset, - bvec->bv_len); - kunmap_atomic(user_mem); - user_mem = NULL; - } else { - uncmem = user_mem; - } - - if (page_same_filled(uncmem, &element)) { - if (user_mem) - kunmap_atomic(user_mem); - /* Free memory associated with this sector now. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - atomic64_inc(&zram->stats.same_pages); - ret = 0; - goto out; - } - - zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_compress(zstrm, uncmem, &clen); - if (!is_partial_io(bvec)) { - kunmap_atomic(user_mem); - user_mem = NULL; - uncmem = NULL; - } + src = kmap_atomic(page); + ret = zcomp_compress(*zstrm, src, &comp_len); + kunmap_atomic(src); if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); - goto out; + if (handle) + zs_free(zram->mem_pool, handle); + return ret; } - src = zstrm->buffer; - if (unlikely(clen > max_zpage_size)) { - clen = PAGE_SIZE; - if (is_partial_io(bvec)) - src = uncmem; - } + if (unlikely(comp_len > max_zpage_size)) + comp_len = PAGE_SIZE; /* * handle allocation has 2 paths: @@ -681,71 +650,121 @@ compress_again: * from the slow path and handle has already been allocated. */ if (!handle) - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(zram->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); if (!handle) { zcomp_stream_put(zram->comp); - zstrm = NULL; - atomic64_inc(&zram->stats.writestall); - - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + *zstrm = zcomp_stream_get(zram->comp); if (handle) goto compress_again; - - pr_err("Error allocating memory for compressed page: %u, size=%u\n", - index, clen); - ret = -ENOMEM; - goto out; + return -ENOMEM; } - alloced_pages = zs_get_total_pages(meta->mem_pool); + alloced_pages = zs_get_total_pages(zram->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { - zs_free(meta->mem_pool, handle); - ret = -ENOMEM; - goto out; + zs_free(zram->mem_pool, handle); + return -ENOMEM; } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + *out_handle = handle; + *out_comp_len = comp_len; + return 0; +} + +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) +{ + int ret; + unsigned long handle; + unsigned int comp_len; + void *src, *dst; + struct zcomp_strm *zstrm; + struct page *page = bvec->bv_page; - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { + if (zram_same_page_write(zram, index, page)) + return 0; + + zstrm = zcomp_stream_get(zram->comp); + ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); + if (ret) { + zcomp_stream_put(zram->comp); + return ret; + } + + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + + src = zstrm->buffer; + if (comp_len == PAGE_SIZE) src = kmap_atomic(page); - memcpy(cmem, src, PAGE_SIZE); + memcpy(dst, src, comp_len); + if (comp_len == PAGE_SIZE) kunmap_atomic(src); - } else { - memcpy(cmem, src, clen); - } zcomp_stream_put(zram->comp); - zstrm = NULL; - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); /* * Free memory associated with this sector * before overwriting unused sectors. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, clen); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, comp_len); + zram_slot_unlock(zram, index); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_add(comp_len, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); + return 0; +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page = NULL; + void *src; + struct bio_vec vec; + + vec = *bvec; + if (is_partial_io(bvec)) { + void *dst; + /* + * This is a partial IO. We need to read the full page + * before to write the changes. + */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; + + ret = zram_decompress_page(zram, page, index); + if (ret) + goto out; + + src = kmap_atomic(bvec->bv_page); + dst = kmap_atomic(page); + memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + + vec.bv_page = page; + vec.bv_len = PAGE_SIZE; + vec.bv_offset = 0; + } + + ret = __zram_bvec_write(zram, &vec, index); out: - if (zstrm) - zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) - kfree(uncmem); + __free_page(page); return ret; } @@ -758,7 +777,6 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; - struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -779,9 +797,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; @@ -801,6 +819,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (!is_write) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); + flush_dcache_page(bvec->bv_page); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); @@ -840,34 +859,21 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) } bio_for_each_segment(bvec, bio, iter) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec.bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. - */ - struct bio_vec bv; - - bv.bv_page = bvec.bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec.bv_offset; + struct bio_vec bv = bvec; + unsigned int unwritten = bvec.bv_len; + do { + bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, + unwritten); if (zram_bvec_rw(zram, &bv, index, offset, - op_is_write(bio_op(bio))) < 0) + op_is_write(bio_op(bio))) < 0) goto out; - bv.bv_len = bvec.bv_len - max_transfer_size; - bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, - op_is_write(bio_op(bio))) < 0) - goto out; - } else - if (zram_bvec_rw(zram, &bvec, index, offset, - op_is_write(bio_op(bio))) < 0) - goto out; + bv.bv_offset += bv.bv_len; + unwritten -= bv.bv_len; - update_position(&index, &offset, &bvec); + update_position(&index, &offset, &bv); + } while (unwritten); } bio_endio(bio); @@ -884,8 +890,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; - blk_queue_split(queue, &bio, queue->bio_split); - if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); @@ -904,14 +908,12 @@ static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_meta *meta; zram = bdev->bd_disk->private_data; - meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); } @@ -955,7 +957,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zram_meta *meta; struct zcomp *comp; u64 disksize; @@ -968,12 +969,8 @@ static void zram_reset_device(struct zram *zram) return; } - meta = zram->meta; comp = zram->comp; disksize = zram->disksize; - - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; set_capacity(zram->disk, 0); @@ -981,7 +978,8 @@ static void zram_reset_device(struct zram *zram) up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); + memset(&zram->stats, 0, sizeof(zram->stats)); zcomp_destroy(comp); } @@ -990,7 +988,6 @@ static ssize_t disksize_store(struct device *dev, { u64 disksize; struct zcomp *comp; - struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); int err; @@ -998,10 +995,18 @@ static ssize_t disksize_store(struct device *dev, if (!disksize) return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_unlock; + } + disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->disk_name, disksize); - if (!meta) - return -ENOMEM; + if (!zram_meta_alloc(zram, disksize)) { + err = -ENOMEM; + goto out_unlock; + } comp = zcomp_create(zram->compressor); if (IS_ERR(comp)) { @@ -1011,14 +1016,6 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_destroy_comp; - } - - zram->meta = meta; zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); @@ -1027,11 +1024,10 @@ static ssize_t disksize_store(struct device *dev, return len; -out_destroy_comp: - up_write(&zram->init_lock); - zcomp_destroy(comp); out_free_meta: - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); +out_unlock: + up_write(&zram->init_lock); return err; } @@ -1193,8 +1189,6 @@ static int zram_add(void) blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; - zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE; - zram->disk->queue->limits.chunk_sectors = 0; blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); @@ -1219,7 +1213,6 @@ static int zram_add(void) goto out_free_disk; } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); - zram->meta = NULL; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index caeff51f1571..e34e44d02e3e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -92,13 +92,9 @@ struct zram_stats { atomic64_t writestall; /* no. of write slow paths */ }; -struct zram_meta { +struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; -}; - -struct zram { - struct zram_meta *meta; struct zcomp *comp; struct gendisk *disk; /* Prevent concurrent execution of device init */ |