diff options
Diffstat (limited to 'drivers/block')
57 files changed, 8284 insertions, 2986 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index e086fbbbe853..9a13e889837e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -1177,7 +1177,8 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T int TimeoutCounter; int i; - + memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T)); + if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) return DAC960_Failure(Controller, "DMA mask out of range"); Controller->BounceBufferLimit = DMA_BIT_MASK(32); @@ -4627,7 +4628,8 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) DAC960_Controller_T *Controller = Command->Controller; DAC960_CommandType_T CommandType = Command->CommandType; DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; - DAC960_V2_IOCTL_Opcode_T CommandOpcode = CommandMailbox->Common.IOCTL_Opcode; + DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode; + DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode; DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus; if (CommandType == DAC960_ReadCommand || @@ -4699,7 +4701,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) { if (Controller->ShutdownMonitoringTimer) return; - if (CommandOpcode == DAC960_V2_GetControllerInfo) + if (IOCTLOpcode == DAC960_V2_GetControllerInfo) { DAC960_V2_ControllerInfo_T *NewControllerInfo = Controller->V2.NewControllerInformation; @@ -4719,14 +4721,14 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) memcpy(ControllerInfo, NewControllerInfo, sizeof(DAC960_V2_ControllerInfo_T)); } - else if (CommandOpcode == DAC960_V2_GetEvent) + else if (IOCTLOpcode == DAC960_V2_GetEvent) { if (CommandStatus == DAC960_V2_NormalCompletion) { DAC960_V2_ReportEvent(Controller, Controller->V2.Event); } Controller->V2.NextEventSequenceNumber++; } - else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid && + else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid && CommandStatus == DAC960_V2_NormalCompletion) { DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo = @@ -4915,7 +4917,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) NewPhysicalDeviceInfo->LogicalUnit++; Controller->V2.PhysicalDeviceIndex++; } - else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid) + else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid) { unsigned int DeviceIndex; for (DeviceIndex = Controller->V2.PhysicalDeviceIndex; @@ -4938,7 +4940,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) } Controller->V2.NeedPhysicalDeviceInformation = false; } - else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid && + else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid && CommandStatus == DAC960_V2_NormalCompletion) { DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo = @@ -5065,7 +5067,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) [LogicalDeviceNumber] = true; NewLogicalDeviceInfo->LogicalDeviceNumber++; } - else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid) + else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid) { int LogicalDriveNumber; for (LogicalDriveNumber = 0; @@ -6578,24 +6580,21 @@ static const struct file_operations dac960_user_command_proc_fops = { static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller) { - struct proc_dir_entry *StatusProcEntry; struct proc_dir_entry *ControllerProcEntry; - struct proc_dir_entry *UserCommandProcEntry; if (DAC960_ProcDirectoryEntry == NULL) { - DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); - StatusProcEntry = proc_create("status", 0, - DAC960_ProcDirectoryEntry, - &dac960_proc_fops); + DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); + proc_create("status", 0, DAC960_ProcDirectoryEntry, + &dac960_proc_fops); } - sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); - ControllerProcEntry = proc_mkdir(Controller->ControllerName, - DAC960_ProcDirectoryEntry); - proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); - proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); - UserCommandProcEntry = proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); - Controller->ControllerProcEntry = ControllerProcEntry; + sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); + ControllerProcEntry = proc_mkdir(Controller->ControllerName, + DAC960_ProcDirectoryEntry); + proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); + proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); + proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); + Controller->ControllerProcEntry = ControllerProcEntry; } diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 35e56e1c948f..a796407123c7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -116,6 +116,8 @@ config PARIDE source "drivers/block/paride/Kconfig" +source "drivers/block/mtip32xx/Kconfig" + config BLK_CPQ_DA tristate "Compaq SMART2 support" depends on PCI && VIRT_TO_BUS @@ -352,7 +354,7 @@ config BLK_DEV_SX8 Use devices /dev/sx8/$N and /dev/sx8/$Np$M. config BLK_DEV_UB - tristate "Low Performance USB Block driver" + tristate "Low Performance USB Block driver (deprecated)" depends on USB help This driver supports certain USB attached storage devices diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 349539ad3ad9..5b795059f8fb 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -40,5 +40,6 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o +obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8eba86bba599..386146d792d1 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -63,7 +63,7 @@ #include <linux/mutex.h> #include <linux/amifdreg.h> #include <linux/amifd.h> -#include <linux/buffer_head.h> +#include <linux/fs.h> #include <linux/blkdev.h> #include <linux/elevator.h> #include <linux/interrupt.h> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 528f6318ded1..321de7b6c442 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -15,6 +15,7 @@ #include <linux/genhd.h> #include <linux/netdevice.h> #include <linux/mutex.h> +#include <linux/export.h> #include "aoe.h" static DEFINE_MUTEX(aoeblk_mutex); @@ -159,7 +160,7 @@ aoeblk_release(struct gendisk *disk, fmode_t mode) return 0; } -static int +static void aoeblk_make_request(struct request_queue *q, struct bio *bio) { struct sk_buff_head queue; @@ -172,25 +173,25 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) if (bio == NULL) { printk(KERN_ERR "aoe: bio is NULL\n"); BUG(); - return 0; + return; } d = bio->bi_bdev->bd_disk->private_data; if (d == NULL) { printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n"); BUG(); bio_endio(bio, -ENXIO); - return 0; + return; } else if (bio->bi_io_vec == NULL) { printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); BUG(); bio_endio(bio, -ENXIO); - return 0; + return; } buf = mempool_alloc(d->bufpool, GFP_NOIO); if (buf == NULL) { printk(KERN_INFO "aoe: buf allocation failure\n"); bio_endio(bio, -ENOMEM); - return 0; + return; } memset(buf, 0, sizeof(*buf)); INIT_LIST_HEAD(&buf->bufs); @@ -211,7 +212,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) spin_unlock_irqrestore(&d->lock, flags); mempool_free(buf, d->bufpool); bio_endio(bio, -ENXIO); - return 0; + return; } list_add_tail(&buf->bufs, &d->bufq); @@ -222,8 +223,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) spin_unlock_irqrestore(&d->lock, flags); aoenet_xmit(&queue); - - return 0; } static int diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index 146296ca4965..e86d2062a164 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/skbuff.h> +#include <linux/export.h> #include "aoe.h" enum { @@ -269,7 +270,7 @@ static const struct file_operations aoe_fops = { .llseek = noop_llseek, }; -static char *aoe_devnode(struct device *dev, mode_t *mode) +static char *aoe_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev)); } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index de0435e63b02..887f68f6d79a 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -35,6 +35,7 @@ new_skb(ulong len) skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = __constant_htons(ETH_P_AOE); + skb_checksum_none_assert(skb); } return skb; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index dba1c32e1ddf..531ceb31d0ff 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -17,7 +17,7 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/radix-tree.h> -#include <linux/buffer_head.h> /* invalidate_bh_lrus() */ +#include <linux/fs.h> #include <linux/slab.h> #include <asm/uaccess.h> @@ -242,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src, page = brd_lookup_page(brd, sector); BUG_ON(!page); - dst = kmap_atomic(page, KM_USER1); + dst = kmap_atomic(page); memcpy(dst + offset, src, copy); - kunmap_atomic(dst, KM_USER1); + kunmap_atomic(dst); if (copy < n) { src += copy; @@ -253,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src, page = brd_lookup_page(brd, sector); BUG_ON(!page); - dst = kmap_atomic(page, KM_USER1); + dst = kmap_atomic(page); memcpy(dst, src, copy); - kunmap_atomic(dst, KM_USER1); + kunmap_atomic(dst); } } @@ -273,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd, copy = min_t(size_t, n, PAGE_SIZE - offset); page = brd_lookup_page(brd, sector); if (page) { - src = kmap_atomic(page, KM_USER1); + src = kmap_atomic(page); memcpy(dst, src + offset, copy); - kunmap_atomic(src, KM_USER1); + kunmap_atomic(src); } else memset(dst, 0, copy); @@ -285,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd, copy = n - copy; page = brd_lookup_page(brd, sector); if (page) { - src = kmap_atomic(page, KM_USER1); + src = kmap_atomic(page); memcpy(dst, src, copy); - kunmap_atomic(src, KM_USER1); + kunmap_atomic(src); } else memset(dst, 0, copy); } @@ -309,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page, goto out; } - mem = kmap_atomic(page, KM_USER0); + mem = kmap_atomic(page); if (rw == READ) { copy_from_brd(mem + off, brd, sector, len); flush_dcache_page(page); @@ -317,13 +317,13 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page, flush_dcache_page(page); copy_to_brd(brd, mem + off, sector, len); } - kunmap_atomic(mem, KM_USER0); + kunmap_atomic(mem); out: return err; } -static int brd_make_request(struct request_queue *q, struct bio *bio) +static void brd_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct brd_device *brd = bdev->bd_disk->private_data; @@ -359,8 +359,6 @@ static int brd_make_request(struct request_queue *q, struct bio *bio) out: bio_endio(bio, err); - - return 0; } #ifdef CONFIG_BLK_DEV_XIP @@ -404,14 +402,13 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode, error = -EBUSY; if (bdev->bd_openers <= 1) { /* - * Invalidate the cache first, so it isn't written - * back to the device. + * Kill the cache first, so it isn't written back to the + * device. * * Another thread might instantiate more buffercache here, * but there is not much we can do to close that race. */ - invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); + kill_bdev(bdev); brd_free_pages(brd); error = 0; } diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 8f4ef656a1af..b0f553b26d0f 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <linux/types.h> #include <linux/pci.h> +#include <linux/pci-aspm.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/delay.h> @@ -68,6 +69,10 @@ static int cciss_tape_cmds = 6; module_param(cciss_tape_cmds, int, 0644); MODULE_PARM_DESC(cciss_tape_cmds, "number of commands to allocate for tape devices (default: 6)"); +static int cciss_simple_mode; +module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cciss_simple_mode, + "Use 'simple mode' rather than 'performant mode'"); static DEFINE_MUTEX(cciss_mutex); static struct proc_dir_entry *proc_cciss; @@ -176,6 +181,7 @@ static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv); static void __devinit cciss_interrupt_mode(ctlr_info_t *); +static int __devinit cciss_enter_simple_mode(struct ctlr_info *h); static void start_io(ctlr_info_t *h); static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size, __u8 page_code, unsigned char scsi3addr[], @@ -388,7 +394,7 @@ static void cciss_seq_show_header(struct seq_file *seq) h->product_name, (unsigned long)h->board_id, h->firm_ver[0], h->firm_ver[1], h->firm_ver[2], - h->firm_ver[3], (unsigned int)h->intr[PERF_MODE_INT], + h->firm_ver[3], (unsigned int)h->intr[h->intr_mode], h->num_luns, h->Qdepth, h->commands_outstanding, h->maxQsinceinit, h->max_outstanding, h->maxSG); @@ -636,6 +642,18 @@ static ssize_t host_store_rescan(struct device *dev, } static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan); +static ssize_t host_show_transport_mode(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ctlr_info *h = to_hba(dev); + + return snprintf(buf, 20, "%s\n", + h->transMethod & CFGTBL_Trans_Performant ? + "performant" : "simple"); +} +static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL); + static ssize_t dev_show_unique_id(struct device *dev, struct device_attribute *attr, char *buf) @@ -808,6 +826,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL); static struct attribute *cciss_host_attrs[] = { &dev_attr_rescan.attr, &dev_attr_resettable.attr, + &dev_attr_transport_mode.attr, NULL }; @@ -1716,7 +1735,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_BIG_PASSTHRU: return cciss_bigpassthru(h, argp); - /* scsi_cmd_ioctl handles these, below, though some are not */ + /* scsi_cmd_blk_ioctl handles these, below, though some are not */ /* very meaningful for cciss. SG_IO is the main one people want. */ case SG_GET_VERSION_NUM: @@ -1727,9 +1746,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode, case SG_EMULATED_HOST: case SG_IO: case SCSI_IOCTL_SEND_COMMAND: - return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp); + return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp); - /* scsi_cmd_ioctl would normally handle these, below, but */ + /* scsi_cmd_blk_ioctl would normally handle these, below, but */ /* they aren't a good fit for cciss, as CD-ROMs are */ /* not supported, and we don't have any bus/target/lun */ /* which we present to the kernel. */ @@ -2582,6 +2601,8 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, c->Request.Timeout = 0; c->Request.CDB[0] = BMIC_WRITE; c->Request.CDB[6] = BMIC_CACHE_FLUSH; + c->Request.CDB[7] = (size >> 8) & 0xFF; + c->Request.CDB[8] = size & 0xFF; break; case TEST_UNIT_READY: c->Request.CDBLen = 6; @@ -3984,6 +4005,9 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h) { __u32 trans_support; + if (cciss_simple_mode) + return; + dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n"); /* Attempt to put controller into performant mode if supported */ /* Does board support performant mode? */ @@ -4081,7 +4105,7 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *h) default_int_mode: #endif /* CONFIG_PCI_MSI */ /* if we get here we're going to use the default interrupt mode */ - h->intr[PERF_MODE_INT] = h->pdev->irq; + h->intr[h->intr_mode] = h->pdev->irq; return; } @@ -4298,6 +4322,10 @@ static int __devinit cciss_pci_init(ctlr_info_t *h) dev_warn(&h->pdev->dev, "controller appears to be disabled\n"); return -ENODEV; } + + pci_disable_link_state(h->pdev, PCIE_LINK_STATE_L0S | + PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM); + err = pci_enable_device(h->pdev); if (err) { dev_warn(&h->pdev->dev, "Unable to Enable PCI device\n"); @@ -4341,6 +4369,9 @@ static int __devinit cciss_pci_init(ctlr_info_t *h) } cciss_enable_scsi_prefetch(h); cciss_p600_dma_prefetch_quirk(h); + err = cciss_enter_simple_mode(h); + if (err) + goto err_out_free_res; cciss_put_controller_into_performant_mode(h); return 0; @@ -4533,6 +4564,13 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, pmcsr &= ~PCI_PM_CTRL_STATE_MASK; pmcsr |= PCI_D0; pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); + + /* + * The P600 requires a small delay when changing states. + * Otherwise we may think the board did not reset and we bail. + * This for kdump only and is particular to the P600. + */ + msleep(500); } return 0; } @@ -4843,20 +4881,20 @@ static int cciss_request_irq(ctlr_info_t *h, irqreturn_t (*intxhandler)(int, void *)) { if (h->msix_vector || h->msi_vector) { - if (!request_irq(h->intr[PERF_MODE_INT], msixhandler, - IRQF_DISABLED, h->devname, h)) + if (!request_irq(h->intr[h->intr_mode], msixhandler, + 0, h->devname, h)) return 0; dev_err(&h->pdev->dev, "Unable to get msi irq %d" - " for %s\n", h->intr[PERF_MODE_INT], + " for %s\n", h->intr[h->intr_mode], h->devname); return -1; } - if (!request_irq(h->intr[PERF_MODE_INT], intxhandler, - IRQF_DISABLED, h->devname, h)) + if (!request_irq(h->intr[h->intr_mode], intxhandler, + IRQF_SHARED, h->devname, h)) return 0; dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", - h->intr[PERF_MODE_INT], h->devname); + h->intr[h->intr_mode], h->devname); return -1; } @@ -4887,7 +4925,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) { int ctlr = h->ctlr; - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); #ifdef CONFIG_PCI_MSI if (h->msix_vector) pci_disable_msix(h->pdev); @@ -4953,6 +4991,7 @@ reinit_after_soft_reset: h = hba[i]; h->pdev = pdev; h->busy_initializing = 1; + h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT; INIT_LIST_HEAD(&h->cmpQ); INIT_LIST_HEAD(&h->reqQ); mutex_init(&h->busy_shutting_down); @@ -5009,7 +5048,7 @@ reinit_after_soft_reset: dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", h->devname, pdev->device, pci_name(pdev), - h->intr[PERF_MODE_INT], dac ? "" : " not"); + h->intr[h->intr_mode], dac ? "" : " not"); if (cciss_allocate_cmd_pool(h)) goto clean4; @@ -5056,7 +5095,7 @@ reinit_after_soft_reset: spin_lock_irqsave(&h->lock, flags); h->access.set_intr_mask(h, CCISS_INTR_OFF); spin_unlock_irqrestore(&h->lock, flags); - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); rc = cciss_request_irq(h, cciss_msix_discard_completions, cciss_intx_discard_completions); if (rc) { @@ -5126,6 +5165,7 @@ reinit_after_soft_reset: h->cciss_max_sectors = 8192; rebuild_lun_table(h, 1, 0); + cciss_engage_scsi(h); h->busy_initializing = 0; return 1; @@ -5133,7 +5173,7 @@ clean4: cciss_free_cmd_pool(h); cciss_free_scatterlists(h); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); clean2: unregister_blkdev(h->major, h->devname); clean1: @@ -5172,9 +5212,31 @@ static void cciss_shutdown(struct pci_dev *pdev) if (return_code != IO_OK) dev_warn(&h->pdev->dev, "Error flushing cache\n"); h->access.set_intr_mask(h, CCISS_INTR_OFF); - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); } +static int __devinit cciss_enter_simple_mode(struct ctlr_info *h) +{ + u32 trans_support; + + trans_support = readl(&(h->cfgtable->TransportSupport)); + if (!(trans_support & SIMPLE_MODE)) + return -ENOTSUPP; + + h->max_commands = readl(&(h->cfgtable->CmdsOutMax)); + writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest)); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + cciss_wait_for_mode_change_ack(h); + print_cfg_table(h); + if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) { + dev_warn(&h->pdev->dev, "unable to get board into simple mode\n"); + return -ENODEV; + } + h->transMethod = CFGTBL_Trans_Simple; + return 0; +} + + static void __devexit cciss_remove_one(struct pci_dev *pdev) { ctlr_info_t *h; diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index c049548e68b7..7fda30e4a241 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -92,6 +92,7 @@ struct ctlr_info unsigned int intr[4]; unsigned int msix_vector; unsigned int msi_vector; + int intr_mode; int cciss_max_sectors; BYTE cciss_read; BYTE cciss_write; diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 951a4e33b92b..da3311129a0c 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -763,16 +763,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, { case CMD_TARGET_STATUS: /* Pass it up to the upper layers... */ - if( ei->ScsiStatus) - { -#if 0 - printk(KERN_WARNING "cciss: cmd %p " - "has SCSI Status = %x\n", - c, ei->ScsiStatus); -#endif - cmd->result |= (ei->ScsiStatus << 1); - } - else { /* scsi status is zero??? How??? */ + if (!ei->ScsiStatus) { /* Ordinarily, this case should never happen, but there is a bug in some released firmware revisions that allows it to happen @@ -804,6 +795,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, } break; case CMD_PROTOCOL_ERR: + cmd->result = DID_ERROR << 16; dev_warn(&h->pdev->dev, "%p has protocol error\n", c); break; @@ -866,6 +858,7 @@ cciss_scsi_detect(ctlr_info_t *h) sh->can_queue = cciss_tape_cmds; sh->sg_tablesize = h->maxsgentries; sh->max_cmd_len = MAX_COMMAND_SIZE; + sh->max_sectors = h->cciss_max_sectors; ((struct cciss_scsi_adapter_data_t *) h->scsi_ctlr)->scsi_host = sh; @@ -1410,7 +1403,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c, /* track how many SG entries we are using */ if (request_nsgs > h->maxSG) h->maxSG = request_nsgs; - c->Header.SGTotal = (__u8) request_nsgs + chained; + c->Header.SGTotal = (u16) request_nsgs + chained; if (request_nsgs > h->max_cmd_sgentries) c->Header.SGList = h->max_cmd_sgentries; else @@ -1720,5 +1713,6 @@ static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd) /* If no tape support, then these become defined out of existence */ #define cciss_scsi_setup(cntl_num) +#define cciss_engage_scsi(h) #endif /* CONFIG_CISS_SCSI_TAPE */ diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index b2fceb53e809..9125bbeacd4d 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -620,6 +620,7 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) } vendor_id = pdev->vendor; device_id = pdev->device; + revision = pdev->revision; irq = pdev->irq; for(i=0; i<6; i++) @@ -632,7 +633,6 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) } pci_read_config_word(pdev, PCI_COMMAND, &command); - pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision); pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size); pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index cf0e63dd97da..3fbef018ce55 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -65,39 +65,80 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); +void *drbd_md_get_buffer(struct drbd_conf *mdev) +{ + int r; + + wait_event(mdev->misc_wait, + (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || + mdev->state.disk <= D_FAILED); + + return r ? NULL : page_address(mdev->md_io_page); +} + +void drbd_md_put_buffer(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->md_io_in_use)) + wake_up(&mdev->misc_wait); +} + +static bool md_io_allowed(struct drbd_conf *mdev) +{ + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} + +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt = bdev->dc.disk_timeout * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); + if (dt == 0) + dev_err(DEV, "meta-data IO operation timed out\n"); +} + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, int rw, int size) { struct bio *bio; - struct drbd_md_io md_io; int ok; - md_io.mdev = mdev; - init_completion(&md_io.event); - md_io.error = 0; + mdev->md_io.done = 0; + mdev->md_io.error = -ENODEV; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; ok = (bio_add_page(bio, page, size, 0) == size); if (!ok) goto out; - bio->bi_private = &md_io; + bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + ok = 0; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&md_io.event); - ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); + ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: bio_put(bio); @@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int offset = 0; struct page *iop = mdev->md_io_page; - D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); BUG_ON(!bdev->md_bdev); @@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } - mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = (struct al_transaction *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ + if (!buffer) { + dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + return 1; + } buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); @@ -365,7 +411,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) + mdev->ldev->md.al_offset + mdev->al_tr_pos; if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); if (++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) @@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); mdev->al_tr_number++; - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); complete(&((struct update_al_work *)w)->event); put_ldev(mdev); @@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) /* lock out all other meta data io for now, * and make sure the page is mapped. */ - mutex_lock(&mdev->md_io_mutex); - buffer = page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + return 0; /* Find the valid transaction in the log */ for (i = 0; i <= mx; i++) { @@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (rv == 0) continue; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } cnr = be32_to_cpu(buffer->tr_number); @@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!found_valid) { dev_warn(DEV, "No usable activity log found.\n"); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 1; } @@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = drbd_al_read_tr(mdev, bdev, buffer, i); ERR_IF(rv == 0) goto cancel; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } @@ -534,7 +581,7 @@ cancel: mdev->al_tr_pos = 0; /* ok, we are done with it */ - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", transactions, active_extents); @@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d\n", + dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count); - dump_stack(); - - lc_put(mdev->resync, &ext->lce); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return; + ext->rs_failed, count, + drbd_conn_str(mdev->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(mdev, enr); } } else { /* Normally this element should be in the cache, @@ -825,7 +876,11 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, unsigned int enr, count = 0; struct lc_element *e; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { + /* this should be an empty REQ_FLUSH */ + if (size == 0) + return 0; + + if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; @@ -1192,6 +1247,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev) put_ldev(mdev); } spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); return 0; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 7b976296b564..d84566496746 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) static void bm_store_page_idx(struct page *page, unsigned long idx) { BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); - page_private(page) |= idx; + set_page_private(page, idx); } static unsigned long bm_page_to_idx(struct page *page) @@ -289,25 +289,25 @@ static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) return page_nr; } -static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km) +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { struct page *page = b->bm_pages[idx]; - return (unsigned long *) kmap_atomic(page, km); + return (unsigned long *) kmap_atomic(page); } static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { - return __bm_map_pidx(b, idx, KM_IRQ1); + return __bm_map_pidx(b, idx); } -static void __bm_unmap(unsigned long *p_addr, const enum km_type km) +static void __bm_unmap(unsigned long *p_addr) { - kunmap_atomic(p_addr, km); + kunmap_atomic(p_addr); }; static void bm_unmap(unsigned long *p_addr) { - return __bm_unmap(p_addr, KM_IRQ1); + return __bm_unmap(p_addr); } /* long word offset of _bitmap_ sector */ @@ -378,15 +378,14 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) * thread. As we have no disk yet, we are not in the IO path, * not even the IO path of the peer. */ bytes = sizeof(struct page *)*want; - new_pages = kmalloc(bytes, GFP_KERNEL); + new_pages = kzalloc(bytes, GFP_KERNEL); if (!new_pages) { - new_pages = vmalloc(bytes); + new_pages = vzalloc(bytes); if (!new_pages) return NULL; vmalloced = 1; } - memset(new_pages, 0, bytes); if (want >= have) { for (i = 0; i < have; i++) new_pages[i] = old_pages[i]; @@ -544,15 +543,15 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) /* all but last page */ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { - p_addr = __bm_map_pidx(b, idx, KM_USER0); + p_addr = __bm_map_pidx(b, idx); for (i = 0; i < LWPP; i++) bits += hweight_long(p_addr[i]); - __bm_unmap(p_addr, KM_USER0); + __bm_unmap(p_addr); cond_resched(); } /* last (or only) page */ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; - p_addr = __bm_map_pidx(b, idx, KM_USER0); + p_addr = __bm_map_pidx(b, idx); for (i = 0; i < last_word; i++) bits += hweight_long(p_addr[i]); p_addr[last_word] &= cpu_to_lel(mask); @@ -560,7 +559,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) /* 32bit arch, may have an unused padding long */ if (BITS_PER_LONG == 32 && (last_word & 1) == 0) p_addr[last_word+1] = 0; - __bm_unmap(p_addr, KM_USER0); + __bm_unmap(p_addr); return bits; } @@ -887,12 +886,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev) struct bm_aio_ctx { struct drbd_conf *mdev; atomic_t in_flight; - struct completion done; + unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 +#define BM_WRITE_ALL_PAGES 2 int error; + struct kref kref; }; +static void bm_aio_ctx_destroy(struct kref *kref) +{ + struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + + put_ldev(ctx->mdev); + kfree(ctx); +} + /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { @@ -931,20 +940,21 @@ static void bm_async_io_complete(struct bio *bio, int error) bm_page_unlock_io(mdev, idx); - /* FIXME give back to page pool */ if (ctx->flags & BM_AIO_COPY_PAGES) - put_page(bio->bi_io_vec[0].bv_page); + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); bio_put(bio); - if (atomic_dec_and_test(&ctx->in_flight)) - complete(&ctx->done); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&mdev->misc_wait); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + } } static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { - /* we are process context. we always get a bio */ - struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; @@ -967,21 +977,21 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - /* FIXME alloc_page is good enough for now, but actually needs - * to use pre-allocated page pool */ void *src, *dest; - page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); - dest = kmap_atomic(page, KM_USER0); - src = kmap_atomic(b->bm_pages[page_nr], KM_USER1); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + dest = kmap_atomic(page); + src = kmap_atomic(b->bm_pages[page_nr]); memcpy(dest, src, PAGE_SIZE); - kunmap_atomic(src, KM_USER1); - kunmap_atomic(dest, KM_USER0); + kunmap_atomic(src); + kunmap_atomic(dest); bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; @@ -1000,14 +1010,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ -static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx ctx = { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), - .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, - }; + struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1022,7 +1027,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * For lazy writeout, we don't care for ongoing changes to the bitmap, * as we submit copies of pages anyways. */ - if (!ctx.flags) + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + + if (!ctx->flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); num_pages = b->bm_number_of_pages; @@ -1035,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) break; if (rw & WRITE) { - if (bm_test_page_unchanged(b->bm_pages[i])) { + if (!(flags & BM_WRITE_ALL_PAGES) && + bm_test_page_unchanged(b->bm_pages[i])) { dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); continue; } @@ -1047,29 +1073,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id continue; } } - atomic_inc(&ctx.in_flight); - bm_page_io_async(&ctx, i, rw); + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i, rw); ++count; cond_resched(); } /* - * We initialize ctx.in_flight to one to make sure bm_async_io_complete - * will not complete() early, and decrement / test it here. If there + * We initialize ctx->in_flight to one to make sure bm_async_io_complete + * will not set ctx->done early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. */ - if (!atomic_dec_and_test(&ctx.in_flight)) - wait_for_completion(&ctx.done); + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); + else + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", count, jiffies - now); - if (ctx.error) { + if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); - drbd_chk_io_error(mdev, 1, true); - err = -EIO; /* ctx.error ? */ + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); + err = -EIO; /* ctx->error ? */ } + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk failed during IO... */ + now = jiffies; if (rw == WRITE) { drbd_md_flush(mdev); @@ -1083,6 +1118,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } @@ -1092,7 +1128,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ, 0); + return bm_rw(mdev, READ, 0, 0); } /** @@ -1103,7 +1139,18 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE, 0); + return bm_rw(mdev, WRITE, 0, 0); +} + +/** + * drbd_bm_write_all() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will write all pages. + */ +int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0); } /** @@ -1113,7 +1160,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) { - return bm_rw(mdev, WRITE, upper_idx); + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); } @@ -1131,28 +1194,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l */ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) { - struct bm_aio_ctx ctx = { + struct bm_aio_ctx *ctx; + int err; + + if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { + dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); + return 0; + } + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), + .done = 0, .flags = BM_AIO_COPY_PAGES, + .error = 0, + .kref = { ATOMIC_INIT(2) }, }; - if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { - dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); - return 0; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); + kfree(ctx); + return -ENODEV; } - bm_page_io_async(&ctx, idx, WRITE_SYNC); - wait_for_completion(&ctx.done); + bm_page_io_async(ctx, idx, WRITE_SYNC); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); - if (ctx.error) - drbd_chk_io_error(mdev, 1, true); + if (ctx->error) + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; - return ctx.error; + err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error; + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + return err; } /* NOTE @@ -1164,7 +1244,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc * this returns a bit number, NOT a sector! */ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, - const int find_zero_bit, const enum km_type km) + const int find_zero_bit) { struct drbd_bitmap *b = mdev->bitmap; unsigned long *p_addr; @@ -1179,7 +1259,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, while (bm_fo < b->bm_bits) { /* bit offset of the first bit in the page */ bit_offset = bm_fo & ~BITS_PER_PAGE_MASK; - p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km); + p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo)); if (find_zero_bit) i = find_next_zero_bit_le(p_addr, @@ -1188,7 +1268,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, i = find_next_bit_le(p_addr, PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); - __bm_unmap(p_addr, km); + __bm_unmap(p_addr); if (i < PAGE_SIZE*8) { bm_fo = bit_offset + i; if (bm_fo >= b->bm_bits) @@ -1216,7 +1296,7 @@ static unsigned long bm_find_next(struct drbd_conf *mdev, if (BM_DONT_TEST & b->bm_flags) bm_print_lock_info(mdev); - i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); + i = __bm_find_next(mdev, bm_fo, find_zero_bit); spin_unlock_irq(&b->bm_lock); return i; @@ -1240,13 +1320,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) { /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */ - return __bm_find_next(mdev, bm_fo, 0, KM_USER1); + return __bm_find_next(mdev, bm_fo, 0); } unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) { /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */ - return __bm_find_next(mdev, bm_fo, 1, KM_USER1); + return __bm_find_next(mdev, bm_fo, 1); } /* returns number of bits actually changed. @@ -1274,14 +1354,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); if (page_nr != last_page_nr) { if (p_addr) - __bm_unmap(p_addr, KM_IRQ1); + __bm_unmap(p_addr); if (c < 0) bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); else if (c > 0) bm_set_page_need_writeout(b->bm_pages[last_page_nr]); changed_total += c; c = 0; - p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1); + p_addr = __bm_map_pidx(b, page_nr); last_page_nr = page_nr; } if (val) @@ -1290,7 +1370,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); } if (p_addr) - __bm_unmap(p_addr, KM_IRQ1); + __bm_unmap(p_addr); if (c < 0) bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); else if (c > 0) @@ -1343,13 +1423,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, { int i; int bits; - unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1); + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; b->bm_set += BITS_PER_LONG - bits; } - kunmap_atomic(paddr, KM_IRQ1); + kunmap_atomic(paddr); } /* Same thing as drbd_bm_set_bits, @@ -1408,10 +1488,17 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi first_word = 0; spin_lock_irq(&b->bm_lock); } - /* last page (respectively only page, for first page == last page) */ last_word = MLPP(el >> LN2_BPL); - bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); + + /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples). + * ==> e = 32767, el = 32768, last_page = 2, + * and now last_word = 0. + * We do not want to touch last_page in this case, + * as we did not allocate it, it is not present in bitmap->bm_pages. + */ + if (last_word) + bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); /* possibly trailing bits. * example: (e & 63) == 63, el will be e+1. diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ef2ceed3be4b..b953cc7c9c00 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -28,7 +28,6 @@ #include <linux/compiler.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/list.h> #include <linux/sched.h> #include <linux/bitops.h> @@ -60,8 +59,8 @@ /* module parameter, defined in drbd_main.c */ extern unsigned int minor_count; -extern int disable_sendpage; -extern int allow_oos; +extern bool disable_sendpage; +extern bool allow_oos; extern unsigned int cn_idx; #ifdef CONFIG_DRBD_FAULT_INJECTION @@ -713,7 +712,6 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ - int seq_num; unsigned long start_time; }; @@ -815,7 +813,6 @@ enum { SIGNAL_ASENDER, /* whether asender wants to be interrupted */ SEND_PING, /* whether asender should send a ping asap */ - UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ @@ -826,7 +823,6 @@ enum { CRASHED_PRIMARY, /* This node was a crashed primary. * Gets cleared when the state.conn * goes into C_CONNECTED state. */ - NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ CONSIDER_RESYNC, MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ @@ -836,6 +832,7 @@ enum { BITMAP_IO_QUEUED, /* Started bitmap IO */ GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ WAS_IO_ERROR, /* Local disk failed returned IO error */ + FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -852,6 +849,14 @@ enum { NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ + + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -863,31 +868,30 @@ enum bm_flag { BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ /* currently locked for bulk operation */ - BM_LOCKED_MASK = 0x7, + BM_LOCKED_MASK = 0xf, /* in detail, that is: */ BM_DONT_CLEAR = 0x1, BM_DONT_SET = 0x2, BM_DONT_TEST = 0x4, + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + /* (test bit, count bit) allowed (common case) */ - BM_LOCKED_TEST_ALLOWED = 0x3, + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, /* testing bits, as well as setting new bits allowed, but clearing bits * would be unexpected. Used during bitmap receive. Setting new bits * requires sending of "out-of-sync" information, though. */ - BM_LOCKED_SET_ALLOWED = 0x1, + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, - /* clear is not expected while bitmap is locked for bulk operation */ + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, }; - -/* TODO sort members for performance - * MAYBE group them further */ - -/* THINK maybe we actually want to use the default "event/%s" worker threads - * or similar in linux 2.6, which uses per cpu data and threads. - */ struct drbd_work_queue { struct list_head q; struct semaphore s; /* producers up it, worker down()s it */ @@ -928,7 +932,7 @@ struct drbd_md { #define NL_INT64(pn,pr,member) __u64 member; #define NL_BIT(pn,pr,member) unsigned member:1; #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; -#include "linux/drbd_nl.h" +#include <linux/drbd_nl.h> struct drbd_backing_dev { struct block_device *backing_bdev; @@ -939,8 +943,7 @@ struct drbd_backing_dev { }; struct drbd_md_io { - struct drbd_conf *mdev; - struct completion event; + unsigned int done; int error; }; @@ -1023,6 +1026,7 @@ struct drbd_conf { struct drbd_tl_epoch *newest_tle; struct drbd_tl_epoch *oldest_tle; struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; struct hlist_head *tl_hash; unsigned int tl_hash_s; @@ -1057,6 +1061,8 @@ struct drbd_conf { struct crypto_hash *csums_tfm; struct crypto_hash *verify_tfm; + unsigned long last_reattach_jif; + unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread asender; @@ -1095,7 +1101,8 @@ struct drbd_conf { wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct mutex md_io_mutex; /* protects the md_io_buffer */ + struct drbd_md_io md_io; + atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -1129,8 +1136,8 @@ struct drbd_conf { int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ - int peer_max_bio_size; - int local_max_bio_size; + unsigned int peer_max_bio_size; + unsigned int local_max_bio_size; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1229,8 +1236,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); -extern int _drbd_send_state(struct drbd_conf *mdev); -extern int drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); +extern int drbd_send_current_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); @@ -1434,9 +1441,9 @@ struct bm_extent { * hash table. */ #define HT_SHIFT 8 #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) -#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ -#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ /* Number of elements in the app_reads_hash */ #define APP_R_HSIZE 15 @@ -1462,6 +1469,8 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); @@ -1494,11 +1503,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -extern struct page *drbd_pp_pool; /* drbd's page pool */ +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. + */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); @@ -1507,7 +1543,7 @@ extern void drbd_free_mdev(struct drbd_conf *mdev); extern int proc_details; /* drbd_req */ -extern int drbd_make_request(struct request_queue *q, struct bio *bio); +extern void drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); extern int is_valid_ar_handle(struct drbd_request *, sector_t); @@ -1537,8 +1573,12 @@ extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_conf *mdev); +extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); @@ -1755,19 +1795,6 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { struct page *page = e->pages; @@ -1778,7 +1805,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) return 0; } - static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, @@ -1821,12 +1847,20 @@ static inline int drbd_request_state(struct drbd_conf *mdev, return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); } +enum drbd_force_detach_flags { + DRBD_IO_ERROR, + DRBD_META_IO_ERROR, + DRBD_FORCE_DETACH, +}; + #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) -static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) +static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, + enum drbd_force_detach_flags forcedetach, + const char *where) { switch (mdev->ldev->dc.on_io_error) { case EP_PASS_ON: - if (!forcedetach) { + if (forcedetach == DRBD_IO_ERROR) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); if (mdev->state.disk > D_INCONSISTENT) @@ -1837,6 +1871,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, case EP_DETACH: case EP_CALL_HELPER: set_bit(WAS_IO_ERROR, &mdev->flags); + if (forcedetach == DRBD_FORCE_DETACH) + set_bit(FORCE_DETACH, &mdev->flags); if (mdev->state.disk > D_FAILED) { _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); dev_err(DEV, @@ -1856,7 +1892,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, */ #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) static inline void drbd_chk_io_error_(struct drbd_conf *mdev, - int error, int forcedetach, const char *where) + int error, enum drbd_force_detach_flags forcedetach, const char *where) { if (error) { unsigned long flags; @@ -2231,7 +2267,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * Note: currently we don't support such large bitmaps on 32bit * arch anyways, but no harm done to be prepared for it here. */ - unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; + unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; unsigned long left = *bits_left >> shift; unsigned long total = 1UL + (mdev->rs_total >> shift); unsigned long tmp = 1000UL - left * 1000UL/total; @@ -2307,12 +2343,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: + case D_FAILED: /* disk state is stable as well. */ break; /* no new io accepted during tansitional states */ case D_ATTACHING: - case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: @@ -2386,15 +2422,17 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); D_ASSERT(ap_bio >= 0); + + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } + /* this currently does wake_up for every dec_ap_bio! * maybe rather introduce some type of hysteresis? * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ if (ap_bio < mxb) wake_up(&mdev->misc_wait); - if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { - if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) - drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); - } } static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 0358e55356c8..f93a0320e952 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void _tl_clear(struct drbd_conf *mdev); MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " "Lars Ellenberg <lars@linbit.com>"); @@ -117,8 +118,8 @@ module_param(fault_devs, int, 0644); /* module parameter, defined */ unsigned int minor_count = DRBD_MINOR_COUNT_DEF; -int disable_sendpage; -int allow_oos; +bool disable_sendpage; +bool allow_oos; unsigned int cn_idx = CN_IDX_DRBD; int proc_details; /* Detail level in proc drbd*/ @@ -139,6 +140,8 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -159,7 +162,24 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) +static void bio_destructor_drbd(struct bio *bio) +{ + bio_free(bio, drbd_md_io_bio_set); +} + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + bio->bi_destructor = bio_destructor_drbd; + return bio; +} #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will @@ -208,6 +228,7 @@ static int tl_init(struct drbd_conf *mdev) mdev->oldest_tle = b; mdev->newest_tle = b; INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + INIT_LIST_HEAD(&mdev->barrier_acked_requests); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -246,9 +267,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) new->n_writes = 0; newest_before = mdev->newest_tle; - /* never send a barrier number == 0, because that is special-cased - * when using TCQ for our write ordering code */ - new->br_number = (newest_before->br_number+1) ?: 1; + new->br_number = newest_before->br_number+1; if (mdev->newest_tle != new) { mdev->newest_tle->next = new; mdev->newest_tle = new; @@ -311,7 +330,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, These have been list_move'd to the out_of_sequence_requests list in _req_mod(, barrier_acked) above. */ - list_del_init(&b->requests); + list_splice_init(&b->requests, &mdev->barrier_acked_requests); nob = b->next; if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { @@ -411,6 +430,14 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) b = tmp; list_splice(&carry_reads, &b->requests); } + + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. */ + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); + } } @@ -424,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) */ void tl_clear(struct drbd_conf *mdev) { + spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); + spin_unlock_irq(&mdev->req_lock); +} + +static void _tl_clear(struct drbd_conf *mdev) +{ struct list_head *le, *tle; struct drbd_request *r; - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, connection_lost_while_pending); /* we expect this list to be empty. */ @@ -447,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev) memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - spin_unlock_irq(&mdev->req_lock); } void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) @@ -458,6 +489,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } /** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ +void tl_abort_disk_io(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + b = b->next; + } + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + + spin_unlock_irq(&mdev->req_lock); +} + +/** * cl_wide_st_chg() - true if the state change is a cluster wide one * @mdev: DRBD device. * @os: old (current) state. @@ -470,7 +533,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev, ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); } @@ -509,8 +572,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort); + union drbd_state ns, enum sanitize_state_warnings *warn); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -785,6 +856,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) rv = SS_IN_TRANSIENT_STATE; + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... */ + if (test_bit(STATE_SENT, &mdev->flags) && + !(os.conn == C_WF_REPORT_PARAMS || + (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -803,6 +881,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, return rv; } +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + /** * sanitize_state() - Resolves implicitly necessary additional changes to a state transition * @mdev: DRBD device. @@ -814,11 +907,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort) + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + if (warn) + *warn = NO_WARNING; + fp = FP_DONT_CARE; if (get_ldev(mdev)) { fp = mdev->ldev->dc.fencing; @@ -833,18 +929,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. * If you try to go into some Sync* state, that shall fail (elsewhere). */ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) ns.conn = os.conn; /* we cannot fail (again) if we already detached */ if (ns.disk == D_FAILED && os.disk == D_DISKLESS) ns.disk = D_DISKLESS; - /* if we are only D_ATTACHING yet, - * we can (and should) go directly to D_DISKLESS. */ - if (ns.disk == D_FAILED && os.disk == D_ATTACHING) - ns.disk = D_DISKLESS; - /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -863,10 +954,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn_sync_abort) - *warn_sync_abort = - os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? - "Online-verify" : "Resync"; + if (warn) + *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? + ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; ns.conn = C_CONNECTED; } @@ -877,7 +967,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = mdev->new_state_tmp.disk; ns.pdsk = mdev->new_state_tmp.pdsk; } else { - dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; ns.disk = D_DISKLESS; ns.pdsk = D_UNKNOWN; } @@ -959,16 +1050,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = disk_max; if (ns.disk < disk_min) { - dev_warn(DEV, "Implicitly set disk from %s to %s\n", - drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; ns.disk = disk_min; } if (ns.pdsk > pdsk_max) ns.pdsk = pdsk_max; if (ns.pdsk < pdsk_min) { - dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", - drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; ns.pdsk = pdsk_min; } @@ -1045,12 +1136,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, { union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; - const char *warn_sync_abort = NULL; + enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; os = mdev->state; - ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + ns = sanitize_state(mdev, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1076,8 +1167,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } - if (warn_sync_abort) - dev_warn(DEV, "%s aborted.\n", warn_sync_abort); + print_sanitize_warnings(mdev, ssw); { char *pbp, pb[300]; @@ -1243,7 +1333,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_stop_nowait(&mdev->receiver); /* Upon network failure, we need to restart the receiver. */ - if (os.conn > C_TEAR_DOWN && + if (os.conn > C_WF_CONNECTION && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); @@ -1251,6 +1341,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) drbd_resume_al(mdev); + /* remember last connect and attach times so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) + mdev->last_reconnect_jif = jiffies; + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + mdev->last_reattach_jif = jiffies; + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -1354,12 +1453,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ + if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) + mod_timer(&mdev->request_timer, jiffies + HZ); + nsm.i = -1; if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) what = resend; - if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) what = restart_frozen_disk_io; if (what != nothing) @@ -1369,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (ns.susp_fen) { /* case1: The outdate peer handler is successful: */ if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { - tl_clear(mdev); if (test_bit(NEW_CUR_UUID, &mdev->flags)) { drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); } spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } @@ -1407,8 +1510,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + drbd_rs_cancel_all(mdev); + drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1441,11 +1551,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(mdev); drbd_send_uuids(mdev); } - /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) /* We may still be Primary ourselves. @@ -1473,14 +1583,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1490,10 +1600,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* We are in the progress to start a full sync... */ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1513,37 +1623,54 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; - int was_io_error; + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS, - * so it is safe to dreference ldev here. */ - eh = mdev->ldev->dc.on_io_error; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I am detaching my disk\n"); - else - dev_err(DEV, "Sending state for detaching disk failed\n"); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); + * our cleanup here with the transition to D_DISKLESS. + * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (mdev->ldev) { + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) + tl_abort_disk_io(mdev); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + } put_ldev(mdev); - - if (was_io_error && eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); } /* second half of local IO error, failure to attach, @@ -1557,20 +1684,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, "ASSERT FAILED: disk is %s while going diskless\n", drbd_disk_str(mdev->state.disk)); - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I'm now diskless.\n"); /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); } /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) - drbd_send_state(mdev); + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1588,7 +1712,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. */ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &mdev->flags); + wake_up(&mdev->state_wait); + } /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -1598,8 +1728,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, * No harm done if some bits change during this phase. */ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, - "write from resync_finished", BM_LOCKED_SET_ALLOWED); + drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(mdev); } @@ -2057,7 +2187,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; + uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); @@ -2071,7 +2205,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl { struct p_sizes p; sector_t d_size, u_size; - int q_order_type, max_bio_size; + int q_order_type; + unsigned int max_bio_size; int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -2080,7 +2215,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; @@ -2089,6 +2224,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } + /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ + if (mdev->agreed_pro_version <= 94) + max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); @@ -2102,10 +2241,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl } /** - * drbd_send_state() - Sends the drbd state to the peer + * drbd_send_current_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_state(struct drbd_conf *mdev) +int drbd_send_current_state(struct drbd_conf *mdev) { struct socket *sock; struct p_state p; @@ -2131,6 +2270,37 @@ int drbd_send_state(struct drbd_conf *mdev) return ok; } +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(state.i); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) { @@ -2615,7 +2785,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_no_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2629,7 +2799,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2695,8 +2865,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(req->seq_num = - atomic_add_return(1, &mdev->packet_seq)); + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); @@ -2987,8 +3156,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); + atomic_set(&mdev->md_io_in_use, 0); - mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); mutex_init(&mdev->meta.mutex); sema_init(&mdev->data.work.s, 0); @@ -3126,6 +3295,10 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -3139,6 +3312,8 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -3162,6 +3337,8 @@ static int drbd_create_mempools(void) drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3185,6 +3362,16 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ +#ifdef COMPAT_HAVE_BIOSET_CREATE + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; +#endif + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) @@ -3262,6 +3449,8 @@ static void drbd_delete_device(unsigned int minor) if (!mdev) return; + del_timer_sync(&mdev->request_timer); + /* paranoia asserts */ if (mdev->open_cnt != 0) dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, @@ -3344,9 +3533,9 @@ static void drbd_cleanup(void) } /** - * drbd_congested() - Callback for pdflush + * drbd_congested() - Callback for the flusher thread * @congested_data: User data - * @bdi_bits: Bits pdflush is currently interested in + * @bdi_bits: Bits the BDI flusher thread is currently interested in * * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. */ @@ -3364,6 +3553,22 @@ static int drbd_congested(void *congested_data, int bdi_bits) goto out; } + if (test_bit(CALLBACK_PENDING, &mdev->flags)) { + r |= (1 << BDI_async_congested); + /* Without good local data, we would need to read from remote, + * and that would need the worker thread as well, which is + * currently blocked waiting for that usermode helper to + * finish. + */ + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) + r |= (1 << BDI_sync_congested); + else + put_ldev(mdev); + r &= bdi_bits; + reason = 'c'; + goto out; + } + if (get_ldev(mdev)) { q = bdev_get_queue(mdev->ldev->backing_bdev); r = bdi_congested(&q->backing_dev_info, bdi_bits); @@ -3427,6 +3632,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) q->backing_dev_info.congested_data = mdev; blk_queue_make_request(q, drbd_make_request); + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); @@ -3666,8 +3872,10 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; + memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -3691,14 +3899,15 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); } /* Update mdev->ldev->md.la_size_sect, * since we updated it on metadata. */ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); +out: put_ldev(mdev); } @@ -3718,8 +3927,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3769,9 +3979,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) spin_lock_irq(&mdev->req_lock); if (mdev->state.conn < C_CONNECTED) { - int peer; + unsigned int peer; peer = be32_to_cpu(buffer->la_peer_max_bio_size); - peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); mdev->peer_max_bio_size = peer; } spin_unlock_irq(&mdev->req_lock); @@ -3780,7 +3990,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = 127; err: - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); + out: put_ldev(mdev); return rv; @@ -4183,12 +4394,11 @@ const char *drbd_buildtag(void) static char buildtag[38] = "\0uilt-in"; if (buildtag[0] == 0) { -#ifdef CONFIG_MODULES - if (THIS_MODULE != NULL) - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); - else +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; #endif - buildtag[0] = 'b'; } return buildtag; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 0feab261e295..edb490aad8b4 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -94,7 +94,7 @@ static int name ## _from_tags(struct drbd_conf *mdev, \ arg->member ## _len = dlen; \ memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ break; -#include "linux/drbd_nl.h" +#include <linux/drbd_nl.h> /* Generate the struct to tag_list functions */ #define NL_PACKET(name, number, fields) \ @@ -129,7 +129,7 @@ name ## _to_tags(struct drbd_conf *mdev, \ put_unaligned(arg->member ## _len, tags++); \ memcpy(tags, arg->member, arg->member ## _len); \ tags = (unsigned short *)((char *)tags + arg->member ## _len); -#include "linux/drbd_nl.h" +#include <linux/drbd_nl.h> void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); void drbd_nl_send_reply(struct cn_msg *, int); @@ -147,6 +147,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) char *argv[] = {usermode_helper, cmd, mb, NULL }; int ret; + if (current == mdev->worker.task) + set_bit(CALLBACK_PENDING, &mdev->flags); + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); if (get_net_conf(mdev)) { @@ -179,7 +182,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); drbd_bcast_ev_helper(mdev, cmd); - ret = call_usermodehelper(usermode_helper, argv, envp, 1); + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", usermode_helper, cmd, mb, @@ -189,6 +192,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); + if (current == mdev->worker.task) + clear_bit(CALLBACK_PENDING, &mdev->flags); + if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@ -289,7 +295,7 @@ static int _try_outdate_peer_async(void *data) */ spin_lock_irq(&mdev->req_lock); ns = mdev->state; - if (ns.conn < C_WF_REPORT_PARAMS) { + if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { ns.pdsk = nps; _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); } @@ -432,7 +438,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } drbd_md_sync(mdev); @@ -668,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ - err = drbd_bitmap_io(mdev, &drbd_bm_write, - "size changed", BM_LOCKED_MASK); + err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); if (err) { rv = dev_size_error; goto out; @@ -795,8 +801,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev) static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - int max_hw_sectors = max_bio_size >> 9; - int max_segments = 0; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; if (get_ldev_if_state(mdev, D_ATTACHING)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; @@ -829,7 +835,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) { - int now, new, local, peer; + unsigned int now, new, local, peer; now = queue_max_hw_sectors(mdev->rq_queue) << 9; local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ @@ -840,23 +846,25 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) mdev->local_max_bio_size = local; put_ldev(mdev); } + local = min(local, DRBD_MAX_BIO_SIZE); /* We may ignore peer limits if the peer is modern enough. Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) - peer = mdev->peer_max_bio_size; - else if (mdev->agreed_pro_version == 94) + if (mdev->agreed_pro_version < 94) { + peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; } - new = min_t(int, local, peer); + new = min(local, peer); if (mdev->state.role == R_PRIMARY && new < now) - dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now); if (new != now) dev_info(DEV, "max BIO size = %u\n", new); @@ -949,6 +957,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * to realize a "hot spare" feature (not that I'd recommend that) */ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &mdev->flags); + + /* and no leftover from previously aborted resync or verify, either */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { @@ -1032,7 +1048,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1046,7 +1062,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TO_SMALL; + retcode = ERR_MD_DISK_TOO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); @@ -1057,7 +1073,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * (we may currently be R_PRIMARY with no local disk...) */ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1138,7 +1154,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } @@ -1336,17 +1352,35 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { enum drbd_ret_code retcode; int ret; + struct detach dt = {}; + + if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { + reply->ret_code = ERR_MANDATORY_TAG; + goto out; + } + + if (dt.detach_force) { + set_bit(FORCE_DETACH, &mdev->flags); + drbd_force_state(mdev, NS(disk, D_FAILED)); + reply->ret_code = SS_SUCCESS; + goto out; + } + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + drbd_md_put_buffer(mdev); /* D_FAILED will transition to DISKLESS. */ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; reply->ret_code = retcode; +out: return 0; } @@ -1711,7 +1745,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, if (rs.no_resync && mdev->agreed_pro_version < 93) { retcode = ERR_NEED_APV_93; - goto fail; + goto fail_ldev; } if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) @@ -1738,6 +1772,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, fail: reply->ret_code = retcode; return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; } static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -1940,8 +1978,11 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1959,6 +2000,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -1979,8 +2021,11 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); @@ -1998,6 +2043,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2170,11 +2216,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* w_make_ov_request expects position to be aligned */ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + drbd_resume_io(mdev); return 0; } @@ -2297,7 +2345,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms return; } - if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN)) { retcode = ERR_PERM; goto fail; } @@ -2526,10 +2574,10 @@ void drbd_bcast_ee(struct drbd_conf *mdev, page = e->pages; page_chain_for_each(page) { - void *d = kmap_atomic(page, KM_USER0); + void *d = kmap_atomic(page); unsigned l = min_t(unsigned, len, PAGE_SIZE); memcpy(tl, d, l); - kunmap_atomic(d, KM_USER0); + kunmap_atomic(d); tl = (unsigned short*)((char*)tl + l); len -= l; if (len == 0) diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2959cdfb77f5..5496104f90b9 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) if (unlikely(v >= 1000000)) { /* cool: > GiByte/s */ seq_printf(seq, "%ld,", v / 1000000); - v /= 1000000; + v %= 1000000; seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); } else if (likely(v >= 1000)) seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); @@ -245,6 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) mdev->state.role == R_SECONDARY) { seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { + /* reset mdev->congestion_reason */ + bdi_rw_congested(&mdev->rq_queue->backing_dev_info); + seq_printf(seq, "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 43beaca53179..c74ca2df7431 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -277,6 +277,9 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; + if (page == NULL) + return; + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) i = page_chain_free(page); else { @@ -316,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, gfp_t gfp_mask) __must_hold(local) { struct drbd_epoch_entry *e; - struct page *page; + struct page *page = NULL; unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) @@ -329,9 +332,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } - page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); - if (!page) - goto fail; + if (data_size) { + page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; + } INIT_HLIST_NODE(&e->collision); e->epoch = NULL; @@ -466,6 +471,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what, goto out; } (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); out: return err; @@ -664,7 +670,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) timeo = mdev->net_conf->try_connect_int * HZ; timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ - s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ s_listen->sk->sk_rcvtimeo = timeo; s_listen->sk->sk_sndtimeo = timeo; drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, @@ -750,6 +756,7 @@ static int drbd_connect(struct drbd_conf *mdev) { struct socket *s, *sock, *msock; int try, h, ok; + enum drbd_state_rv rv; D_ASSERT(!mdev->data.socket); @@ -841,8 +848,8 @@ retry: } } while (1); - msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ - sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ sock->sk->sk_allocation = GFP_NOIO; msock->sk->sk_allocation = GFP_NOIO; @@ -888,25 +895,32 @@ retry: } } - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) - return 0; - sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set(&mdev->packet_seq, 0); mdev->peer_seq = 0; - drbd_thread_start(&mdev->asender); - if (drbd_send_protocol(mdev) == -1) return -1; + set_bit(STATE_SENT, &mdev->flags); drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + + spin_lock_irq(&mdev->req_lock); + rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); + if (mdev->state.conn != C_WF_REPORT_PARAMS) + clear_bit(STATE_SENT, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) + return 0; + + drbd_thread_start(&mdev->asender); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return 1; @@ -957,7 +971,7 @@ static void drbd_flush(struct drbd_conf *mdev) rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, NULL); if (rv) { - dev_err(DEV, "local disk flush failed with status %d\n", rv); + dev_info(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ @@ -1001,13 +1015,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); spin_lock(&mdev->epoch_lock); } - dec_unacked(mdev); + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(mdev); if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); @@ -1096,7 +1111,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the - * request in more than one bio. */ + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { @@ -1256,7 +1275,6 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ data_size -= dgs; - ERR_IF(data_size == 0) return NULL; ERR_IF(data_size & 0x1ff) return NULL; ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; @@ -1277,6 +1295,9 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ if (!e) return NULL; + if (!data_size) + return e; + ds = data_size; page = e->pages; page_chain_for_each(page) { @@ -1583,6 +1604,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u return ok; } +static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) +{ + + struct drbd_epoch_entry *rs_e; + bool rv = 0; + + spin_lock_irq(&mdev->req_lock); + list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { + if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&mdev->req_lock); + + return rv; +} + /* Called from receive_Data. * Synchronize packets on sock with packets on msock. * @@ -1683,6 +1722,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(mdev, dp_flags); + if (e->pages == NULL) { + D_ASSERT(e->size == 0); + D_ASSERT(dp_flags & DP_FLUSH); + } if (dp_flags & DP_MAY_SET_IN_SYNC) e->flags |= EE_MAY_SET_IN_SYNC; @@ -1826,6 +1869,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned list_add(&e->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->req_lock); + if (mdev->state.conn == C_SYNC_TARGET) + wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); + switch (mdev->net_conf->wire_protocol) { case DRBD_PROT_C: inc_unacked(mdev); @@ -2420,7 +2466,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; - dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); + dev_info(DEV, "Lost last syncUUID packet, corrected:\n"); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); return -1; @@ -2806,10 +2852,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi if (apv >= 88) { if (apv == 88) { - if (data_size > SHARED_SECRET_MAX) { - dev_err(DEV, "verify-alg too long, " - "peer wants %u, accepting only %u byte\n", - data_size, SHARED_SECRET_MAX); + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + dev_err(DEV, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); return false; } @@ -3168,9 +3214,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - /* peer says his disk is uptodate, while we think it is inconsistent, - * and this happens while we think we have a sync going on. */ - if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return false; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { /* If we are (becoming) SyncSource, but peer is still in sync * preparation, ignore its uptodate-ness to avoid flapping, it @@ -3288,7 +3345,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } } @@ -3755,11 +3812,18 @@ void drbd_free_tl_hash(struct drbd_conf *mdev) mdev->ee_hash = NULL; mdev->ee_hash_s = 0; - /* paranoia code */ - for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", - (int)(h - mdev->tl_hash), h->first); + /* We may not have had the chance to wait for all locally pending + * application requests. The hlist_add_fake() prevents access after + * free on master bio completion. */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { + struct drbd_request *req; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(req, pos, n, h, collision) { + hlist_del_init(&req->collision); + hlist_add_fake(&req->collision); + } + } + kfree(mdev->tl_hash); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -3776,6 +3840,13 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (mdev->state.conn == C_STANDALONE) return; + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + /* asender does not clean up anything. it must not interfere, either */ drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); @@ -3803,8 +3874,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); - del_timer(&mdev->request_timer); - /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); @@ -4433,7 +4502,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) if (mdev->state.conn == C_AHEAD && atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { mdev->start_resync_timer.expires = jiffies + HZ; add_timer(&mdev->start_resync_timer); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3424d675b769..01b2ac641c7b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req const int rw = bio_data_dir(bio); int cpu; cpu = part_stat_lock(); + part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; struct drbd_conf *mdev = req->mdev; - /* only WRITES may end up here without a master bio (on barrier ack) */ - int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + int rw = req->rq_state & RQ_WRITE ? WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING) + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) return; if (req->master_bio) { @@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) req->master_bio = NULL; } + if (s & RQ_LOCAL_PENDING) + return; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, * or protocol C P_WRITE_ACK, @@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case completed_ok: - if (bio_data_dir(req->master_bio) == WRITE) + if (req->rq_state & RQ_WRITE) mdev->writ_cnt += req->size>>9; else mdev->read_cnt += req->size>>9; @@ -438,16 +441,22 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); + break; + + case abort_disk_io: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; break; case write_completed_with_error: req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_ahead_completed_with_error: @@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_completed_with_error: @@ -464,10 +472,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + if (req->rq_state & RQ_LOCAL_ABORTED) { + _req_may_be_done(req, m); + break; + } - __drbd_chk_io_error(mdev, false); - put_ldev(mdev); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); + + goto_queue_for_net_read: + + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); /* no point in retrying if there is no good remote data, * or we have no connection. */ @@ -556,10 +570,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; - case oos_handed_to_network: - /* actually the same */ + case read_retry_remote_canceled: case send_canceled: - /* treat it the same */ case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ @@ -589,17 +601,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; - /* because _drbd_send_zc_bio could sleep, and may want to - * dereference the bio even after the "write_acked_by_peer" and - * "completed_ok" events came in, once we return from - * _drbd_send_zc_bio (drbd_send_dblock), we have to check - * whether it is done already, and end it. */ _req_may_be_done_not_susp(req, m); break; - case read_retry_remote_canceled: + case oos_handed_to_network: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ req->rq_state &= ~RQ_NET_QUEUED; - /* fall through, in case we raced with drbd_disconnect */ + req->rq_state |= RQ_NET_DONE; + _req_may_be_done_not_susp(req, m); + break; + case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? */ @@ -616,8 +628,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case write_acked_by_peer_and_sis: - req->rq_state |= RQ_NET_SIS; case conflict_discarded_by_peer: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential @@ -628,18 +638,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ + case write_acked_by_peer_and_sis: case write_acked_by_peer: + if (what == write_acked_by_peer_and_sis) + req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. - * Nothing to do here. + * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater - * for volatile write-back caches on lower level devices. - * - * A barrier request is expected to have forced all prior - * requests onto stable storage, so completion of a barrier - * request could set NET_DONE right here, and not wait for the - * P_BARRIER_ACK, but that is an unnecessary optimization. */ + * for volatile write-back caches on lower level devices. */ - /* this makes it effectively the same as for: */ case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. * see also notes above in handed_over_to_network about @@ -688,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case resend: + /* Simply complete (local only) READs. */ + if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { + _req_may_be_done(req, m); + break; + } + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK before the connection loss (B&C only); only P_BARRIER_ACK was missing. Trowing them out of the TL here by pretending we got a BARRIER_ACK @@ -763,6 +776,40 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); } +static void maybe_pull_ahead(struct drbd_conf *mdev) +{ + int congested = 0; + + /* If I don't even have good local storage, we can not reasonably try + * to pull ahead of the peer. We also need the local reference to make + * sure mdev->act_log is there. + * Note: caller has to make sure that net_conf is there. + */ + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) + return; + + if (mdev->net_conf->cong_fill && + atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { + dev_info(DEV, "Congestion-fill threshold reached\n"); + congested = 1; + } + + if (mdev->act_log->used >= mdev->net_conf->cong_extents) { + dev_info(DEV, "Congestion-extents threshold reached\n"); + congested = 1; + } + + if (congested) { + queue_barrier(mdev); /* last barrier, after mirrored writes */ + + if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); + else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); + } + put_ldev(mdev); +} + static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { const int rw = bio_rw(bio); @@ -773,6 +820,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns int local, remote, send_oos = 0; int err = -EIO; int ret = 0; + union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -792,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns req->private_bio = NULL; } if (rw == WRITE) { - remote = 1; + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. */ + if (unlikely(size == 0)) { + /* The only size==0 bios we expect are empty flushes. */ + D_ASSERT(bio->bi_rw & REQ_FLUSH); + remote = 0; + } else + remote = 1; } else { /* READ || READA */ if (local) { @@ -828,14 +884,18 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns * extent. This waits for any resync activity in the corresponding * resync extent to finish, and, if necessary, pulls in the target * extent into the activity log, which involves further disk io because - * of transactional on-disk meta data updates. */ - if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { + * of transactional on-disk meta data updates. + * Empty flushes don't need to go into the activity log, they can only + * flush data for pending writes which are already in there. */ + if (rw == WRITE && local && size + && !test_bit(AL_SUSPENDED, &mdev->flags)) { req->rq_state |= RQ_IN_ACT_LOG; drbd_al_begin_io(mdev, sector); } - remote = remote && drbd_should_do_remote(mdev->state); - send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); + s = mdev->state; + remote = remote && drbd_should_do_remote(s); + send_oos = rw == WRITE && drbd_should_send_oos(s); D_ASSERT(!(remote && send_oos)); if (!(local || remote) && !is_susp(mdev->state)) { @@ -867,7 +927,7 @@ allocate_barrier: if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of - generic_make_request() to restart processing of this + drbd_make_request() to restart processing of this bio. In the next call to drbd_make_request we sleep in inc_ap_bio() */ ret = 1; @@ -951,7 +1011,10 @@ allocate_barrier: if (rw == WRITE && _req_conflicts(req)) goto fail_conflicting; - list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); + /* no point in adding empty flushes to the transfer log, + * they are mapped to drbd barriers already. */ + if (likely(size!=0)) + list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); /* NOTE remote first: to get the concurrent write detection right, * we must register the request before start of local IO. */ @@ -968,29 +1031,16 @@ allocate_barrier: _req_mod(req, queue_for_send_oos); if (remote && - mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) { - int congested = 0; - - if (mdev->net_conf->cong_fill && - atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { - dev_info(DEV, "Congestion-fill threshold reached\n"); - congested = 1; - } - - if (mdev->act_log->used >= mdev->net_conf->cong_extents) { - dev_info(DEV, "Congestion-extents threshold reached\n"); - congested = 1; - } - - if (congested) { - queue_barrier(mdev); /* last barrier, after mirrored writes */ - - if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) - _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); - else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ - _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); - } - } + mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) + maybe_pull_ahead(mdev); + + /* If this was a flush, queue a drbd barrier/start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(bio->bi_rw & REQ_FLUSH) + && mdev->newest_tle->n_writes + && drbd_should_do_remote(mdev->state)) + queue_barrier(mdev); spin_unlock_irq(&mdev->req_lock); kfree(b); /* if someone else has beaten us to it... */ @@ -1073,7 +1123,7 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) return 0; } -int drbd_make_request(struct request_queue *q, struct bio *bio) +void drbd_make_request(struct request_queue *q, struct bio *bio) { unsigned int s_enr, e_enr; struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; @@ -1081,7 +1131,7 @@ int drbd_make_request(struct request_queue *q, struct bio *bio) if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { bio_endio(bio, -EPERM); - return 0; + return; } start_time = jiffies; @@ -1089,18 +1139,18 @@ int drbd_make_request(struct request_queue *q, struct bio *bio) /* * what we "blindly" assume: */ - D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); - D_ASSERT(bio->bi_idx == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ s_enr = bio->bi_sector >> HT_SHIFT; - e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; + e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; if (likely(s_enr == e_enr)) { - inc_ap_bio(mdev, 1); - return drbd_make_request_common(mdev, bio, start_time); + do { + inc_ap_bio(mdev, 1); + } while (drbd_make_request_common(mdev, bio, start_time)); + return; } /* can this bio be split generically? @@ -1148,7 +1198,6 @@ int drbd_make_request(struct request_queue *q, struct bio *bio) bio_pair_release(bp); } - return 0; } /* This is called by bio_add_page(). With this function we reduce @@ -1196,36 +1245,66 @@ void request_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; struct drbd_request *req; /* oldest request */ struct list_head *le; - unsigned long et = 0; /* effective timeout = ko_count * timeout */ + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; if (get_net_conf(mdev)) { - et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; + if (mdev->state.conn >= C_WF_REPORT_PARAMS) + ent = mdev->net_conf->timeout*HZ/10 + * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) + if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = mdev->ldev->dc.disk_timeout * HZ / 10; + put_ldev(mdev); + } + et = min_not_zero(dt, ent); + + if (!et) return; /* Recurring timer stopped */ + now = jiffies; + spin_lock_irq(&mdev->req_lock); le = &mdev->oldest_tle->requests; if (list_empty(le)) { spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, jiffies + et); + mod_timer(&mdev->request_timer, now + et); return; } le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - if (time_is_before_eq_jiffies(req->start_time + et)) { - if (req->rq_state & RQ_NET_PENDING) { - dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL); - } else { - dev_warn(DEV, "Local backing block device frozen?\n"); - mod_timer(&mdev->request_timer, jiffies + et); - } - } else { - mod_timer(&mdev->request_timer, req->start_time + et); - } + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + if (ent && req->rq_state & RQ_NET_PENDING && + time_after(now, req->start_time + ent) && + !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + } + if (dt && req->rq_state & RQ_LOCAL_PENDING && + time_after(now, req->start_time + dt) && + !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); + } + nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); + mod_timer(&mdev->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 68a234a5fdc5..3d2111919486 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -105,6 +105,7 @@ enum drbd_req_event { read_completed_with_error, read_ahead_completed_with_error, write_completed_with_error, + abort_disk_io, completed_ok, resend, fail_frozen_disk_io, @@ -118,18 +119,21 @@ enum drbd_req_event { * same time, so we should hold the request lock anyways. */ enum drbd_req_state_bits { - /* 210 - * 000: no local possible - * 001: to be submitted + /* 3210 + * 0000: no local possible + * 0001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 110: completed ok - * 010: completed with error + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, - /* 76543 + /* 87654 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -199,8 +203,9 @@ enum drbd_req_state_bits { #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d3e6f6213ba..6bce2cc179d4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -70,11 +70,29 @@ rwlock_t global_state_lock; void drbd_md_io_complete(struct bio *bio, int error) { struct drbd_md_io *md_io; + struct drbd_conf *mdev; md_io = (struct drbd_md_io *)bio->bi_private; + mdev = container_of(md_io, struct drbd_conf, md_io); + md_io->error = error; - complete(&md_io->event); + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(mdev); + md_io->done = 1; + wake_up(&mdev->misc_wait); + bio_put(bio); + put_ldev(mdev); } /* reads on behalf of the partner, @@ -93,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); if (test_bit(__EE_WAS_ERROR, &e->flags)) - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); spin_unlock_irqrestore(&mdev->req_lock, flags); drbd_queue_work(&mdev->data.work, &e->w); @@ -136,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo : list_empty(&mdev->active_ee); if (test_bit(__EE_WAS_ERROR, &e->flags)) - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); spin_unlock_irqrestore(&mdev->req_lock, flags); if (is_syncer_req) @@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error) spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); spin_unlock_irqrestore(&mdev->req_lock, flags); + put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); @@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); crypto_hash_update(&desc, &sg, sg.length); } @@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); return 1; } @@ -1482,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) return; } - if (mdev->state.conn < C_AHEAD) { - /* In case a previous resync run was aborted by an IO error/detach on the peer. */ - drbd_rs_cancel_all(mdev); - /* This should be done when we abort the resync. We definitely do not - want to have this for connections going back and forth between - Ahead/Behind and SyncSource/SyncTarget */ - } - if (side == C_SYNC_TARGET) { /* Since application IO was locked out during C_WF_BITMAP_T and C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET @@ -1519,14 +1530,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } drbd_state_lock(mdev); - + write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + write_unlock_irq(&global_state_lock); drbd_state_unlock(mdev); return; } - write_lock_irq(&global_state_lock); - ns = mdev->state; + ns.i = mdev->state.i; ns.aftr_isp = !_drbd_may_sync_now(mdev); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 9955a53733b2..a7d6347aaa79 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -188,10 +188,10 @@ static int print_unex = 1; #include <linux/init.h> #include <linux/platform_device.h> #include <linux/mod_devicetable.h> -#include <linux/buffer_head.h> /* for invalidate_buffers() */ #include <linux/mutex.h> #include <linux/io.h> #include <linux/uaccess.h> +#include <linux/async.h> /* * PS/2 floppies have much slower step rates than regular floppies. @@ -203,7 +203,6 @@ static int slow_floppy; #include <asm/dma.h> #include <asm/irq.h> -#include <asm/system.h> static int FLOPPY_IRQ = 6; static int FLOPPY_DMA = 2; @@ -553,7 +552,7 @@ static void floppy_ready(void); static void floppy_start(void); static void process_fd_request(void); static void recalibrate_floppy(void); -static void floppy_shutdown(unsigned long); +static void floppy_shutdown(struct work_struct *); static int floppy_request_regions(int); static void floppy_release_regions(int); @@ -590,6 +589,8 @@ static int buffer_max = -1; static struct floppy_fdc_state fdc_state[N_FDC]; static int fdc; /* current fdc */ +static struct workqueue_struct *floppy_wq; + static struct floppy_struct *_floppy = floppy_type; static unsigned char current_drive; static long current_count_sectors; @@ -631,16 +632,15 @@ static inline void set_debugt(void) { } static inline void debugt(const char *func, const char *msg) { } #endif /* DEBUGT */ -typedef void (*timeout_fn)(unsigned long); -static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); +static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown); static const char *timeout_message; static void is_alive(const char *func, const char *message) { /* this routine checks whether the floppy driver is "alive" */ if (test_bit(0, &fdc_busy) && command_status < 2 && - !timer_pending(&fd_timeout)) { + !delayed_work_pending(&fd_timeout)) { DPRINT("%s: timeout handler died. %s\n", func, message); } } @@ -668,15 +668,19 @@ static int output_log_pos; static void __reschedule_timeout(int drive, const char *message) { + unsigned long delay; + if (drive == current_reqD) drive = current_drive; - del_timer(&fd_timeout); + __cancel_delayed_work(&fd_timeout); + if (drive < 0 || drive >= N_DRIVE) { - fd_timeout.expires = jiffies + 20UL * HZ; + delay = 20UL * HZ; drive = 0; } else - fd_timeout.expires = jiffies + UDP->timeout; - add_timer(&fd_timeout); + delay = UDP->timeout; + + queue_delayed_work(floppy_wq, &fd_timeout, delay); if (UDP->flags & FD_DEBUG) DPRINT("reschedule timeout %s\n", message); timeout_message = message; @@ -874,7 +878,7 @@ static int lock_fdc(int drive, bool interruptible) command_status = FD_COMMAND_NONE; - __reschedule_timeout(drive, "lock fdc"); + reschedule_timeout(drive, "lock fdc"); set_fdc(drive); return 0; } @@ -882,23 +886,15 @@ static int lock_fdc(int drive, bool interruptible) /* unlocks the driver */ static void unlock_fdc(void) { - unsigned long flags; - - raw_cmd = NULL; if (!test_bit(0, &fdc_busy)) DPRINT("FDC access conflict!\n"); - if (do_floppy) - DPRINT("device interrupt still active at FDC release: %pf!\n", - do_floppy); + raw_cmd = NULL; command_status = FD_COMMAND_NONE; - spin_lock_irqsave(&floppy_lock, flags); - del_timer(&fd_timeout); + __cancel_delayed_work(&fd_timeout); + do_floppy = NULL; cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || set_next_request()) - do_fd_request(current_req->q); - spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); } @@ -970,26 +966,24 @@ static DECLARE_WORK(floppy_work, NULL); static void schedule_bh(void (*handler)(void)) { + WARN_ON(work_pending(&floppy_work)); + PREPARE_WORK(&floppy_work, (work_func_t)handler); - schedule_work(&floppy_work); + queue_work(floppy_wq, &floppy_work); } -static DEFINE_TIMER(fd_timer, NULL, 0, 0); +static DECLARE_DELAYED_WORK(fd_timer, NULL); static void cancel_activity(void) { - unsigned long flags; - - spin_lock_irqsave(&floppy_lock, flags); do_floppy = NULL; - PREPARE_WORK(&floppy_work, (work_func_t)empty); - del_timer(&fd_timer); - spin_unlock_irqrestore(&floppy_lock, flags); + cancel_delayed_work_sync(&fd_timer); + cancel_work_sync(&floppy_work); } /* this function makes sure that the disk stays in the drive during the * transfer */ -static void fd_watchdog(void) +static void fd_watchdog(struct work_struct *arg) { debug_dcl(DP->flags, "calling disk change from watchdog\n"); @@ -999,21 +993,20 @@ static void fd_watchdog(void) cont->done(0); reset_fdc(); } else { - del_timer(&fd_timer); - fd_timer.function = (timeout_fn)fd_watchdog; - fd_timer.expires = jiffies + HZ / 10; - add_timer(&fd_timer); + cancel_delayed_work(&fd_timer); + PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog); + queue_delayed_work(floppy_wq, &fd_timer, HZ / 10); } } static void main_command_interrupt(void) { - del_timer(&fd_timer); + cancel_delayed_work(&fd_timer); cont->interrupt(); } /* waits for a delay (spinup or select) to pass */ -static int fd_wait_for_completion(unsigned long delay, timeout_fn function) +static int fd_wait_for_completion(unsigned long expires, work_func_t function) { if (FDCS->reset) { reset_fdc(); /* do the reset during sleep to win time @@ -1022,47 +1015,15 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function) return 1; } - if (time_before(jiffies, delay)) { - del_timer(&fd_timer); - fd_timer.function = function; - fd_timer.expires = delay; - add_timer(&fd_timer); + if (time_before(jiffies, expires)) { + cancel_delayed_work(&fd_timer); + PREPARE_DELAYED_WORK(&fd_timer, function); + queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies); return 1; } return 0; } -static DEFINE_SPINLOCK(floppy_hlt_lock); -static int hlt_disabled; -static void floppy_disable_hlt(void) -{ - unsigned long flags; - - WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012"); - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (!hlt_disabled) { - hlt_disabled = 1; -#ifdef HAVE_DISABLE_HLT - disable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - -static void floppy_enable_hlt(void) -{ - unsigned long flags; - - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (hlt_disabled) { - hlt_disabled = 0; -#ifdef HAVE_DISABLE_HLT - enable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - static void setup_DMA(void) { unsigned long f; @@ -1107,7 +1068,6 @@ static void setup_DMA(void) fd_enable_dma(); release_dma_lock(f); #endif - floppy_disable_hlt(); } static void show_floppy(void); @@ -1376,7 +1336,7 @@ static int fdc_dtr(void) */ FDCS->dtr = raw_cmd->rate & 3; return fd_wait_for_completion(jiffies + 2UL * HZ / 100, - (timeout_fn)floppy_ready); + (work_func_t)floppy_ready); } /* fdc_dtr */ static void tell_sector(void) @@ -1481,7 +1441,7 @@ static void setup_rw_floppy(void) int flags; int dflags; unsigned long ready_date; - timeout_fn function; + work_func_t function; flags = raw_cmd->flags; if (flags & (FD_RAW_READ | FD_RAW_WRITE)) @@ -1495,9 +1455,9 @@ static void setup_rw_floppy(void) */ if (time_after(ready_date, jiffies + DP->select_delay)) { ready_date -= DP->select_delay; - function = (timeout_fn)floppy_start; + function = (work_func_t)floppy_start; } else - function = (timeout_fn)setup_rw_floppy; + function = (work_func_t)setup_rw_floppy; /* wait until the floppy is spinning fast enough */ if (fd_wait_for_completion(ready_date, function)) @@ -1527,7 +1487,7 @@ static void setup_rw_floppy(void) inr = result(); cont->interrupt(); } else if (flags & FD_RAW_NEED_DISK) - fd_watchdog(); + fd_watchdog(NULL); } static int blind_seek; @@ -1709,7 +1669,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) fd_disable_dma(); release_dma_lock(f); - floppy_enable_hlt(); do_floppy = NULL; if (fdc >= N_FDC || FDCS->address == -1) { /* we don't even know which FDC is the culprit */ @@ -1837,20 +1796,22 @@ static void show_floppy(void) pr_info("do_floppy=%pf\n", do_floppy); if (work_pending(&floppy_work)) pr_info("floppy_work.func=%pf\n", floppy_work.func); - if (timer_pending(&fd_timer)) - pr_info("fd_timer.function=%pf\n", fd_timer.function); - if (timer_pending(&fd_timeout)) { - pr_info("timer_function=%pf\n", fd_timeout.function); - pr_info("expires=%lu\n", fd_timeout.expires - jiffies); - pr_info("now=%lu\n", jiffies); - } + if (delayed_work_pending(&fd_timer)) + pr_info("delayed work.function=%p expires=%ld\n", + fd_timer.work.func, + fd_timer.timer.expires - jiffies); + if (delayed_work_pending(&fd_timeout)) + pr_info("timer_function=%p expires=%ld\n", + fd_timeout.work.func, + fd_timeout.timer.expires - jiffies); + pr_info("cont=%p\n", cont); pr_info("current_req=%p\n", current_req); pr_info("command_status=%d\n", command_status); pr_info("\n"); } -static void floppy_shutdown(unsigned long data) +static void floppy_shutdown(struct work_struct *arg) { unsigned long flags; @@ -1858,8 +1819,6 @@ static void floppy_shutdown(unsigned long data) show_floppy(); cancel_activity(); - floppy_enable_hlt(); - flags = claim_dma_lock(); fd_disable_dma(); release_dma_lock(flags); @@ -1905,7 +1864,7 @@ static int start_motor(void (*function)(void)) /* wait_for_completion also schedules reset if needed. */ return fd_wait_for_completion(DRS->select_date + DP->select_delay, - (timeout_fn)function); + (work_func_t)function); } static void floppy_ready(void) @@ -2558,8 +2517,7 @@ static int make_raw_rw_request(void) set_fdc((long)current_req->rq_disk->private_data); raw_cmd = &default_raw_cmd; - raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_DISK | - FD_RAW_NEED_SEEK; + raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; raw_cmd->cmd_count = NR_RW; if (rq_data_dir(current_req) == READ) { raw_cmd->flags |= FD_RAW_READ; @@ -2858,7 +2816,6 @@ do_request: spin_lock_irq(&floppy_lock); pending = set_next_request(); spin_unlock_irq(&floppy_lock); - if (!pending) { do_floppy = NULL; unlock_fdc(); @@ -2935,13 +2892,15 @@ static void do_fd_request(struct request_queue *q) current_req->cmd_flags)) return; - if (test_bit(0, &fdc_busy)) { + if (test_and_set_bit(0, &fdc_busy)) { /* fdc busy, this new request will be treated when the current one is done */ is_alive(__func__, "old request running"); return; } - lock_fdc(MAXTIMEOUT, false); + command_status = FD_COMMAND_NONE; + __reschedule_timeout(MAXTIMEOUT, "fd_request"); + set_fdc(0); process_fd_request(); is_alive(__func__, ""); } @@ -3649,9 +3608,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) mutex_lock(&floppy_mutex); mutex_lock(&open_lock); - if (UDRS->fd_ref < 0) - UDRS->fd_ref = 0; - else if (!UDRS->fd_ref--) { + if (!UDRS->fd_ref--) { DPRINT("floppy_release with fd_ref == 0"); UDRS->fd_ref = 0; } @@ -3687,13 +3644,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) set_bit(FD_VERIFY_BIT, &UDRS->flags); } - if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) - goto out2; - - if (mode & FMODE_EXCL) - UDRS->fd_ref = -1; - else - UDRS->fd_ref++; + UDRS->fd_ref++; opened_bdev[drive] = bdev; @@ -3756,10 +3707,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&floppy_mutex); return 0; out: - if (UDRS->fd_ref < 0) - UDRS->fd_ref = 0; - else - UDRS->fd_ref--; + UDRS->fd_ref--; + if (!UDRS->fd_ref) opened_bdev[drive] = NULL; out2: @@ -3833,7 +3782,7 @@ static int __floppy_read_block_0(struct block_device *bdev) bio.bi_size = size; bio.bi_bdev = bdev; bio.bi_sector = 0; - bio.bi_flags = BIO_QUIET; + bio.bi_flags = (1 << BIO_QUIET); init_completion(&complete); bio.bi_private = &complete; bio.bi_end_io = floppy_rb0_complete; @@ -4174,7 +4123,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) return get_disk(disks[drive]); } -static int __init floppy_init(void) +static int __init do_floppy_init(void) { int i, unit, drive; int err, dr; @@ -4196,10 +4145,16 @@ static int __init floppy_init(void) goto out_put_disk; } + floppy_wq = alloc_ordered_workqueue("floppy", 0); + if (!floppy_wq) { + err = -ENOMEM; + goto out_put_disk; + } + disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); if (!disks[dr]->queue) { err = -ENOMEM; - goto out_put_disk; + goto out_destroy_workq; } blk_queue_max_hw_sectors(disks[dr]->queue, 64); @@ -4250,7 +4205,7 @@ static int __init floppy_init(void) use_virtual_dma = can_use_virtual_dma & 1; fdc_state[0].address = FDC1; if (fdc_state[0].address == -1) { - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); err = -ENODEV; goto out_unreg_region; } @@ -4261,7 +4216,7 @@ static int __init floppy_init(void) fdc = 0; /* reset fdc in case of unexpected interrupt */ err = floppy_grab_irq_and_dma(); if (err) { - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); err = -EBUSY; goto out_unreg_region; } @@ -4318,13 +4273,13 @@ static int __init floppy_init(void) user_reset_fdc(-1, FD_RESET_ALWAYS, false); } fdc = 0; - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); current_drive = 0; initialized = true; if (have_no_fdc) { DPRINT("no floppy controllers found\n"); err = have_no_fdc; - goto out_flush_work; + goto out_release_dma; } for (drive = 0; drive < N_DRIVE; drive++) { @@ -4339,7 +4294,7 @@ static int __init floppy_init(void) err = platform_device_register(&floppy_device[drive]); if (err) - goto out_flush_work; + goto out_release_dma; err = device_create_file(&floppy_device[drive].dev, &dev_attr_cmos); @@ -4357,25 +4312,50 @@ static int __init floppy_init(void) out_unreg_platform_dev: platform_device_unregister(&floppy_device[drive]); -out_flush_work: - flush_work_sync(&floppy_work); +out_release_dma: if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); platform_driver_unregister(&floppy_driver); +out_destroy_workq: + destroy_workqueue(floppy_wq); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: while (dr--) { del_timer_sync(&motor_off_timer[dr]); - if (disks[dr]->queue) + if (disks[dr]->queue) { blk_cleanup_queue(disks[dr]->queue); + /* + * put_disk() is not paired with add_disk() and + * will put queue reference one extra time. fix it. + */ + disks[dr]->queue = NULL; + } put_disk(disks[dr]); } return err; } +#ifndef MODULE +static __init void floppy_async_init(void *data, async_cookie_t cookie) +{ + do_floppy_init(); +} +#endif + +static int __init floppy_init(void) +{ +#ifdef MODULE + return do_floppy_init(); +#else + /* Don't hold up the bootup by the floppy initialization */ + async_schedule(floppy_async_init, NULL); + return 0; +#endif +} + static const struct io_region { int offset; int size; @@ -4428,7 +4408,7 @@ static int floppy_grab_irq_and_dma(void) * We might have scheduled a free_irq(), wait it to * drain first: */ - flush_work_sync(&floppy_work); + flush_workqueue(floppy_wq); if (fd_request_irq()) { DPRINT("Unable to grab IRQ%d for the floppy driver\n", @@ -4504,7 +4484,6 @@ static void floppy_release_irq_and_dma(void) #if N_FDC > 1 set_dor(1, ~8, 0); #endif - floppy_enable_hlt(); if (floppy_track_buffer && max_buffer_sectors) { tmpsize = max_buffer_sectors * 1024; @@ -4520,9 +4499,9 @@ static void floppy_release_irq_and_dma(void) pr_info("motor off timer %d still active\n", drive); #endif - if (timer_pending(&fd_timeout)) + if (delayed_work_pending(&fd_timeout)) pr_info("floppy timer still active:%s\n", timeout_message); - if (timer_pending(&fd_timer)) + if (delayed_work_pending(&fd_timer)) pr_info("auxiliary floppy timer still active\n"); if (work_pending(&floppy_work)) pr_info("work still pending\n"); @@ -4580,11 +4559,21 @@ static void __exit floppy_module_exit(void) platform_device_unregister(&floppy_device[drive]); } blk_cleanup_queue(disks[drive]->queue); + + /* + * These disks have not called add_disk(). Don't put down + * queue reference in put_disk(). + */ + if (!(allowed_drive_mask & (1 << drive)) || + fdc_state[FDC(drive)].version == FDC_NONE) + disks[drive]->queue = NULL; + put_disk(disks[drive]); } - del_timer_sync(&fd_timeout); - del_timer_sync(&fd_timer); + cancel_delayed_work_sync(&fd_timeout); + cancel_delayed_work_sync(&fd_timer); + destroy_workqueue(floppy_wq); if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); diff --git a/drivers/block/hd.c b/drivers/block/hd.c index b52c9ca146fc..bf397bf108b7 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -44,7 +44,6 @@ #define HD_IRQ 14 #define REALLY_SLOW_IO -#include <asm/system.h> #include <asm/io.h> #include <asm/uaccess.h> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4720c7ade0ae..3bba65510d23 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -69,13 +69,14 @@ #include <linux/freezer.h> #include <linux/mutex.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> /* for invalidate_bdev() */ #include <linux/completion.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> +#include <linux/falloc.h> + #include <asm/uaccess.h> static DEFINE_IDR(loop_index_idr); @@ -92,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd, struct page *loop_page, unsigned loop_off, int size, sector_t real_block) { - char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; - char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; + char *raw_buf = kmap_atomic(raw_page) + raw_off; + char *loop_buf = kmap_atomic(loop_page) + loop_off; if (cmd == READ) memcpy(loop_buf, raw_buf, size); else memcpy(raw_buf, loop_buf, size); - kunmap_atomic(loop_buf, KM_USER1); - kunmap_atomic(raw_buf, KM_USER0); + kunmap_atomic(loop_buf); + kunmap_atomic(raw_buf); cond_resched(); return 0; } @@ -111,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd, struct page *loop_page, unsigned loop_off, int size, sector_t real_block) { - char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; - char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; + char *raw_buf = kmap_atomic(raw_page) + raw_off; + char *loop_buf = kmap_atomic(loop_page) + loop_off; char *in, *out, *key; int i, keysize; @@ -129,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd, for (i = 0; i < size; i++) *out++ = *in++ ^ key[(i & 511) % keysize]; - kunmap_atomic(loop_buf, KM_USER1); - kunmap_atomic(raw_buf, KM_USER0); + kunmap_atomic(loop_buf); + kunmap_atomic(raw_buf); cond_resched(); return 0; } @@ -159,17 +160,19 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { &xor_funcs }; -static loff_t get_loop_size(struct loop_device *lo, struct file *file) +static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) { - loff_t size, offset, loopsize; + loff_t size, loopsize; /* Compute loopsize in bytes */ size = i_size_read(file->f_mapping->host); - offset = lo->lo_offset; loopsize = size - offset; - if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) - loopsize = lo->lo_sizelimit; + /* offset is beyond i_size, wierd but possible */ + if (loopsize < 0) + return 0; + if (sizelimit > 0 && sizelimit < loopsize) + loopsize = sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. @@ -177,17 +180,25 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) return loopsize >> 9; } +static loff_t get_loop_size(struct loop_device *lo, struct file *file) +{ + return get_size(lo->lo_offset, lo->lo_sizelimit, file); +} + static int -figure_loop_size(struct loop_device *lo) +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { - loff_t size = get_loop_size(lo, lo->lo_backing_file); + loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; if (unlikely((loff_t)x != size)) return -EFBIG; - + if (lo->lo_offset != offset) + lo->lo_offset = offset; + if (lo->lo_sizelimit != sizelimit) + lo->lo_sizelimit = sizelimit; set_capacity(lo->lo_disk, x); - return 0; + return 0; } static inline int @@ -203,74 +214,6 @@ lo_do_transfer(struct loop_device *lo, int cmd, } /** - * do_lo_send_aops - helper for writing data to a loop device - * - * This is the fast version for backing filesystems which implement the address - * space operations write_begin and write_end. - */ -static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, - loff_t pos, struct page *unused) -{ - struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ - struct address_space *mapping = file->f_mapping; - pgoff_t index; - unsigned offset, bv_offs; - int len, ret; - - mutex_lock(&mapping->host->i_mutex); - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); - bv_offs = bvec->bv_offset; - len = bvec->bv_len; - while (len > 0) { - sector_t IV; - unsigned size, copied; - int transfer_result; - struct page *page; - void *fsdata; - - IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); - size = PAGE_CACHE_SIZE - offset; - if (size > len) - size = len; - - ret = pagecache_write_begin(file, mapping, pos, size, 0, - &page, &fsdata); - if (ret) - goto fail; - - file_update_time(file); - - transfer_result = lo_do_transfer(lo, WRITE, page, offset, - bvec->bv_page, bv_offs, size, IV); - copied = size; - if (unlikely(transfer_result)) - copied = 0; - - ret = pagecache_write_end(file, mapping, pos, size, copied, - page, fsdata); - if (ret < 0 || ret != copied) - goto fail; - - if (unlikely(transfer_result)) - goto fail; - - bv_offs += copied; - len -= copied; - offset = 0; - index++; - pos += copied; - } - ret = 0; -out: - mutex_unlock(&mapping->host->i_mutex); - return ret; -fail: - ret = -1; - goto out; -} - -/** * __do_lo_send_write - helper for writing data to a loop device * * This helper just factors out common code between do_lo_send_direct_write() @@ -297,10 +240,8 @@ static int __do_lo_send_write(struct file *file, /** * do_lo_send_direct_write - helper for writing data to a loop device * - * This is the fast, non-transforming version for backing filesystems which do - * not implement the address space operations write_begin and write_end. - * It uses the write file operation which should be present on all writeable - * filesystems. + * This is the fast, non-transforming version that does not need double + * buffering. */ static int do_lo_send_direct_write(struct loop_device *lo, struct bio_vec *bvec, loff_t pos, struct page *page) @@ -316,15 +257,9 @@ static int do_lo_send_direct_write(struct loop_device *lo, /** * do_lo_send_write - helper for writing data to a loop device * - * This is the slow, transforming version for filesystems which do not - * implement the address space operations write_begin and write_end. It - * uses the write file operation which should be present on all writeable - * filesystems. - * - * Using fops->write is slower than using aops->{prepare,commit}_write in the - * transforming case because we need to double buffer the data as we cannot do - * the transformations in place as we do not have direct access to the - * destination pages of the backing file. + * This is the slow, transforming version that needs to double buffer the + * data as it cannot do the transformations in place without having direct + * access to the destination pages of the backing file. */ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, loff_t pos, struct page *page) @@ -350,17 +285,16 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) struct page *page = NULL; int i, ret = 0; - do_lo_send = do_lo_send_aops; - if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) { + if (lo->transfer != transfer_none) { + page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + if (unlikely(!page)) + goto fail; + kmap(page); + do_lo_send = do_lo_send_write; + } else { do_lo_send = do_lo_send_direct_write; - if (lo->transfer != transfer_none) { - page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); - if (unlikely(!page)) - goto fail; - kmap(page); - do_lo_send = do_lo_send_write; - } } + bio_for_each_segment(bvec, bio, i) { ret = do_lo_send(lo, bvec, pos, page); if (ret < 0) @@ -422,14 +356,14 @@ lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) return __splice_from_pipe(pipe, sd, lo_splice_actor); } -static int +static ssize_t do_lo_receive(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) { struct lo_read_data cookie; struct splice_desc sd; struct file *file; - long retval; + ssize_t retval; cookie.lo = lo; cookie.page = bvec->bv_page; @@ -445,25 +379,28 @@ do_lo_receive(struct loop_device *lo, file = lo->lo_backing_file; retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor); - if (retval < 0) - return retval; - - return 0; + return retval; } static int lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) { struct bio_vec *bvec; - int i, ret = 0; + ssize_t s; + int i; bio_for_each_segment(bvec, bio, i) { - ret = do_lo_receive(lo, bvec, bsize, pos); - if (ret < 0) + s = do_lo_receive(lo, bvec, bsize, pos); + if (s < 0) + return s; + + if (s != bvec->bv_len) { + zero_fill_bio(bio); break; + } pos += bvec->bv_len; } - return ret; + return 0; } static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) @@ -484,6 +421,29 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) } } + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + if (bio->bi_rw & REQ_DISCARD) { + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + if ((!file->f_op->fallocate) || + lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } + ret = file->f_op->fallocate(file, mode, pos, + bio->bi_size); + if (unlikely(ret && ret != -EINVAL && + ret != -EOPNOTSUPP)) + ret = -EIO; + goto out; + } + ret = lo_send(lo, bio, pos); if ((bio->bi_rw & REQ_FUA) && !ret) { @@ -514,7 +474,7 @@ static struct bio *loop_get_bio(struct loop_device *lo) return bio_list_pop(&lo->lo_bio_list); } -static int loop_make_request(struct request_queue *q, struct bio *old_bio) +static void loop_make_request(struct request_queue *q, struct bio *old_bio) { struct loop_device *lo = q->queuedata; int rw = bio_rw(old_bio); @@ -532,12 +492,11 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio) loop_add_bio(lo, old_bio); wake_up(&lo->lo_event); spin_unlock_irq(&lo->lo_lock); - return 0; + return; out: spin_unlock_irq(&lo->lo_lock); bio_io_error(old_bio); - return 0; } struct switch_request { @@ -700,7 +659,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_putf; fput(old_file); - if (max_part > 0) + if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); return 0; @@ -777,16 +736,25 @@ static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) return sprintf(buf, "%s\n", autoclear ? "1" : "0"); } +static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) +{ + int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); + + return sprintf(buf, "%s\n", partscan ? "1" : "0"); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); +LOOP_ATTR_RO(partscan); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, &loop_attr_offset.attr, &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, + &loop_attr_partscan.attr, NULL, }; @@ -807,6 +775,35 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } +static void loop_config_discard(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct request_queue *q = lo->lo_queue; + + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + if ((!file->f_op->fallocate) || + lo->lo_encrypt_key_size) { + q->limits.discard_granularity = 0; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = 0; + q->limits.discard_zeroes_data = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + return; + } + + q->limits.discard_granularity = inode->i_sb->s_blocksize; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = UINT_MAX >> 9; + q->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); +} + static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { @@ -849,35 +846,23 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping = file->f_mapping; inode = mapping->host; - if (!(file->f_mode & FMODE_WRITE)) - lo_flags |= LO_FLAGS_READ_ONLY; - error = -EINVAL; - if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { - const struct address_space_operations *aops = mapping->a_ops; - - if (aops->write_begin) - lo_flags |= LO_FLAGS_USE_AOPS; - if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) - lo_flags |= LO_FLAGS_READ_ONLY; + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + goto out_putf; - lo_blocksize = S_ISBLK(inode->i_mode) ? - inode->i_bdev->bd_block_size : PAGE_SIZE; + if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + !file->f_op->write) + lo_flags |= LO_FLAGS_READ_ONLY; - error = 0; - } else { - goto out_putf; - } + lo_blocksize = S_ISBLK(inode->i_mode) ? + inode->i_bdev->bd_block_size : PAGE_SIZE; + error = -EFBIG; size = get_loop_size(lo, file); - - if ((loff_t)(sector_t)size != size) { - error = -EFBIG; + if ((loff_t)(sector_t)size != size) goto out_putf; - } - if (!(mode & FMODE_WRITE)) - lo_flags |= LO_FLAGS_READ_ONLY; + error = 0; set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); @@ -919,7 +904,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, } lo->lo_state = Lo_bound; wake_up_process(lo->lo_thread); - if (max_part > 0) + if (part_shift) + lo->lo_flags |= LO_FLAGS_PARTSCAN; + if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); return 0; @@ -980,10 +967,11 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, return err; } -static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) +static int loop_clr_fd(struct loop_device *lo) { struct file *filp = lo->lo_backing_file; gfp_t gfp = lo->old_gfp_mask; + struct block_device *bdev = lo->lo_device; if (lo->lo_state != Lo_bound) return -ENXIO; @@ -1012,7 +1000,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_flags = 0; lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); @@ -1030,8 +1017,11 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (max_part > 0 && bdev) + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) ioctl_by_bdev(bdev, BLKRRPART, 0); + lo->lo_flags = 0; + if (!part_shift) + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; mutex_unlock(&lo->lo_ctl_mutex); /* * Need not hold lo_ctl_mutex to fput backing file. @@ -1080,11 +1070,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { - lo->lo_offset = info->lo_offset; - lo->lo_sizelimit = info->lo_sizelimit; - if (figure_loop_size(lo)) + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) return -EFBIG; } + loop_config_discard(lo); memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); @@ -1100,6 +1089,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) (info->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; + if ((info->lo_flags & LO_FLAGS_PARTSCAN) && + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { + lo->lo_flags |= LO_FLAGS_PARTSCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + } + lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; lo->lo_init[1] = info->lo_init[1]; @@ -1260,7 +1256,7 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) err = -ENXIO; if (unlikely(lo->lo_state != Lo_bound)) goto out; - err = figure_loop_size(lo); + err = figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); if (unlikely(err)) goto out; sec = get_capacity(lo->lo_disk); @@ -1293,18 +1289,24 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, break; case LOOP_CLR_FD: /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ - err = loop_clr_fd(lo, bdev); + err = loop_clr_fd(lo); if (!err) goto out_unlocked; break; case LOOP_SET_STATUS: - err = loop_set_status_old(lo, (struct loop_info __user *) arg); + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_status_old(lo, + (struct loop_info __user *)arg); break; case LOOP_GET_STATUS: err = loop_get_status_old(lo, (struct loop_info __user *) arg); break; case LOOP_SET_STATUS64: - err = loop_set_status64(lo, (struct loop_info64 __user *) arg); + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_status64(lo, + (struct loop_info64 __user *) arg); break; case LOOP_GET_STATUS64: err = loop_get_status64(lo, (struct loop_info64 __user *) arg); @@ -1513,7 +1515,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode) * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - err = loop_clr_fd(lo, NULL); + err = loop_clr_fd(lo); if (!err) goto out_unlocked; } else { @@ -1595,14 +1597,12 @@ static int loop_add(struct loop_device **l, int i) struct gendisk *disk; int err; + err = -ENOMEM; lo = kzalloc(sizeof(*lo), GFP_KERNEL); - if (!lo) { - err = -ENOMEM; + if (!lo) goto out; - } - err = idr_pre_get(&loop_index_idr, GFP_KERNEL); - if (err < 0) + if (!idr_pre_get(&loop_index_idr, GFP_KERNEL)) goto out_free_dev; if (i >= 0) { @@ -1635,6 +1635,27 @@ static int loop_add(struct loop_device **l, int i) if (!disk) goto out_free_queue; + /* + * Disable partition scanning by default. The in-kernel partition + * scanning can be requested individually per-device during its + * setup. Userspace can always add and remove partitions from all + * devices. The needed partition minors are allocated from the + * extended minor space, the main loop device numbers will continue + * to match the loop minors, regardless of the number of partitions + * used. + * + * If max_part is given, partition scanning is globally enabled for + * all loop devices. The minors for the main loop devices will be + * multiples of max_part. + * + * Note: Global-for-all-devices, set-only-at-init, read-only module + * parameteters like 'max_loop' and 'max_part' make things needlessly + * complicated, are too static, inflexible and may surprise + * userspace tools. Parameters like this in general should be avoided. + */ + if (!part_shift) + disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; lo->lo_thread = NULL; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 76fa3deaee84..1788f491e0fb 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -780,9 +780,9 @@ static const struct block_device_operations mg_disk_ops = { .getgeo = mg_getgeo }; -static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) +static int mg_suspend(struct device *dev) { - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_drv_data *prv_data = dev->platform_data; struct mg_host *host = prv_data->host; if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) @@ -804,9 +804,9 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) return 0; } -static int mg_resume(struct platform_device *plat_dev) +static int mg_resume(struct device *dev) { - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_drv_data *prv_data = dev->platform_data; struct mg_host *host = prv_data->host; if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) @@ -825,6 +825,8 @@ static int mg_resume(struct platform_device *plat_dev) return 0; } +static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); + static int mg_probe(struct platform_device *plat_dev) { struct mg_host *host; @@ -1074,11 +1076,10 @@ static int mg_remove(struct platform_device *plat_dev) static struct platform_driver mg_disk_driver = { .probe = mg_probe, .remove = mg_remove, - .suspend = mg_suspend, - .resume = mg_resume, .driver = { .name = MG_DEV_NAME, .owner = THIS_MODULE, + .pm = &mg_pm, } }; diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig new file mode 100644 index 000000000000..0ba837fc62a8 --- /dev/null +++ b/drivers/block/mtip32xx/Kconfig @@ -0,0 +1,9 @@ +# +# mtip32xx device driver configuration +# + +config BLK_DEV_PCIESSD_MTIP32XX + tristate "Block Device Driver for Micron PCIe SSDs" + depends on PCI + help + This enables the block driver for Micron PCIe SSDs. diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile new file mode 100644 index 000000000000..4fbef8c8329b --- /dev/null +++ b/drivers/block/mtip32xx/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for Block device driver for Micron PCIe SSD +# + +obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c new file mode 100644 index 000000000000..f946d31d6917 --- /dev/null +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -0,0 +1,4266 @@ +/* + * Driver for the Micron P320 SSD + * Copyright (C) 2011 Micron Technology, Inc. + * + * Portions of this code were derived from works subjected to the + * following copyright: + * Copyright (C) 2009 Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/ata.h> +#include <linux/delay.h> +#include <linux/hdreg.h> +#include <linux/uaccess.h> +#include <linux/random.h> +#include <linux/smp.h> +#include <linux/compat.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/dma-mapping.h> +#include <linux/idr.h> +#include <linux/kthread.h> +#include <../drivers/ata/ahci.h> +#include <linux/export.h> +#include <linux/debugfs.h> +#include "mtip32xx.h" + +#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) +#define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16)) +#define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS) +#define HW_PORT_PRIV_DMA_SZ \ + (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ) + +#define HOST_CAP_NZDMA (1 << 19) +#define HOST_HSORG 0xFC +#define HSORG_DISABLE_SLOTGRP_INTR (1<<24) +#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16) +#define HSORG_HWREV 0xFF00 +#define HSORG_STYLE 0x8 +#define HSORG_SLOTGROUPS 0x7 + +#define PORT_COMMAND_ISSUE 0x38 +#define PORT_SDBV 0x7C + +#define PORT_OFFSET 0x100 +#define PORT_MEM_SIZE 0x80 + +#define PORT_IRQ_ERR \ + (PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \ + PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \ + PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \ + PORT_IRQ_OVERFLOW) +#define PORT_IRQ_LEGACY \ + (PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS) +#define PORT_IRQ_HANDLED \ + (PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \ + PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \ + PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY) +#define DEF_PORT_IRQ \ + (PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS) + +/* product numbers */ +#define MTIP_PRODUCT_UNKNOWN 0x00 +#define MTIP_PRODUCT_ASICFPGA 0x11 + +/* Device instance number, incremented each time a device is probed. */ +static int instance; + +/* + * Global variable used to hold the major block device number + * allocated in mtip_init(). + */ +static int mtip_major; +static struct dentry *dfs_parent; + +static DEFINE_SPINLOCK(rssd_index_lock); +static DEFINE_IDA(rssd_index_ida); + +static int mtip_block_initialize(struct driver_data *dd); + +#ifdef CONFIG_COMPAT +struct mtip_compat_ide_task_request_s { + __u8 io_ports[8]; + __u8 hob_ports[8]; + ide_reg_valid_t out_flags; + ide_reg_valid_t in_flags; + int data_phase; + int req_cmd; + compat_ulong_t out_size; + compat_ulong_t in_size; +}; +#endif + +/* + * This function check_for_surprise_removal is called + * while card is removed from the system and it will + * read the vendor id from the configration space + * + * @pdev Pointer to the pci_dev structure. + * + * return value + * true if device removed, else false + */ +static bool mtip_check_surprise_removal(struct pci_dev *pdev) +{ + u16 vendor_id = 0; + + /* Read the vendorID from the configuration space */ + pci_read_config_word(pdev, 0x00, &vendor_id); + if (vendor_id == 0xFFFF) + return true; /* device removed */ + + return false; /* device present */ +} + +/* + * This function is called for clean the pending command in the + * command slot during the surprise removal of device and return + * error to the upper layer. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +static void mtip_command_cleanup(struct driver_data *dd) +{ + int group = 0, commandslot = 0, commandindex = 0; + struct mtip_cmd *command; + struct mtip_port *port = dd->port; + static int in_progress; + + if (in_progress) + return; + + in_progress = 1; + + for (group = 0; group < 4; group++) { + for (commandslot = 0; commandslot < 32; commandslot++) { + if (!(port->allocated[group] & (1 << commandslot))) + continue; + + commandindex = group << 5 | commandslot; + command = &port->commands[commandindex]; + + if (atomic_read(&command->active) + && (command->async_callback)) { + command->async_callback(command->async_data, + -ENODEV); + command->async_callback = NULL; + command->async_data = NULL; + } + + dma_unmap_sg(&port->dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + } + } + + up(&port->cmd_slot); + + set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); + in_progress = 0; +} + +/* + * Obtain an empty command slot. + * + * This function needs to be reentrant since it could be called + * at the same time on multiple CPUs. The allocation of the + * command slot must be atomic. + * + * @port Pointer to the port data structure. + * + * return value + * >= 0 Index of command slot obtained. + * -1 No command slots available. + */ +static int get_slot(struct mtip_port *port) +{ + int slot, i; + unsigned int num_command_slots = port->dd->slot_groups * 32; + + /* + * Try 10 times, because there is a small race here. + * that's ok, because it's still cheaper than a lock. + * + * Race: Since this section is not protected by lock, same bit + * could be chosen by different process contexts running in + * different processor. So instead of costly lock, we are going + * with loop. + */ + for (i = 0; i < 10; i++) { + slot = find_next_zero_bit(port->allocated, + num_command_slots, 1); + if ((slot < num_command_slots) && + (!test_and_set_bit(slot, port->allocated))) + return slot; + } + dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); + + if (mtip_check_surprise_removal(port->dd->pdev)) { + /* Device not present, clean outstanding commands */ + mtip_command_cleanup(port->dd); + } + return -1; +} + +/* + * Release a command slot. + * + * @port Pointer to the port data structure. + * @tag Tag of command to release + * + * return value + * None + */ +static inline void release_slot(struct mtip_port *port, int tag) +{ + smp_mb__before_clear_bit(); + clear_bit(tag, port->allocated); + smp_mb__after_clear_bit(); +} + +/* + * Reset the HBA (without sleeping) + * + * Just like hba_reset, except does not call sleep, so can be + * run from interrupt/tasklet context. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 The reset was successful. + * -1 The HBA Reset bit did not clear. + */ +static int hba_reset_nosleep(struct driver_data *dd) +{ + unsigned long timeout; + + /* Chip quirk: quiesce any chip function */ + mdelay(10); + + /* Set the reset bit */ + writel(HOST_RESET, dd->mmio + HOST_CTL); + + /* Flush */ + readl(dd->mmio + HOST_CTL); + + /* + * Wait 10ms then spin for up to 1 second + * waiting for reset acknowledgement + */ + timeout = jiffies + msecs_to_jiffies(1000); + mdelay(10); + while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) + && time_before(jiffies, timeout)) + mdelay(1); + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) + return -1; + + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) + return -1; + + return 0; +} + +/* + * Issue a command to the hardware. + * + * Set the appropriate bit in the s_active and Command Issue hardware + * registers, causing hardware command processing to begin. + * + * @port Pointer to the port structure. + * @tag The tag of the command to be issued. + * + * return value + * None + */ +static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) +{ + atomic_set(&port->commands[tag].active, 1); + + spin_lock(&port->cmd_issue_lock); + + writel((1 << MTIP_TAG_BIT(tag)), + port->s_active[MTIP_TAG_INDEX(tag)]); + writel((1 << MTIP_TAG_BIT(tag)), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); + + spin_unlock(&port->cmd_issue_lock); + + /* Set the command's timeout value.*/ + port->commands[tag].comp_time = jiffies + msecs_to_jiffies( + MTIP_NCQ_COMMAND_TIMEOUT_MS); +} + +/* + * Enable/disable the reception of FIS + * + * @port Pointer to the port data structure + * @enable 1 to enable, 0 to disable + * + * return value + * Previous state: 1 enabled, 0 disabled + */ +static int mtip_enable_fis(struct mtip_port *port, int enable) +{ + u32 tmp; + + /* enable FIS reception */ + tmp = readl(port->mmio + PORT_CMD); + if (enable) + writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD); + else + writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD); + + /* Flush */ + readl(port->mmio + PORT_CMD); + + return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX)); +} + +/* + * Enable/disable the DMA engine + * + * @port Pointer to the port data structure + * @enable 1 to enable, 0 to disable + * + * return value + * Previous state: 1 enabled, 0 disabled. + */ +static int mtip_enable_engine(struct mtip_port *port, int enable) +{ + u32 tmp; + + /* enable FIS reception */ + tmp = readl(port->mmio + PORT_CMD); + if (enable) + writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD); + else + writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD); + + readl(port->mmio + PORT_CMD); + return (((tmp & PORT_CMD_START) == PORT_CMD_START)); +} + +/* + * Enables the port DMA engine and FIS reception. + * + * return value + * None + */ +static inline void mtip_start_port(struct mtip_port *port) +{ + /* Enable FIS reception */ + mtip_enable_fis(port, 1); + + /* Enable the DMA engine */ + mtip_enable_engine(port, 1); +} + +/* + * Deinitialize a port by disabling port interrupts, the DMA engine, + * and FIS reception. + * + * @port Pointer to the port structure + * + * return value + * None + */ +static inline void mtip_deinit_port(struct mtip_port *port) +{ + /* Disable interrupts on this port */ + writel(0, port->mmio + PORT_IRQ_MASK); + + /* Disable the DMA engine */ + mtip_enable_engine(port, 0); + + /* Disable FIS reception */ + mtip_enable_fis(port, 0); +} + +/* + * Initialize a port. + * + * This function deinitializes the port by calling mtip_deinit_port() and + * then initializes it by setting the command header and RX FIS addresses, + * clearing the SError register and any pending port interrupts before + * re-enabling the default set of port interrupts. + * + * @port Pointer to the port structure. + * + * return value + * None + */ +static void mtip_init_port(struct mtip_port *port) +{ + int i; + mtip_deinit_port(port); + + /* Program the command list base and FIS base addresses */ + if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) { + writel((port->command_list_dma >> 16) >> 16, + port->mmio + PORT_LST_ADDR_HI); + writel((port->rxfis_dma >> 16) >> 16, + port->mmio + PORT_FIS_ADDR_HI); + } + + writel(port->command_list_dma & 0xFFFFFFFF, + port->mmio + PORT_LST_ADDR); + writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR); + + /* Clear SError */ + writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); + + /* reset the completed registers.*/ + for (i = 0; i < port->dd->slot_groups; i++) + writel(0xFFFFFFFF, port->completed[i]); + + /* Clear any pending interrupts for this port */ + writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT); + + /* Clear any pending interrupts on the HBA. */ + writel(readl(port->dd->mmio + HOST_IRQ_STAT), + port->dd->mmio + HOST_IRQ_STAT); + + /* Enable port interrupts */ + writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK); +} + +/* + * Restart a port + * + * @port Pointer to the port data structure. + * + * return value + * None + */ +static void mtip_restart_port(struct mtip_port *port) +{ + unsigned long timeout; + + /* Disable the DMA engine */ + mtip_enable_engine(port, 0); + + /* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */ + timeout = jiffies + msecs_to_jiffies(500); + while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) + && time_before(jiffies, timeout)) + ; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + + /* + * Chip quirk: escalate to hba reset if + * PxCMD.CR not clear after 500 ms + */ + if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) { + dev_warn(&port->dd->pdev->dev, + "PxCMD.CR not clear, escalating reset\n"); + + if (hba_reset_nosleep(port->dd)) + dev_err(&port->dd->pdev->dev, + "HBA reset escalation failed.\n"); + + /* 30 ms delay before com reset to quiesce chip */ + mdelay(30); + } + + dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n"); + + /* Set PxSCTL.DET */ + writel(readl(port->mmio + PORT_SCR_CTL) | + 1, port->mmio + PORT_SCR_CTL); + readl(port->mmio + PORT_SCR_CTL); + + /* Wait 1 ms to quiesce chip function */ + timeout = jiffies + msecs_to_jiffies(1); + while (time_before(jiffies, timeout)) + ; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + + /* Clear PxSCTL.DET */ + writel(readl(port->mmio + PORT_SCR_CTL) & ~1, + port->mmio + PORT_SCR_CTL); + readl(port->mmio + PORT_SCR_CTL); + + /* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */ + timeout = jiffies + msecs_to_jiffies(500); + while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) + && time_before(jiffies, timeout)) + ; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + + if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) + dev_warn(&port->dd->pdev->dev, + "COM reset failed\n"); + + mtip_init_port(port); + mtip_start_port(port); + +} + +/* + * Helper function for tag logging + */ +static void print_tags(struct driver_data *dd, + char *msg, + unsigned long *tagbits, + int cnt) +{ + unsigned char tagmap[128]; + int group, tagmap_len = 0; + + memset(tagmap, 0, sizeof(tagmap)); + for (group = SLOTBITS_IN_LONGS; group > 0; group--) + tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ", + tagbits[group-1]); + dev_warn(&dd->pdev->dev, + "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); +} + +/* + * Called periodically to see if any read/write commands are + * taking too long to complete. + * + * @data Pointer to the PORT data structure. + * + * return value + * None + */ +static void mtip_timeout_function(unsigned long int data) +{ + struct mtip_port *port = (struct mtip_port *) data; + struct host_to_dev_fis *fis; + struct mtip_cmd *command; + int tag, cmdto_cnt = 0; + unsigned int bit, group; + unsigned int num_command_slots = port->dd->slot_groups * 32; + unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; + + if (unlikely(!port)) + return; + + if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(30000)); + return; + } + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + for (tag = 0; tag < num_command_slots; tag++) { + /* + * Skip internal command slot as it has + * its own timeout mechanism + */ + if (tag == MTIP_TAG_INTERNAL) + continue; + + if (atomic_read(&port->commands[tag].active) && + (time_after(jiffies, port->commands[tag].comp_time))) { + group = tag >> 5; + bit = tag & 0x1F; + + command = &port->commands[tag]; + fis = (struct host_to_dev_fis *) command->command; + + set_bit(tag, tagaccum); + cmdto_cnt++; + if (cmdto_cnt == 1) + set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + + /* + * Clear the completed bit. This should prevent + * any interrupt handlers from trying to retire + * the command. + */ + writel(1 << bit, port->completed[group]); + + /* Call the async completion callback. */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, + -EIO); + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&port->dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* + * Clear the allocated bit and active tag for the + * command. + */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); + } + } + + if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { + print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); + + mtip_restart_port(port); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + } + + if (port->ic_pause_timer) { + to = port->ic_pause_timer + msecs_to_jiffies(1000); + if (time_after(jiffies, to)) { + if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { + port->ic_pause_timer = 0; + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + } + + + } + } + + /* Restart the timer */ + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); +} + +/* + * IO completion function. + * + * This completion function is called by the driver ISR when a + * command that was issued by the kernel completes. It first calls the + * asynchronous completion function which normally calls back into the block + * layer passing the asynchronous callback data, then unmaps the + * scatter list associated with the completed command, and finally + * clears the allocated bit associated with the completed command. + * + * @port Pointer to the port data structure. + * @tag Tag of the command. + * @data Pointer to driver_data. + * @status Completion status. + * + * return value + * None + */ +static void mtip_async_complete(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command; + struct driver_data *dd = data; + int cb_status = status ? -EIO : 0; + + if (unlikely(!dd) || unlikely(!port)) + return; + + command = &port->commands[tag]; + + if (unlikely(status == PORT_IRQ_TF_ERR)) { + dev_warn(&port->dd->pdev->dev, + "Command tag %d failed due to TFE\n", tag); + } + + /* Upper layer callback */ + if (likely(command->async_callback)) + command->async_callback(command->async_data, cb_status); + + command->async_callback = NULL; + command->comp_func = NULL; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, + command->sg, + command->scatter_ents, + command->direction); + + /* Clear the allocated and active bits for the command */ + atomic_set(&port->commands[tag].active, 0); + release_slot(port, tag); + + up(&port->cmd_slot); +} + +/* + * Internal command completion callback function. + * + * This function is normally called by the driver ISR when an internal + * command completed. This function signals the command completion by + * calling complete(). + * + * @port Pointer to the port data structure. + * @tag Tag of the command that has completed. + * @data Pointer to a completion structure. + * @status Completion status. + * + * return value + * None + */ +static void mtip_completion(struct mtip_port *port, + int tag, + void *data, + int status) +{ + struct mtip_cmd *command = &port->commands[tag]; + struct completion *waiting = data; + if (unlikely(status == PORT_IRQ_TF_ERR)) + dev_warn(&port->dd->pdev->dev, + "Internal command %d completed with TFE\n", tag); + + command->async_callback = NULL; + command->comp_func = NULL; + + complete(waiting); +} + +static void mtip_null_completion(struct mtip_port *port, + int tag, + void *data, + int status) +{ + return; +} + +static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, + dma_addr_t buffer_dma, unsigned int sectors); +static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, + struct smart_attr *attrib); +/* + * Handle an error. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +static void mtip_handle_tfe(struct driver_data *dd) +{ + int group, tag, bit, reissue, rv; + struct mtip_port *port; + struct mtip_cmd *cmd; + u32 completed; + struct host_to_dev_fis *fis; + unsigned long tagaccum[SLOTBITS_IN_LONGS]; + unsigned int cmd_cnt = 0; + unsigned char *buf; + char *fail_reason = NULL; + int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0; + + dev_warn(&dd->pdev->dev, "Taskfile error\n"); + + port = dd->port; + + /* Stop the timer to prevent command timeouts. */ + del_timer(&port->cmd_timer); + set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_TAG_INTERNAL, port->allocated)) { + cmd = &port->commands[MTIP_TAG_INTERNAL]; + dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); + + atomic_inc(&cmd->active); /* active > 1 indicates error */ + if (cmd->comp_data && cmd->comp_func) { + cmd->comp_func(port, MTIP_TAG_INTERNAL, + cmd->comp_data, PORT_IRQ_TF_ERR); + } + goto handle_tfe_exit; + } + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + /* Loop through all the groups */ + for (group = 0; group < dd->slot_groups; group++) { + completed = readl(port->completed[group]); + + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); + + /* Process successfully completed commands */ + for (bit = 0; bit < 32 && completed; bit++) { + if (!(completed & (1<<bit))) + continue; + tag = (group << 5) + bit; + + /* Skip the internal command slot */ + if (tag == MTIP_TAG_INTERNAL) + continue; + + cmd = &port->commands[tag]; + if (likely(cmd->comp_func)) { + set_bit(tag, tagaccum); + cmd_cnt++; + atomic_set(&cmd->active, 0); + cmd->comp_func(port, + tag, + cmd->comp_data, + 0); + } else { + dev_err(&port->dd->pdev->dev, + "Missing completion func for tag %d", + tag); + if (mtip_check_surprise_removal(dd->pdev)) { + mtip_command_cleanup(dd); + /* don't proceed further */ + return; + } + } + } + } + + print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt); + + /* Restart the port */ + mdelay(20); + mtip_restart_port(port); + + /* Trying to determine the cause of the error */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + fail_all_ncq_write = 1; + fail_reason = "write protect"; + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + fail_all_ncq_cmds = 1; + fail_reason = "thermal shutdown"; + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + fail_all_ncq_cmds = 1; + fail_reason = "rebuild failed"; + } + } + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + /* Loop through all the groups */ + for (group = 0; group < dd->slot_groups; group++) { + for (bit = 0; bit < 32; bit++) { + reissue = 1; + tag = (group << 5) + bit; + cmd = &port->commands[tag]; + + /* If the active bit is set re-issue the command */ + if (atomic_read(&cmd->active) == 0) + continue; + + fis = (struct host_to_dev_fis *)cmd->command; + + /* Should re-issue? */ + if (tag == MTIP_TAG_INTERNAL || + fis->command == ATA_CMD_SET_FEATURES) + reissue = 0; + else { + if (fail_all_ncq_cmds || + (fail_all_ncq_write && + fis->command == ATA_CMD_FPDMA_WRITE)) { + dev_warn(&dd->pdev->dev, + " Fail: %s w/tag %d [%s].\n", + fis->command == ATA_CMD_FPDMA_WRITE ? + "write" : "read", + tag, + fail_reason != NULL ? + fail_reason : "unknown"); + atomic_set(&cmd->active, 0); + if (cmd->comp_func) { + cmd->comp_func(port, tag, + cmd->comp_data, + -ENODATA); + } + continue; + } + } + + /* + * First check if this command has + * exceeded its retries. + */ + if (reissue && (cmd->retries-- > 0)) { + + set_bit(tag, tagaccum); + + /* Re-issue the command. */ + mtip_issue_ncq_command(port, tag); + + continue; + } + + /* Retire a command that will not be reissued */ + dev_warn(&port->dd->pdev->dev, + "retiring tag %d\n", tag); + atomic_set(&cmd->active, 0); + + if (cmd->comp_func) + cmd->comp_func( + port, + tag, + cmd->comp_data, + PORT_IRQ_TF_ERR); + else + dev_warn(&port->dd->pdev->dev, + "Bad completion for tag %d\n", + tag); + } + } + print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); + +handle_tfe_exit: + /* clear eh_active */ + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + + mod_timer(&port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); +} + +/* + * Handle a set device bits interrupt + */ +static inline void mtip_process_sdbf(struct driver_data *dd) +{ + struct mtip_port *port = dd->port; + int group, tag, bit; + u32 completed; + struct mtip_cmd *command; + + /* walk all bits in all slot groups */ + for (group = 0; group < dd->slot_groups; group++) { + completed = readl(port->completed[group]); + if (!completed) + continue; + + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); + + /* Process completed commands. */ + for (bit = 0; + (bit < 32) && completed; + bit++, completed >>= 1) { + if (completed & 0x01) { + tag = (group << 5) | bit; + + /* skip internal command slot. */ + if (unlikely(tag == MTIP_TAG_INTERNAL)) + continue; + + command = &port->commands[tag]; + /* make internal callback */ + if (likely(command->comp_func)) { + command->comp_func( + port, + tag, + command->comp_data, + 0); + } else { + dev_warn(&dd->pdev->dev, + "Null completion " + "for tag %d", + tag); + + if (mtip_check_surprise_removal( + dd->pdev)) { + mtip_command_cleanup(dd); + return; + } + } + } + } + } +} + +/* + * Process legacy pio and d2h interrupts + */ +static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) +{ + struct mtip_port *port = dd->port; + struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; + + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && + (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL))) { + if (cmd->comp_func) { + cmd->comp_func(port, + MTIP_TAG_INTERNAL, + cmd->comp_data, + 0); + return; + } + } + + return; +} + +/* + * Demux and handle errors + */ +static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) +{ + if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) + mtip_handle_tfe(dd); + + if (unlikely(port_stat & PORT_IRQ_CONNECT)) { + dev_warn(&dd->pdev->dev, + "Clearing PxSERR.DIAG.x\n"); + writel((1 << 26), dd->port->mmio + PORT_SCR_ERR); + } + + if (unlikely(port_stat & PORT_IRQ_PHYRDY)) { + dev_warn(&dd->pdev->dev, + "Clearing PxSERR.DIAG.n\n"); + writel((1 << 16), dd->port->mmio + PORT_SCR_ERR); + } + + if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) { + dev_warn(&dd->pdev->dev, + "Port stat errors %x unhandled\n", + (port_stat & ~PORT_IRQ_HANDLED)); + } +} + +static inline irqreturn_t mtip_handle_irq(struct driver_data *data) +{ + struct driver_data *dd = (struct driver_data *) data; + struct mtip_port *port = dd->port; + u32 hba_stat, port_stat; + int rv = IRQ_NONE; + + hba_stat = readl(dd->mmio + HOST_IRQ_STAT); + if (hba_stat) { + rv = IRQ_HANDLED; + + /* Acknowledge the interrupt status on the port.*/ + port_stat = readl(port->mmio + PORT_IRQ_STAT); + writel(port_stat, port->mmio + PORT_IRQ_STAT); + + /* Demux port status */ + if (likely(port_stat & PORT_IRQ_SDB_FIS)) + mtip_process_sdbf(dd); + + if (unlikely(port_stat & PORT_IRQ_ERR)) { + if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + mtip_command_cleanup(dd); + /* don't proceed further */ + return IRQ_HANDLED; + } + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag)) + return rv; + + mtip_process_errors(dd, port_stat & PORT_IRQ_ERR); + } + + if (unlikely(port_stat & PORT_IRQ_LEGACY)) + mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY); + } + + /* acknowledge interrupt */ + writel(hba_stat, dd->mmio + HOST_IRQ_STAT); + + return rv; +} + +/* + * Wrapper for mtip_handle_irq + * (ignores return code) + */ +static void mtip_tasklet(unsigned long data) +{ + mtip_handle_irq((struct driver_data *) data); +} + +/* + * HBA interrupt subroutine. + * + * @irq IRQ number. + * @instance Pointer to the driver data structure. + * + * return value + * IRQ_HANDLED A HBA interrupt was pending and handled. + * IRQ_NONE This interrupt was not for the HBA. + */ +static irqreturn_t mtip_irq_handler(int irq, void *instance) +{ + struct driver_data *dd = instance; + tasklet_schedule(&dd->tasklet); + return IRQ_HANDLED; +} + +static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) +{ + atomic_set(&port->commands[tag].active, 1); + writel(1 << MTIP_TAG_BIT(tag), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); +} + +static bool mtip_pause_ncq(struct mtip_port *port, + struct host_to_dev_fis *fis) +{ + struct host_to_dev_fis *reply; + unsigned long task_file_data; + + reply = port->rxfis + RX_FIS_D2H_REG; + task_file_data = readl(port->mmio+PORT_TFDATA); + + if (fis->command == ATA_CMD_SEC_ERASE_UNIT) + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + + if ((task_file_data & 1)) + return false; + + if (fis->command == ATA_CMD_SEC_ERASE_PREP) { + set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + port->ic_pause_timer = jiffies; + return true; + } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) && + (fis->features == 0x03)) { + set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = jiffies; + return true; + } else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) || + ((fis->command == 0xFC) && + (fis->features == 0x27 || fis->features == 0x72 || + fis->features == 0x62 || fis->features == 0x26))) { + /* Com reset after secure erase or lowlevel format */ + mtip_restart_port(port); + return false; + } + + return false; +} + +/* + * Wait for port to quiesce + * + * @port Pointer to port data structure + * @timeout Max duration to wait (ms) + * + * return value + * 0 Success + * -EBUSY Commands still active + */ +static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) +{ + unsigned long to; + unsigned int n; + unsigned int active = 1; + + to = jiffies + msecs_to_jiffies(timeout); + do { + if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { + msleep(20); + continue; /* svc thd is actively issuing commands */ + } + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return -EFAULT; + /* + * Ignore s_active bit 0 of array element 0. + * This bit will always be set + */ + active = readl(port->s_active[0]) & 0xFFFFFFFE; + for (n = 1; n < port->dd->slot_groups; n++) + active |= readl(port->s_active[n]); + + if (!active) + break; + + msleep(20); + } while (time_before(jiffies, to)); + + return active ? -EBUSY : 0; +} + +/* + * Execute an internal command and wait for the completion. + * + * @port Pointer to the port data structure. + * @fis Pointer to the FIS that describes the command. + * @fis_len Length in WORDS of the FIS. + * @buffer DMA accessible for command data. + * @buf_len Length, in bytes, of the data buffer. + * @opts Command header options, excluding the FIS length + * and the number of PRD entries. + * @timeout Time in ms to wait for the command to complete. + * + * return value + * 0 Command completed successfully. + * -EFAULT The buffer address is not correctly aligned. + * -EBUSY Internal command or other IO in progress. + * -EAGAIN Time out waiting for command to complete. + */ +static int mtip_exec_internal_command(struct mtip_port *port, + struct host_to_dev_fis *fis, + int fis_len, + dma_addr_t buffer, + int buf_len, + u32 opts, + gfp_t atomic, + unsigned long timeout) +{ + struct mtip_cmd_sg *command_sg; + DECLARE_COMPLETION_ONSTACK(wait); + int rv = 0, ready2go = 1; + struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; + unsigned long to; + + /* Make sure the buffer is 8 byte aligned. This is asic specific. */ + if (buffer & 0x00000007) { + dev_err(&port->dd->pdev->dev, + "SG buffer is not 8 byte aligned\n"); + return -EFAULT; + } + + to = jiffies + msecs_to_jiffies(timeout); + do { + ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL, + port->allocated); + if (ready2go) + break; + mdelay(100); + } while (time_before(jiffies, to)); + if (!ready2go) { + dev_warn(&port->dd->pdev->dev, + "Internal cmd active. new cmd [%02X]\n", fis->command); + return -EBUSY; + } + set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = 0; + + if (fis->command == ATA_CMD_SEC_ERASE_UNIT) + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + else if (fis->command == ATA_CMD_DOWNLOAD_MICRO) + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + + if (atomic == GFP_KERNEL) { + if (fis->command != ATA_CMD_STANDBYNOW1) { + /* wait for io to complete if non atomic */ + if (mtip_quiesce_io(port, 5000) < 0) { + dev_warn(&port->dd->pdev->dev, + "Failed to quiesce IO\n"); + release_slot(port, MTIP_TAG_INTERNAL); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + return -EBUSY; + } + } + + /* Set the completion function and data for the command. */ + int_cmd->comp_data = &wait; + int_cmd->comp_func = mtip_completion; + + } else { + /* Clear completion - we're going to poll */ + int_cmd->comp_data = NULL; + int_cmd->comp_func = mtip_null_completion; + } + + /* Copy the command to the command table */ + memcpy(int_cmd->command, fis, fis_len*4); + + /* Populate the SG list */ + int_cmd->command_header->opts = + __force_bit2int cpu_to_le32(opts | fis_len); + if (buf_len) { + command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ; + + command_sg->info = + __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF); + command_sg->dba = + __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF); + command_sg->dba_upper = + __force_bit2int cpu_to_le32((buffer >> 16) >> 16); + + int_cmd->command_header->opts |= + __force_bit2int cpu_to_le32((1 << 16)); + } + + /* Populate the command header */ + int_cmd->command_header->byte_count = 0; + + /* Issue the command to the hardware */ + mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL); + + /* Poll if atomic, wait_for_completion otherwise */ + if (atomic == GFP_KERNEL) { + /* Wait for the command to complete or timeout. */ + if (wait_for_completion_timeout( + &wait, + msecs_to_jiffies(timeout)) == 0) { + dev_err(&port->dd->pdev->dev, + "Internal command did not complete [%d] " + "within timeout of %lu ms\n", + atomic, timeout); + if (mtip_check_surprise_removal(port->dd->pdev) || + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &port->dd->dd_flag)) { + rv = -ENXIO; + goto exec_ic_exit; + } + rv = -EAGAIN; + } + } else { + /* Spin for <timeout> checking if command still outstanding */ + timeout = jiffies + msecs_to_jiffies(timeout); + while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) + && time_before(jiffies, timeout)) { + if (mtip_check_surprise_removal(port->dd->pdev)) { + rv = -ENXIO; + goto exec_ic_exit; + } + if ((fis->command != ATA_CMD_STANDBYNOW1) && + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &port->dd->dd_flag)) { + rv = -ENXIO; + goto exec_ic_exit; + } + if (readl(port->mmio + PORT_IRQ_STAT) & PORT_IRQ_ERR) { + atomic_inc(&int_cmd->active); /* error */ + break; + } + } + } + + if (atomic_read(&int_cmd->active) > 1) { + dev_err(&port->dd->pdev->dev, + "Internal command [%02X] failed\n", fis->command); + rv = -EIO; + } + if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) { + rv = -ENXIO; + if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &port->dd->dd_flag)) { + mtip_restart_port(port); + rv = -EAGAIN; + } + } +exec_ic_exit: + /* Clear the allocated and active bits for the internal command. */ + atomic_set(&int_cmd->active, 0); + release_slot(port, MTIP_TAG_INTERNAL); + if (rv >= 0 && mtip_pause_ncq(port, fis)) { + /* NCQ paused */ + return rv; + } + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + + return rv; +} + +/* + * Byte-swap ATA ID strings. + * + * ATA identify data contains strings in byte-swapped 16-bit words. + * They must be swapped (on all architectures) to be usable as C strings. + * This function swaps bytes in-place. + * + * @buf The buffer location of the string + * @len The number of bytes to swap + * + * return value + * None + */ +static inline void ata_swap_string(u16 *buf, unsigned int len) +{ + int i; + for (i = 0; i < (len/2); i++) + be16_to_cpus(&buf[i]); +} + +/* + * Request the device identity information. + * + * If a user space buffer is not specified, i.e. is NULL, the + * identify information is still read from the drive and placed + * into the identify data buffer (@e port->identify) in the + * port data structure. + * When the identify buffer contains valid identify information @e + * port->identify_valid is non-zero. + * + * @port Pointer to the port structure. + * @user_buffer A user space buffer where the identify data should be + * copied. + * + * return value + * 0 Command completed successfully. + * -EFAULT An error occurred while coping data to the user buffer. + * -1 Command failed. + */ +static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) +{ + int rv = 0; + struct host_to_dev_fis fis; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return -EFAULT; + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_ID_ATA; + + /* Set the identify information as invalid. */ + port->identify_valid = 0; + + /* Clear the identify information. */ + memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + port->identify_dma, + sizeof(u16) * ATA_ID_WORDS, + 0, + GFP_KERNEL, + MTIP_INTERNAL_COMMAND_TIMEOUT_MS) + < 0) { + rv = -1; + goto out; + } + + /* + * Perform any necessary byte-swapping. Yes, the kernel does in fact + * perform field-sensitive swapping on the string fields. + * See the kernel use of ata_id_string() for proof of this. + */ +#ifdef __LITTLE_ENDIAN + ata_swap_string(port->identify + 27, 40); /* model string*/ + ata_swap_string(port->identify + 23, 8); /* firmware string*/ + ata_swap_string(port->identify + 10, 20); /* serial# string*/ +#else + { + int i; + for (i = 0; i < ATA_ID_WORDS; i++) + port->identify[i] = le16_to_cpu(port->identify[i]); + } +#endif + + /* Set the identify buffer as valid. */ + port->identify_valid = 1; + + if (user_buffer) { + if (copy_to_user( + user_buffer, + port->identify, + ATA_ID_WORDS * sizeof(u16))) { + rv = -EFAULT; + goto out; + } + } + +out: + return rv; +} + +/* + * Issue a standby immediate command to the device. + * + * @port Pointer to the port structure. + * + * return value + * 0 Command was executed successfully. + * -1 An error occurred while executing the command. + */ +static int mtip_standby_immediate(struct mtip_port *port) +{ + int rv; + struct host_to_dev_fis fis; + unsigned long start; + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_STANDBYNOW1; + + start = jiffies; + rv = mtip_exec_internal_command(port, + &fis, + 5, + 0, + 0, + 0, + GFP_ATOMIC, + 15000); + dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", + jiffies_to_msecs(jiffies - start)); + if (rv) + dev_warn(&port->dd->pdev->dev, + "STANDBY IMMEDIATE command failed.\n"); + + return rv; +} + +/* + * Issue a READ LOG EXT command to the device. + * + * @port pointer to the port structure. + * @page page number to fetch + * @buffer pointer to buffer + * @buffer_dma dma address corresponding to @buffer + * @sectors page length to fetch, in sectors + * + * return value + * @rv return value from mtip_exec_internal_command() + */ +static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, + dma_addr_t buffer_dma, unsigned int sectors) +{ + struct host_to_dev_fis fis; + + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_READ_LOG_EXT; + fis.sect_count = sectors & 0xFF; + fis.sect_cnt_ex = (sectors >> 8) & 0xFF; + fis.lba_low = page; + fis.lba_mid = 0; + fis.device = ATA_DEVICE_OBS; + + memset(buffer, 0, sectors * ATA_SECT_SIZE); + + return mtip_exec_internal_command(port, + &fis, + 5, + buffer_dma, + sectors * ATA_SECT_SIZE, + 0, + GFP_ATOMIC, + MTIP_INTERNAL_COMMAND_TIMEOUT_MS); +} + +/* + * Issue a SMART READ DATA command to the device. + * + * @port pointer to the port structure. + * @buffer pointer to buffer + * @buffer_dma dma address corresponding to @buffer + * + * return value + * @rv return value from mtip_exec_internal_command() + */ +static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer, + dma_addr_t buffer_dma) +{ + struct host_to_dev_fis fis; + + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_SMART; + fis.features = 0xD0; + fis.sect_count = 1; + fis.lba_mid = 0x4F; + fis.lba_hi = 0xC2; + fis.device = ATA_DEVICE_OBS; + + return mtip_exec_internal_command(port, + &fis, + 5, + buffer_dma, + ATA_SECT_SIZE, + 0, + GFP_ATOMIC, + 15000); +} + +/* + * Get the value of a smart attribute + * + * @port pointer to the port structure + * @id attribute number + * @attrib pointer to return attrib information corresponding to @id + * + * return value + * -EINVAL NULL buffer passed or unsupported attribute @id. + * -EPERM Identify data not valid, SMART not supported or not enabled + */ +static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, + struct smart_attr *attrib) +{ + int rv, i; + struct smart_attr *pattr; + + if (!attrib) + return -EINVAL; + + if (!port->identify_valid) { + dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n"); + return -EPERM; + } + if (!(port->identify[82] & 0x1)) { + dev_warn(&port->dd->pdev->dev, "SMART not supported\n"); + return -EPERM; + } + if (!(port->identify[85] & 0x1)) { + dev_warn(&port->dd->pdev->dev, "SMART not enabled\n"); + return -EPERM; + } + + memset(port->smart_buf, 0, ATA_SECT_SIZE); + rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma); + if (rv) { + dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n"); + return rv; + } + + pattr = (struct smart_attr *)(port->smart_buf + 2); + for (i = 0; i < 29; i++, pattr++) + if (pattr->attr_id == id) { + memcpy(attrib, pattr, sizeof(struct smart_attr)); + break; + } + + if (i == 29) { + dev_warn(&port->dd->pdev->dev, + "Query for invalid SMART attribute ID\n"); + rv = -EINVAL; + } + + return rv; +} + +/* + * Get the drive capacity. + * + * @dd Pointer to the device data structure. + * @sectors Pointer to the variable that will receive the sector count. + * + * return value + * 1 Capacity was returned successfully. + * 0 The identify information is invalid. + */ +static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors) +{ + struct mtip_port *port = dd->port; + u64 total, raw0, raw1, raw2, raw3; + raw0 = port->identify[100]; + raw1 = port->identify[101]; + raw2 = port->identify[102]; + raw3 = port->identify[103]; + total = raw0 | raw1<<16 | raw2<<32 | raw3<<48; + *sectors = total; + return (bool) !!port->identify_valid; +} + +/* + * Reset the HBA. + * + * Resets the HBA by setting the HBA Reset bit in the Global + * HBA Control register. After setting the HBA Reset bit the + * function waits for 1 second before reading the HBA Reset + * bit to make sure it has cleared. If HBA Reset is not clear + * an error is returned. Cannot be used in non-blockable + * context. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 The reset was successful. + * -1 The HBA Reset bit did not clear. + */ +static int mtip_hba_reset(struct driver_data *dd) +{ + mtip_deinit_port(dd->port); + + /* Set the reset bit */ + writel(HOST_RESET, dd->mmio + HOST_CTL); + + /* Flush */ + readl(dd->mmio + HOST_CTL); + + /* Wait for reset to clear */ + ssleep(1); + + /* Check the bit has cleared */ + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) { + dev_err(&dd->pdev->dev, + "Reset bit did not clear.\n"); + return -1; + } + + return 0; +} + +/* + * Display the identify command data. + * + * @port Pointer to the port data structure. + * + * return value + * None + */ +static void mtip_dump_identify(struct mtip_port *port) +{ + sector_t sectors; + unsigned short revid; + char cbuf[42]; + + if (!port->identify_valid) + return; + + strlcpy(cbuf, (char *)(port->identify+10), 21); + dev_info(&port->dd->pdev->dev, + "Serial No.: %s\n", cbuf); + + strlcpy(cbuf, (char *)(port->identify+23), 9); + dev_info(&port->dd->pdev->dev, + "Firmware Ver.: %s\n", cbuf); + + strlcpy(cbuf, (char *)(port->identify+27), 41); + dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); + + if (mtip_hw_get_capacity(port->dd, §ors)) + dev_info(&port->dd->pdev->dev, + "Capacity: %llu sectors (%llu MB)\n", + (u64)sectors, + ((u64)sectors) * ATA_SECT_SIZE >> 20); + + pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); + switch (revid & 0xFF) { + case 0x1: + strlcpy(cbuf, "A0", 3); + break; + case 0x3: + strlcpy(cbuf, "A2", 3); + break; + default: + strlcpy(cbuf, "?", 2); + break; + } + dev_info(&port->dd->pdev->dev, + "Card Type: %s\n", cbuf); +} + +/* + * Map the commands scatter list into the command table. + * + * @command Pointer to the command. + * @nents Number of scatter list entries. + * + * return value + * None + */ +static inline void fill_command_sg(struct driver_data *dd, + struct mtip_cmd *command, + int nents) +{ + int n; + unsigned int dma_len; + struct mtip_cmd_sg *command_sg; + struct scatterlist *sg = command->sg; + + command_sg = command->command + AHCI_CMD_TBL_HDR_SZ; + + for (n = 0; n < nents; n++) { + dma_len = sg_dma_len(sg); + if (dma_len > 0x400000) + dev_err(&dd->pdev->dev, + "DMA segment length truncated\n"); + command_sg->info = __force_bit2int + cpu_to_le32((dma_len-1) & 0x3FFFFF); + command_sg->dba = __force_bit2int + cpu_to_le32(sg_dma_address(sg)); + command_sg->dba_upper = __force_bit2int + cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); + command_sg++; + sg++; + } +} + +/* + * @brief Execute a drive command. + * + * return value 0 The command completed successfully. + * return value -1 An error occurred while executing the command. + */ +static int exec_drive_task(struct mtip_port *port, u8 *command) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = command[0]; + fis.features = command[1]; + fis.sect_count = command[2]; + fis.sector = command[3]; + fis.cyl_low = command[4]; + fis.cyl_hi = command[5]; + fis.device = command[6] & ~0x10; /* Clear the dev bit*/ + + dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", + __func__, + command[0], + command[1], + command[2], + command[3], + command[4], + command[5], + command[6]); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + 0, + 0, + 0, + GFP_KERNEL, + MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { + return -1; + } + + command[0] = reply->command; /* Status*/ + command[1] = reply->features; /* Error*/ + command[4] = reply->cyl_low; + command[5] = reply->cyl_hi; + + dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n", + __func__, + command[0], + command[1], + command[4], + command[5]); + + return 0; +} + +/* + * @brief Execute a drive command. + * + * @param port Pointer to the port data structure. + * @param command Pointer to the user specified command parameters. + * @param user_buffer Pointer to the user space buffer where read sector + * data should be copied. + * + * return value 0 The command completed successfully. + * return value -EFAULT An error occurred while copying the completion + * data to the user space buffer. + * return value -1 An error occurred while executing the command. + */ +static int exec_drive_command(struct mtip_port *port, u8 *command, + void __user *user_buffer) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply; + u8 *buf = NULL; + dma_addr_t dma_addr = 0; + int rv = 0, xfer_sz = command[3]; + + if (xfer_sz) { + if (!user_buffer) + return -EFAULT; + + buf = dmam_alloc_coherent(&port->dd->pdev->dev, + ATA_SECT_SIZE * xfer_sz, + &dma_addr, + GFP_KERNEL); + if (!buf) { + dev_err(&port->dd->pdev->dev, + "Memory allocation failed (%d bytes)\n", + ATA_SECT_SIZE * xfer_sz); + return -ENOMEM; + } + memset(buf, 0, ATA_SECT_SIZE * xfer_sz); + } + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = command[0]; + fis.features = command[2]; + fis.sect_count = command[3]; + if (fis.command == ATA_CMD_SMART) { + fis.sector = command[1]; + fis.cyl_low = 0x4F; + fis.cyl_hi = 0xC2; + } + + if (xfer_sz) + reply = (port->rxfis + RX_FIS_PIO_SETUP); + else + reply = (port->rxfis + RX_FIS_D2H_REG); + + dbg_printk(MTIP_DRV_NAME + " %s: User Command: cmd %x, sect %x, " + "feat %x, sectcnt %x\n", + __func__, + command[0], + command[1], + command[2], + command[3]); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + (xfer_sz ? dma_addr : 0), + (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), + 0, + GFP_KERNEL, + MTIP_IOCTL_COMMAND_TIMEOUT_MS) + < 0) { + rv = -EFAULT; + goto exit_drive_command; + } + + /* Collect the completion status. */ + command[0] = reply->command; /* Status*/ + command[1] = reply->features; /* Error*/ + command[2] = reply->sect_count; + + dbg_printk(MTIP_DRV_NAME + " %s: Completion Status: stat %x, " + "err %x, nsect %x\n", + __func__, + command[0], + command[1], + command[2]); + + if (xfer_sz) { + if (copy_to_user(user_buffer, + buf, + ATA_SECT_SIZE * command[3])) { + rv = -EFAULT; + goto exit_drive_command; + } + } +exit_drive_command: + if (buf) + dmam_free_coherent(&port->dd->pdev->dev, + ATA_SECT_SIZE * xfer_sz, buf, dma_addr); + return rv; +} + +/* + * Indicates whether a command has a single sector payload. + * + * @command passed to the device to perform the certain event. + * @features passed to the device to perform the certain event. + * + * return value + * 1 command is one that always has a single sector payload, + * regardless of the value in the Sector Count field. + * 0 otherwise + * + */ +static unsigned int implicit_sector(unsigned char command, + unsigned char features) +{ + unsigned int rv = 0; + + /* list of commands that have an implicit sector count of 1 */ + switch (command) { + case ATA_CMD_SEC_SET_PASS: + case ATA_CMD_SEC_UNLOCK: + case ATA_CMD_SEC_ERASE_PREP: + case ATA_CMD_SEC_ERASE_UNIT: + case ATA_CMD_SEC_FREEZE_LOCK: + case ATA_CMD_SEC_DISABLE_PASS: + case ATA_CMD_PMP_READ: + case ATA_CMD_PMP_WRITE: + rv = 1; + break; + case ATA_CMD_SET_MAX: + if (features == ATA_SET_MAX_UNLOCK) + rv = 1; + break; + case ATA_CMD_SMART: + if ((features == ATA_SMART_READ_VALUES) || + (features == ATA_SMART_READ_THRESHOLDS)) + rv = 1; + break; + case ATA_CMD_CONF_OVERLAY: + if ((features == ATA_DCO_IDENTIFY) || + (features == ATA_DCO_SET)) + rv = 1; + break; + } + return rv; +} + +static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout) +{ + switch (fis->command) { + case ATA_CMD_DOWNLOAD_MICRO: + *timeout = 120000; /* 2 minutes */ + break; + case ATA_CMD_SEC_ERASE_UNIT: + case 0xFC: + *timeout = 240000; /* 4 minutes */ + break; + case ATA_CMD_STANDBYNOW1: + *timeout = 120000; /* 2 minutes */ + break; + case 0xF7: + case 0xFA: + *timeout = 60000; /* 60 seconds */ + break; + case ATA_CMD_SMART: + *timeout = 15000; /* 15 seconds */ + break; + default: + *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; + break; + } +} + +/* + * Executes a taskfile + * See ide_taskfile_ioctl() for derivation + */ +static int exec_drive_taskfile(struct driver_data *dd, + void __user *buf, + ide_task_request_t *req_task, + int outtotal) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply; + u8 *outbuf = NULL; + u8 *inbuf = NULL; + dma_addr_t outbuf_dma = 0; + dma_addr_t inbuf_dma = 0; + dma_addr_t dma_buffer = 0; + int err = 0; + unsigned int taskin = 0; + unsigned int taskout = 0; + u8 nsect = 0; + unsigned int timeout; + unsigned int force_single_sector; + unsigned int transfer_size; + unsigned long task_file_data; + int intotal = outtotal + req_task->out_size; + + taskout = req_task->out_size; + taskin = req_task->in_size; + /* 130560 = 512 * 0xFF*/ + if (taskin > 130560 || taskout > 130560) { + err = -EINVAL; + goto abort; + } + + if (taskout) { + outbuf = kzalloc(taskout, GFP_KERNEL); + if (outbuf == NULL) { + err = -ENOMEM; + goto abort; + } + if (copy_from_user(outbuf, buf + outtotal, taskout)) { + err = -EFAULT; + goto abort; + } + outbuf_dma = pci_map_single(dd->pdev, + outbuf, + taskout, + DMA_TO_DEVICE); + if (outbuf_dma == 0) { + err = -ENOMEM; + goto abort; + } + dma_buffer = outbuf_dma; + } + + if (taskin) { + inbuf = kzalloc(taskin, GFP_KERNEL); + if (inbuf == NULL) { + err = -ENOMEM; + goto abort; + } + + if (copy_from_user(inbuf, buf + intotal, taskin)) { + err = -EFAULT; + goto abort; + } + inbuf_dma = pci_map_single(dd->pdev, + inbuf, + taskin, DMA_FROM_DEVICE); + if (inbuf_dma == 0) { + err = -ENOMEM; + goto abort; + } + dma_buffer = inbuf_dma; + } + + /* only supports PIO and non-data commands from this ioctl. */ + switch (req_task->data_phase) { + case TASKFILE_OUT: + nsect = taskout / ATA_SECT_SIZE; + reply = (dd->port->rxfis + RX_FIS_PIO_SETUP); + break; + case TASKFILE_IN: + reply = (dd->port->rxfis + RX_FIS_PIO_SETUP); + break; + case TASKFILE_NO_DATA: + reply = (dd->port->rxfis + RX_FIS_D2H_REG); + break; + default: + err = -EINVAL; + goto abort; + } + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = req_task->io_ports[7]; + fis.features = req_task->io_ports[1]; + fis.sect_count = req_task->io_ports[2]; + fis.lba_low = req_task->io_ports[3]; + fis.lba_mid = req_task->io_ports[4]; + fis.lba_hi = req_task->io_ports[5]; + /* Clear the dev bit*/ + fis.device = req_task->io_ports[6] & ~0x10; + + if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) { + req_task->in_flags.all = + IDE_TASKFILE_STD_IN_FLAGS | + (IDE_HOB_STD_IN_FLAGS << 8); + fis.lba_low_ex = req_task->hob_ports[3]; + fis.lba_mid_ex = req_task->hob_ports[4]; + fis.lba_hi_ex = req_task->hob_ports[5]; + fis.features_ex = req_task->hob_ports[1]; + fis.sect_cnt_ex = req_task->hob_ports[2]; + + } else { + req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS; + } + + force_single_sector = implicit_sector(fis.command, fis.features); + + if ((taskin || taskout) && (!fis.sect_count)) { + if (nsect) + fis.sect_count = nsect; + else { + if (!force_single_sector) { + dev_warn(&dd->pdev->dev, + "data movement but " + "sect_count is 0\n"); + err = -EINVAL; + goto abort; + } + } + } + + dbg_printk(MTIP_DRV_NAME + " %s: cmd %x, feat %x, nsect %x," + " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x," + " head/dev %x\n", + __func__, + fis.command, + fis.features, + fis.sect_count, + fis.lba_low, + fis.lba_mid, + fis.lba_hi, + fis.device); + + mtip_set_timeout(&fis, &timeout); + + /* Determine the correct transfer size.*/ + if (force_single_sector) + transfer_size = ATA_SECT_SIZE; + else + transfer_size = ATA_SECT_SIZE * fis.sect_count; + + /* Execute the command.*/ + if (mtip_exec_internal_command(dd->port, + &fis, + 5, + dma_buffer, + transfer_size, + 0, + GFP_KERNEL, + timeout) < 0) { + err = -EIO; + goto abort; + } + + task_file_data = readl(dd->port->mmio+PORT_TFDATA); + + if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) { + reply = dd->port->rxfis + RX_FIS_PIO_SETUP; + req_task->io_ports[7] = reply->control; + } else { + reply = dd->port->rxfis + RX_FIS_D2H_REG; + req_task->io_ports[7] = reply->command; + } + + /* reclaim the DMA buffers.*/ + if (inbuf_dma) + pci_unmap_single(dd->pdev, inbuf_dma, + taskin, DMA_FROM_DEVICE); + if (outbuf_dma) + pci_unmap_single(dd->pdev, outbuf_dma, + taskout, DMA_TO_DEVICE); + inbuf_dma = 0; + outbuf_dma = 0; + + /* return the ATA registers to the caller.*/ + req_task->io_ports[1] = reply->features; + req_task->io_ports[2] = reply->sect_count; + req_task->io_ports[3] = reply->lba_low; + req_task->io_ports[4] = reply->lba_mid; + req_task->io_ports[5] = reply->lba_hi; + req_task->io_ports[6] = reply->device; + + if (req_task->out_flags.all & 1) { + + req_task->hob_ports[3] = reply->lba_low_ex; + req_task->hob_ports[4] = reply->lba_mid_ex; + req_task->hob_ports[5] = reply->lba_hi_ex; + req_task->hob_ports[1] = reply->features_ex; + req_task->hob_ports[2] = reply->sect_cnt_ex; + } + dbg_printk(MTIP_DRV_NAME + " %s: Completion: stat %x," + "err %x, sect_cnt %x, lbalo %x," + "lbamid %x, lbahi %x, dev %x\n", + __func__, + req_task->io_ports[7], + req_task->io_ports[1], + req_task->io_ports[2], + req_task->io_ports[3], + req_task->io_ports[4], + req_task->io_ports[5], + req_task->io_ports[6]); + + if (taskout) { + if (copy_to_user(buf + outtotal, outbuf, taskout)) { + err = -EFAULT; + goto abort; + } + } + if (taskin) { + if (copy_to_user(buf + intotal, inbuf, taskin)) { + err = -EFAULT; + goto abort; + } + } +abort: + if (inbuf_dma) + pci_unmap_single(dd->pdev, inbuf_dma, + taskin, DMA_FROM_DEVICE); + if (outbuf_dma) + pci_unmap_single(dd->pdev, outbuf_dma, + taskout, DMA_TO_DEVICE); + kfree(outbuf); + kfree(inbuf); + + return err; +} + +/* + * Handle IOCTL calls from the Block Layer. + * + * This function is called by the Block Layer when it receives an IOCTL + * command that it does not understand. If the IOCTL command is not supported + * this function returns -ENOTTY. + * + * @dd Pointer to the driver data structure. + * @cmd IOCTL command passed from the Block Layer. + * @arg IOCTL argument passed from the Block Layer. + * + * return value + * 0 The IOCTL completed successfully. + * -ENOTTY The specified command is not supported. + * -EFAULT An error occurred copying data to a user space buffer. + * -EIO An error occurred while executing the command. + */ +static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case HDIO_GET_IDENTITY: + { + if (copy_to_user((void __user *)arg, dd->port->identify, + sizeof(u16) * ATA_ID_WORDS)) + return -EFAULT; + break; + } + case HDIO_DRIVE_CMD: + { + u8 drive_command[4]; + + /* Copy the user command info to our buffer. */ + if (copy_from_user(drive_command, + (void __user *) arg, + sizeof(drive_command))) + return -EFAULT; + + /* Execute the drive command. */ + if (exec_drive_command(dd->port, + drive_command, + (void __user *) (arg+4))) + return -EIO; + + /* Copy the status back to the users buffer. */ + if (copy_to_user((void __user *) arg, + drive_command, + sizeof(drive_command))) + return -EFAULT; + + break; + } + case HDIO_DRIVE_TASK: + { + u8 drive_command[7]; + + /* Copy the user command info to our buffer. */ + if (copy_from_user(drive_command, + (void __user *) arg, + sizeof(drive_command))) + return -EFAULT; + + /* Execute the drive command. */ + if (exec_drive_task(dd->port, drive_command)) + return -EIO; + + /* Copy the status back to the users buffer. */ + if (copy_to_user((void __user *) arg, + drive_command, + sizeof(drive_command))) + return -EFAULT; + + break; + } + case HDIO_DRIVE_TASKFILE: { + ide_task_request_t req_task; + int ret, outtotal; + + if (copy_from_user(&req_task, (void __user *) arg, + sizeof(req_task))) + return -EFAULT; + + outtotal = sizeof(req_task); + + ret = exec_drive_taskfile(dd, (void __user *) arg, + &req_task, outtotal); + + if (copy_to_user((void __user *) arg, &req_task, + sizeof(req_task))) + return -EFAULT; + + return ret; + } + + default: + return -EINVAL; + } + return 0; +} + +/* + * Submit an IO to the hw + * + * This function is called by the block layer to issue an io + * to the device. Upon completion, the callback function will + * be called with the data parameter passed as the callback data. + * + * @dd Pointer to the driver data structure. + * @start First sector to read. + * @nsect Number of sectors to read. + * @nents Number of entries in scatter list for the read command. + * @tag The tag of this read command. + * @callback Pointer to the function that should be called + * when the read completes. + * @data Callback data passed to the callback function + * when the read completes. + * @dir Direction (read or write) + * + * return value + * None + */ +static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, + int nsect, int nents, int tag, void *callback, + void *data, int dir) +{ + struct host_to_dev_fis *fis; + struct mtip_port *port = dd->port; + struct mtip_cmd *command = &port->commands[tag]; + int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + + /* Map the scatter list for DMA access */ + nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); + + command->scatter_ents = nents; + + /* + * The number of retries for this command before it is + * reported as a failure to the upper layers. + */ + command->retries = MTIP_MAX_RETRIES; + + /* Fill out fis */ + fis = command->command; + fis->type = 0x27; + fis->opts = 1 << 7; + fis->command = + (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); + *((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF); + *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF); + fis->device = 1 << 6; + fis->features = nsect & 0xFF; + fis->features_ex = (nsect >> 8) & 0xFF; + fis->sect_count = ((tag << 3) | (tag >> 5)); + fis->sect_cnt_ex = 0; + fis->control = 0; + fis->res2 = 0; + fis->res3 = 0; + fill_command_sg(dd, command, nents); + + /* Populate the command header */ + command->command_header->opts = + __force_bit2int cpu_to_le32( + (nents << 16) | 5 | AHCI_CMD_PREFETCH); + command->command_header->byte_count = 0; + + /* + * Set the completion function and data for the command + * within this layer. + */ + command->comp_data = dd; + command->comp_func = mtip_async_complete; + command->direction = dma_dir; + + /* + * Set the completion function and data for the command passed + * from the upper layer. + */ + command->async_data = data; + command->async_callback = callback; + + /* + * To prevent this command from being issued + * if an internal command is in progress or error handling is active. + */ + if (port->flags & MTIP_PF_PAUSE_IO) { + set_bit(tag, port->cmds_to_issue); + set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); + return; + } + + /* Issue the command to the hardware */ + mtip_issue_ncq_command(port, tag); + + return; +} + +/* + * Release a command slot. + * + * @dd Pointer to the driver data structure. + * @tag Slot tag + * + * return value + * None + */ +static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) +{ + release_slot(dd->port, tag); +} + +/* + * Obtain a command slot and return its associated scatter list. + * + * @dd Pointer to the driver data structure. + * @tag Pointer to an int that will receive the allocated command + * slot tag. + * + * return value + * Pointer to the scatter list for the allocated command slot + * or NULL if no command slots are available. + */ +static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, + int *tag) +{ + /* + * It is possible that, even with this semaphore, a thread + * may think that no command slots are available. Therefore, we + * need to make an attempt to get_slot(). + */ + down(&dd->port->cmd_slot); + *tag = get_slot(dd->port); + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { + up(&dd->port->cmd_slot); + return NULL; + } + if (unlikely(*tag < 0)) { + up(&dd->port->cmd_slot); + return NULL; + } + + return dd->port->commands[*tag].sg; +} + +/* + * Sysfs status dump. + * + * @dev Pointer to the device structure, passed by the kernrel. + * @attr Pointer to the device_attribute structure passed by the kernel. + * @buf Pointer to the char buffer that will receive the stats info. + * + * return value + * The size, in bytes, of the data copied into buf. + */ +static ssize_t mtip_hw_show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct driver_data *dd = dev_to_disk(dev)->private_data; + int size = 0; + + if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag)) + size += sprintf(buf, "%s", "thermal_shutdown\n"); + else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag)) + size += sprintf(buf, "%s", "write_protect\n"); + else + size += sprintf(buf, "%s", "online\n"); + + return size; +} + +static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); + +static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + u32 group_allocated; + int size = *offset; + int n; + + if (!len || size) + return 0; + + size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->s_active[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Command Issue : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->cmd_issue[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Completed : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->completed[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n", + readl(dd->port->mmio + PORT_IRQ_STAT)); + size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n", + readl(dd->mmio + HOST_IRQ_STAT)); + size += sprintf(&buf[size], "\n"); + + size += sprintf(&buf[size], "L/ Allocated : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->allocated[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->allocated[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + size += sprintf(&buf[size], "L/ Commands in Q : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->cmds_to_issue[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->cmds_to_issue[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + +static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + int size = *offset; + + if (!len || size) + return 0; + + size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", + dd->port->flags); + size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", + dd->dd_flag); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + +static const struct file_operations mtip_regs_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_registers, + .llseek = no_llseek, +}; + +static const struct file_operations mtip_flags_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_flags, + .llseek = no_llseek, +}; + +/* + * Create the sysfs related attributes. + * + * @dd Pointer to the driver data structure. + * @kobj Pointer to the kobj for the block device. + * + * return value + * 0 Operation completed successfully. + * -EINVAL Invalid parameter. + */ +static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) +{ + if (!kobj || !dd) + return -EINVAL; + + if (sysfs_create_file(kobj, &dev_attr_status.attr)) + dev_warn(&dd->pdev->dev, + "Error creating 'status' sysfs entry\n"); + return 0; +} + +/* + * Remove the sysfs related attributes. + * + * @dd Pointer to the driver data structure. + * @kobj Pointer to the kobj for the block device. + * + * return value + * 0 Operation completed successfully. + * -EINVAL Invalid parameter. + */ +static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) +{ + if (!kobj || !dd) + return -EINVAL; + + sysfs_remove_file(kobj, &dev_attr_status.attr); + + return 0; +} + +static int mtip_hw_debugfs_init(struct driver_data *dd) +{ + if (!dfs_parent) + return -1; + + dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); + if (IS_ERR_OR_NULL(dd->dfs_node)) { + dev_warn(&dd->pdev->dev, + "Error creating node %s under debugfs\n", + dd->disk->disk_name); + dd->dfs_node = NULL; + return -1; + } + + debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd, + &mtip_flags_fops); + debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd, + &mtip_regs_fops); + + return 0; +} + +static void mtip_hw_debugfs_exit(struct driver_data *dd) +{ + debugfs_remove_recursive(dd->dfs_node); +} + + +/* + * Perform any init/resume time hardware setup + * + * @dd Pointer to the driver data structure. + * + * return value + * None + */ +static inline void hba_setup(struct driver_data *dd) +{ + u32 hwdata; + hwdata = readl(dd->mmio + HOST_HSORG); + + /* interrupt bug workaround: use only 1 IS bit.*/ + writel(hwdata | + HSORG_DISABLE_SLOTGRP_INTR | + HSORG_DISABLE_SLOTGRP_PXIS, + dd->mmio + HOST_HSORG); +} + +/* + * Detect the details of the product, and store anything needed + * into the driver data structure. This includes product type and + * version and number of slot groups. + * + * @dd Pointer to the driver data structure. + * + * return value + * None + */ +static void mtip_detect_product(struct driver_data *dd) +{ + u32 hwdata; + unsigned int rev, slotgroups; + + /* + * HBA base + 0xFC [15:0] - vendor-specific hardware interface + * info register: + * [15:8] hardware/software interface rev# + * [ 3] asic-style interface + * [ 2:0] number of slot groups, minus 1 (only valid for asic-style). + */ + hwdata = readl(dd->mmio + HOST_HSORG); + + dd->product_type = MTIP_PRODUCT_UNKNOWN; + dd->slot_groups = 1; + + if (hwdata & 0x8) { + dd->product_type = MTIP_PRODUCT_ASICFPGA; + rev = (hwdata & HSORG_HWREV) >> 8; + slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1; + dev_info(&dd->pdev->dev, + "ASIC-FPGA design, HS rev 0x%x, " + "%i slot groups [%i slots]\n", + rev, + slotgroups, + slotgroups * 32); + + if (slotgroups > MTIP_MAX_SLOT_GROUPS) { + dev_warn(&dd->pdev->dev, + "Warning: driver only supports " + "%i slot groups.\n", MTIP_MAX_SLOT_GROUPS); + slotgroups = MTIP_MAX_SLOT_GROUPS; + } + dd->slot_groups = slotgroups; + return; + } + + dev_warn(&dd->pdev->dev, "Unrecognized product id\n"); +} + +/* + * Blocking wait for FTL rebuild to complete + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * 0 FTL rebuild completed successfully + * -EFAULT FTL rebuild error/timeout/interruption + */ +static int mtip_ftl_rebuild_poll(struct driver_data *dd) +{ + unsigned long timeout, cnt = 0, start; + + dev_warn(&dd->pdev->dev, + "FTL rebuild in progress. Polling for completion.\n"); + + start = jiffies; + timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS); + + do { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) + return -EFAULT; + if (mtip_check_surprise_removal(dd->pdev)) + return -EFAULT; + + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; + + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + ssleep(1); + /* Print message every 3 minutes */ + if (cnt++ >= 180) { + dev_warn(&dd->pdev->dev, + "FTL rebuild in progress (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + cnt = 0; + } + } else { + dev_warn(&dd->pdev->dev, + "FTL rebuild complete (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + mtip_block_initialize(dd); + return 0; + } + ssleep(10); + } while (time_before(jiffies, timeout)); + + /* Check for timeout */ + dev_err(&dd->pdev->dev, + "Timed out waiting for FTL rebuild to complete (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + return -EFAULT; +} + +/* + * service thread to issue queued commands + * + * @data Pointer to the driver data structure. + * + * return value + * 0 + */ + +static int mtip_service_thread(void *data) +{ + struct driver_data *dd = (struct driver_data *)data; + unsigned long slot, slot_start, slot_wrap; + unsigned int num_cmd_slots = dd->slot_groups * 32; + struct mtip_port *port = dd->port; + + while (1) { + /* + * the condition is to check neither an internal command is + * is in progress nor error handling is active + */ + wait_event_interruptible(port->svc_wait, (port->flags) && + !(port->flags & MTIP_PF_PAUSE_IO)); + + if (kthread_should_stop()) + break; + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) + break; + + set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { + slot = 1; + /* used to restrict the loop to one iteration */ + slot_start = num_cmd_slots; + slot_wrap = 0; + while (1) { + slot = find_next_bit(port->cmds_to_issue, + num_cmd_slots, slot); + if (slot_wrap == 1) { + if ((slot_start >= slot) || + (slot >= num_cmd_slots)) + break; + } + if (unlikely(slot_start == num_cmd_slots)) + slot_start = slot; + + if (unlikely(slot == num_cmd_slots)) { + slot = 1; + slot_wrap = 1; + continue; + } + + /* Issue the command to the hardware */ + mtip_issue_ncq_command(port, slot); + + clear_bit(slot, port->cmds_to_issue); + } + + clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); + } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { + if (!mtip_ftl_rebuild_poll(dd)) + set_bit(MTIP_DDF_REBUILD_FAILED_BIT, + &dd->dd_flag); + clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); + } + clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + + if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + break; + } + return 0; +} + +/* + * Called once for each card. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 on success, else an error code. + */ +static int mtip_hw_init(struct driver_data *dd) +{ + int i; + int rv; + unsigned int num_command_slots; + unsigned long timeout, timetaken; + unsigned char *buf; + struct smart_attr attr242; + + dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; + + mtip_detect_product(dd); + if (dd->product_type == MTIP_PRODUCT_UNKNOWN) { + rv = -EIO; + goto out1; + } + num_command_slots = dd->slot_groups * 32; + + hba_setup(dd); + + tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd); + + dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL); + if (!dd->port) { + dev_err(&dd->pdev->dev, + "Memory allocation: port structure\n"); + return -ENOMEM; + } + + /* Counting semaphore to track command slot usage */ + sema_init(&dd->port->cmd_slot, num_command_slots - 1); + + /* Spinlock to prevent concurrent issue */ + spin_lock_init(&dd->port->cmd_issue_lock); + + /* Set the port mmio base address. */ + dd->port->mmio = dd->mmio + PORT_OFFSET; + dd->port->dd = dd; + + /* Allocate memory for the command list. */ + dd->port->command_list = + dmam_alloc_coherent(&dd->pdev->dev, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), + &dd->port->command_list_dma, + GFP_KERNEL); + if (!dd->port->command_list) { + dev_err(&dd->pdev->dev, + "Memory allocation: command list\n"); + rv = -ENOMEM; + goto out1; + } + + /* Clear the memory we have allocated. */ + memset(dd->port->command_list, + 0, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4)); + + /* Setup the addresse of the RX FIS. */ + dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ; + dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ; + + /* Setup the address of the command tables. */ + dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ; + dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ; + + /* Setup the address of the identify data. */ + dd->port->identify = dd->port->command_table + + HW_CMD_TBL_AR_SZ; + dd->port->identify_dma = dd->port->command_tbl_dma + + HW_CMD_TBL_AR_SZ; + + /* Setup the address of the sector buffer - for some non-ncq cmds */ + dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE; + dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE; + + /* Setup the address of the log buf - for read log command */ + dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE; + dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE; + + /* Setup the address of the smart buf - for smart read data command */ + dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE; + dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE; + + + /* Point the command headers at the command tables. */ + for (i = 0; i < num_command_slots; i++) { + dd->port->commands[i].command_header = + dd->port->command_list + + (sizeof(struct mtip_cmd_hdr) * i); + dd->port->commands[i].command_header_dma = + dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * i); + + dd->port->commands[i].command = + dd->port->command_table + (HW_CMD_TBL_SZ * i); + dd->port->commands[i].command_dma = + dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i); + + if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64) + dd->port->commands[i].command_header->ctbau = + __force_bit2int cpu_to_le32( + (dd->port->commands[i].command_dma >> 16) >> 16); + dd->port->commands[i].command_header->ctba = + __force_bit2int cpu_to_le32( + dd->port->commands[i].command_dma & 0xFFFFFFFF); + + /* + * If this is not done, a bug is reported by the stock + * FC11 i386. Due to the fact that it has lots of kernel + * debugging enabled. + */ + sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG); + + /* Mark all commands as currently inactive.*/ + atomic_set(&dd->port->commands[i].active, 0); + } + + /* Setup the pointers to the extended s_active and CI registers. */ + for (i = 0; i < dd->slot_groups; i++) { + dd->port->s_active[i] = + dd->port->mmio + i*0x80 + PORT_SCR_ACT; + dd->port->cmd_issue[i] = + dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE; + dd->port->completed[i] = + dd->port->mmio + i*0x80 + PORT_SDBV; + } + + timetaken = jiffies; + timeout = jiffies + msecs_to_jiffies(30000); + while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) && + time_before(jiffies, timeout)) { + mdelay(100); + } + if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + timetaken = jiffies - timetaken; + dev_warn(&dd->pdev->dev, + "Surprise removal detected at %u ms\n", + jiffies_to_msecs(timetaken)); + rv = -ENODEV; + goto out2 ; + } + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { + timetaken = jiffies - timetaken; + dev_warn(&dd->pdev->dev, + "Removal detected at %u ms\n", + jiffies_to_msecs(timetaken)); + rv = -EFAULT; + goto out2; + } + + /* Conditionally reset the HBA. */ + if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) { + if (mtip_hba_reset(dd) < 0) { + dev_err(&dd->pdev->dev, + "Card did not reset within timeout\n"); + rv = -EIO; + goto out2; + } + } else { + /* Clear any pending interrupts on the HBA */ + writel(readl(dd->mmio + HOST_IRQ_STAT), + dd->mmio + HOST_IRQ_STAT); + } + + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Setup the ISR and enable interrupts. */ + rv = devm_request_irq(&dd->pdev->dev, + dd->pdev->irq, + mtip_irq_handler, + IRQF_SHARED, + dev_driver_string(&dd->pdev->dev), + dd); + + if (rv) { + dev_err(&dd->pdev->dev, + "Unable to allocate IRQ %d\n", dd->pdev->irq); + goto out2; + } + + /* Enable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + init_timer(&dd->port->cmd_timer); + init_waitqueue_head(&dd->port->svc_wait); + + dd->port->cmd_timer.data = (unsigned long int) dd->port; + dd->port->cmd_timer.function = mtip_timeout_function; + mod_timer(&dd->port->cmd_timer, + jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); + + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { + rv = -EFAULT; + goto out3; + } + + if (mtip_get_identify(dd->port, NULL) < 0) { + rv = -EFAULT; + goto out3; + } + + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); + return MTIP_FTL_REBUILD_MAGIC; + } + mtip_dump_identify(dd->port); + + /* check write protect, over temp and rebuild statuses */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + /* TODO */ + } + } + + /* get write protect progess */ + memset(&attr242, 0, sizeof(struct smart_attr)); + if (mtip_get_smart_attr(dd->port, 242, &attr242)) + dev_warn(&dd->pdev->dev, + "Unable to check write protect progress\n"); + else + dev_info(&dd->pdev->dev, + "Write protect progress: %u%% (%u blocks)\n", + attr242.cur, le32_to_cpu(attr242.data)); + return rv; + +out3: + del_timer_sync(&dd->port->cmd_timer); + + /* Disable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + /*Release the IRQ. */ + devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + +out2: + mtip_deinit_port(dd->port); + + /* Free the command/command header memory. */ + dmam_free_coherent(&dd->pdev->dev, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), + dd->port->command_list, + dd->port->command_list_dma); +out1: + /* Free the memory allocated for the for structure. */ + kfree(dd->port); + + return rv; +} + +/* + * Called to deinitialize an interface. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_hw_exit(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { + + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) + if (mtip_standby_immediate(dd->port)) + dev_warn(&dd->pdev->dev, + "STANDBY IMMEDIATE failed\n"); + + /* de-initialize the port. */ + mtip_deinit_port(dd->port); + + /* Disable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + } + + del_timer_sync(&dd->port->cmd_timer); + + /* Release the IRQ. */ + devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + + /* Stop the bottom half tasklet. */ + tasklet_kill(&dd->tasklet); + + /* Free the command/command header memory. */ + dmam_free_coherent(&dd->pdev->dev, + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), + dd->port->command_list, + dd->port->command_list_dma); + /* Free the memory allocated for the for structure. */ + kfree(dd->port); + + return 0; +} + +/* + * Issue a Standby Immediate command to the device. + * + * This function is called by the Block Layer just before the + * system powers off during a shutdown. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_hw_shutdown(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + mtip_standby_immediate(dd->port); + + return 0; +} + +/* + * Suspend function + * + * This function is called by the Block Layer just before the + * system hibernates. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 Suspend was successful + * -EFAULT Suspend was not successful + */ +static int mtip_hw_suspend(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive + * so that it saves its state. + */ + if (mtip_standby_immediate(dd->port) != 0) { + dev_err(&dd->pdev->dev, + "Failed standby-immediate command\n"); + return -EFAULT; + } + + /* Disable interrupts on the HBA.*/ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + mtip_deinit_port(dd->port); + + return 0; +} + +/* + * Resume function + * + * This function is called by the Block Layer as the + * system resumes. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 Resume was successful + * -EFAULT Resume was not successful + */ +static int mtip_hw_resume(struct driver_data *dd) +{ + /* Perform any needed hardware setup steps */ + hba_setup(dd); + + /* Reset the HBA */ + if (mtip_hba_reset(dd) != 0) { + dev_err(&dd->pdev->dev, + "Unable to reset the HBA\n"); + return -EFAULT; + } + + /* + * Enable the port, DMA engine, and FIS reception specific + * h/w in controller. + */ + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Enable interrupts on the HBA.*/ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + return 0; +} + +/* + * Helper function for reusing disk name + * upon hot insertion. + */ +static int rssd_disk_name_format(char *prefix, + int index, + char *buf, + int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + +/* + * Block layer IOCTL handler. + * + * @dev Pointer to the block_device structure. + * @mode ignored + * @cmd IOCTL command passed from the user application. + * @arg Argument passed from the user application. + * + * return value + * 0 IOCTL completed successfully. + * -ENOTTY IOCTL not supported or invalid driver data + * structure pointer. + */ +static int mtip_block_ioctl(struct block_device *dev, + fmode_t mode, + unsigned cmd, + unsigned long arg) +{ + struct driver_data *dd = dev->bd_disk->private_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!dd) + return -ENOTTY; + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) + return -ENOTTY; + + switch (cmd) { + case BLKFLSBUF: + return -ENOTTY; + default: + return mtip_hw_ioctl(dd, cmd, arg); + } +} + +#ifdef CONFIG_COMPAT +/* + * Block layer compat IOCTL handler. + * + * @dev Pointer to the block_device structure. + * @mode ignored + * @cmd IOCTL command passed from the user application. + * @arg Argument passed from the user application. + * + * return value + * 0 IOCTL completed successfully. + * -ENOTTY IOCTL not supported or invalid driver data + * structure pointer. + */ +static int mtip_block_compat_ioctl(struct block_device *dev, + fmode_t mode, + unsigned cmd, + unsigned long arg) +{ + struct driver_data *dd = dev->bd_disk->private_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!dd) + return -ENOTTY; + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) + return -ENOTTY; + + switch (cmd) { + case BLKFLSBUF: + return -ENOTTY; + case HDIO_DRIVE_TASKFILE: { + struct mtip_compat_ide_task_request_s __user *compat_req_task; + ide_task_request_t req_task; + int compat_tasksize, outtotal, ret; + + compat_tasksize = + sizeof(struct mtip_compat_ide_task_request_s); + + compat_req_task = + (struct mtip_compat_ide_task_request_s __user *) arg; + + if (copy_from_user(&req_task, (void __user *) arg, + compat_tasksize - (2 * sizeof(compat_long_t)))) + return -EFAULT; + + if (get_user(req_task.out_size, &compat_req_task->out_size)) + return -EFAULT; + + if (get_user(req_task.in_size, &compat_req_task->in_size)) + return -EFAULT; + + outtotal = sizeof(struct mtip_compat_ide_task_request_s); + + ret = exec_drive_taskfile(dd, (void __user *) arg, + &req_task, outtotal); + + if (copy_to_user((void __user *) arg, &req_task, + compat_tasksize - + (2 * sizeof(compat_long_t)))) + return -EFAULT; + + if (put_user(req_task.out_size, &compat_req_task->out_size)) + return -EFAULT; + + if (put_user(req_task.in_size, &compat_req_task->in_size)) + return -EFAULT; + + return ret; + } + default: + return mtip_hw_ioctl(dd, cmd, arg); + } +} +#endif + +/* + * Obtain the geometry of the device. + * + * You may think that this function is obsolete, but some applications, + * fdisk for example still used CHS values. This function describes the + * device as having 224 heads and 56 sectors per cylinder. These values are + * chosen so that each cylinder is aligned on a 4KB boundary. Since a + * partition is described in terms of a start and end cylinder this means + * that each partition is also 4KB aligned. Non-aligned partitions adversely + * affects performance. + * + * @dev Pointer to the block_device strucutre. + * @geo Pointer to a hd_geometry structure. + * + * return value + * 0 Operation completed successfully. + * -ENOTTY An error occurred while reading the drive capacity. + */ +static int mtip_block_getgeo(struct block_device *dev, + struct hd_geometry *geo) +{ + struct driver_data *dd = dev->bd_disk->private_data; + sector_t capacity; + + if (!dd) + return -ENOTTY; + + if (!(mtip_hw_get_capacity(dd, &capacity))) { + dev_warn(&dd->pdev->dev, + "Could not get drive capacity.\n"); + return -ENOTTY; + } + + geo->heads = 224; + geo->sectors = 56; + sector_div(capacity, (geo->heads * geo->sectors)); + geo->cylinders = capacity; + return 0; +} + +/* + * Block device operation function. + * + * This structure contains pointers to the functions required by the block + * layer. + */ +static const struct block_device_operations mtip_block_ops = { + .ioctl = mtip_block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = mtip_block_compat_ioctl, +#endif + .getgeo = mtip_block_getgeo, + .owner = THIS_MODULE +}; + +/* + * Block layer make request function. + * + * This function is called by the kernel to process a BIO for + * the P320 device. + * + * @queue Pointer to the request queue. Unused other than to obtain + * the driver data structure. + * @bio Pointer to the BIO. + * + */ +static void mtip_make_request(struct request_queue *queue, struct bio *bio) +{ + struct driver_data *dd = queue->queuedata; + struct scatterlist *sg; + struct bio_vec *bvec; + int nents = 0; + int tag = 0; + + if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) { + bio_endio(bio, -ENXIO); + return; + } + if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { + bio_endio(bio, -ENODATA); + return; + } + if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, + &dd->dd_flag) && + bio_data_dir(bio))) { + bio_endio(bio, -ENODATA); + return; + } + if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) { + bio_endio(bio, -ENODATA); + return; + } + } + + if (unlikely(!bio_has_data(bio))) { + blk_queue_flush(queue, 0); + bio_endio(bio, 0); + return; + } + + sg = mtip_hw_get_scatterlist(dd, &tag); + if (likely(sg != NULL)) { + blk_queue_bounce(queue, &bio); + + if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { + dev_warn(&dd->pdev->dev, + "Maximum number of SGL entries exceeded\n"); + bio_io_error(bio); + mtip_hw_release_scatterlist(dd, tag); + return; + } + + /* Create the scatter list for this bio. */ + bio_for_each_segment(bvec, bio, nents) { + sg_set_page(&sg[nents], + bvec->bv_page, + bvec->bv_len, + bvec->bv_offset); + } + + /* Issue the read/write. */ + mtip_hw_submit_io(dd, + bio->bi_sector, + bio_sectors(bio), + nents, + tag, + bio_endio, + bio, + bio_data_dir(bio)); + } else + bio_io_error(bio); +} + +/* + * Block layer initialization function. + * + * This function is called once by the PCI layer for each P320 + * device that is connected to the system. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 on success else an error code. + */ +static int mtip_block_initialize(struct driver_data *dd) +{ + int rv = 0, wait_for_rebuild = 0; + sector_t capacity; + unsigned int index = 0; + struct kobject *kobj; + unsigned char thd_name[16]; + + if (dd->disk) + goto skip_create_disk; /* hw init done, before rebuild */ + + /* Initialize the protocol layer. */ + wait_for_rebuild = mtip_hw_init(dd); + if (wait_for_rebuild < 0) { + dev_err(&dd->pdev->dev, + "Protocol layer initialization failed\n"); + rv = -EINVAL; + goto protocol_init_error; + } + + dd->disk = alloc_disk(MTIP_MAX_MINORS); + if (dd->disk == NULL) { + dev_err(&dd->pdev->dev, + "Unable to allocate gendisk structure\n"); + rv = -EINVAL; + goto alloc_disk_error; + } + + /* Generate the disk name, implemented same as in sd.c */ + do { + if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL)) + goto ida_get_error; + + spin_lock(&rssd_index_lock); + rv = ida_get_new(&rssd_index_ida, &index); + spin_unlock(&rssd_index_lock); + } while (rv == -EAGAIN); + + if (rv) + goto ida_get_error; + + rv = rssd_disk_name_format("rssd", + index, + dd->disk->disk_name, + DISK_NAME_LEN); + if (rv) + goto disk_index_error; + + dd->disk->driverfs_dev = &dd->pdev->dev; + dd->disk->major = dd->major; + dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS; + dd->disk->fops = &mtip_block_ops; + dd->disk->private_data = dd; + dd->index = index; + + /* + * if rebuild pending, start the service thread, and delay the block + * queue creation and add_disk() + */ + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + goto start_service_thread; + +skip_create_disk: + /* Allocate the request queue. */ + dd->queue = blk_alloc_queue(GFP_KERNEL); + if (dd->queue == NULL) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } + + /* Attach our request function to the request queue. */ + blk_queue_make_request(dd->queue, mtip_make_request); + + dd->disk->queue = dd->queue; + dd->queue->queuedata = dd; + + /* Set device limits. */ + set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); + blk_queue_max_segments(dd->queue, MTIP_MAX_SG); + blk_queue_physical_block_size(dd->queue, 4096); + blk_queue_max_hw_sectors(dd->queue, 0xffff); + blk_queue_max_segment_size(dd->queue, 0x400000); + blk_queue_io_min(dd->queue, 4096); + + /* + * write back cache is not supported in the device. FUA depends on + * write back cache support, hence setting flush support to zero. + */ + blk_queue_flush(dd->queue, 0); + + /* Set the capacity of the device in 512 byte sectors. */ + if (!(mtip_hw_get_capacity(dd, &capacity))) { + dev_warn(&dd->pdev->dev, + "Could not read drive capacity\n"); + rv = -EIO; + goto read_capacity_error; + } + set_capacity(dd->disk, capacity); + + /* Enable the block device and add it to /dev */ + add_disk(dd->disk); + + /* + * Now that the disk is active, initialize any sysfs attributes + * managed by the protocol layer. + */ + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_init(dd, kobj); + kobject_put(kobj); + } + mtip_hw_debugfs_init(dd); + + if (dd->mtip_svc_handler) { + set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); + return rv; /* service thread created for handling rebuild */ + } + +start_service_thread: + sprintf(thd_name, "mtip_svc_thd_%02d", index); + + dd->mtip_svc_handler = kthread_run(mtip_service_thread, + dd, thd_name); + + if (IS_ERR(dd->mtip_svc_handler)) { + dev_err(&dd->pdev->dev, "service thread failed to start\n"); + dd->mtip_svc_handler = NULL; + rv = -EFAULT; + goto kthread_run_error; + } + + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + rv = wait_for_rebuild; + + return rv; + +kthread_run_error: + mtip_hw_debugfs_exit(dd); + + /* Delete our gendisk. This also removes the device from /dev */ + del_gendisk(dd->disk); + +read_capacity_error: + blk_cleanup_queue(dd->queue); + +block_queue_alloc_init_error: +disk_index_error: + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, index); + spin_unlock(&rssd_index_lock); + +ida_get_error: + put_disk(dd->disk); + +alloc_disk_error: + mtip_hw_exit(dd); /* De-initialize the protocol layer. */ + +protocol_init_error: + return rv; +} + +/* + * Block layer deinitialization function. + * + * Called by the PCI layer as each P320 device is removed. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_block_remove(struct driver_data *dd) +{ + struct kobject *kobj; + + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } + + /* Clean up the sysfs attributes, if created */ + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } + } + mtip_hw_debugfs_exit(dd); + + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + del_gendisk(dd->disk); + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + + blk_cleanup_queue(dd->queue); + dd->disk = NULL; + dd->queue = NULL; + + /* De-initialize the protocol layer. */ + mtip_hw_exit(dd); + + return 0; +} + +/* + * Function called by the PCI layer when just before the + * machine shuts down. + * + * If a protocol layer shutdown function is present it will be called + * by this function. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_block_shutdown(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); + + /* Delete our gendisk structure, and cleanup the blk queue. */ + del_gendisk(dd->disk); + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + + blk_cleanup_queue(dd->queue); + dd->disk = NULL; + dd->queue = NULL; + + mtip_hw_shutdown(dd); + return 0; +} + +static int mtip_block_suspend(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, + "Suspending %s ...\n", dd->disk->disk_name); + mtip_hw_suspend(dd); + return 0; +} + +static int mtip_block_resume(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, "Resuming %s ...\n", + dd->disk->disk_name); + mtip_hw_resume(dd); + return 0; +} + +/* + * Called for each supported PCI device detected. + * + * This function allocates the private data structure, enables the + * PCI device and then calls the block layer initialization function. + * + * return value + * 0 on success else an error code. + */ +static int mtip_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int rv = 0; + struct driver_data *dd = NULL; + + /* Allocate memory for this devices private data. */ + dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL); + if (dd == NULL) { + dev_err(&pdev->dev, + "Unable to allocate memory for driver data\n"); + return -ENOMEM; + } + + /* Attach the private data to this PCI device. */ + pci_set_drvdata(pdev, dd); + + rv = pcim_enable_device(pdev); + if (rv < 0) { + dev_err(&pdev->dev, "Unable to enable device\n"); + goto iomap_err; + } + + /* Map BAR5 to memory. */ + rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME); + if (rv < 0) { + dev_err(&pdev->dev, "Unable to map regions\n"); + goto iomap_err; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + + if (rv) { + rv = pci_set_consistent_dma_mask(pdev, + DMA_BIT_MASK(32)); + if (rv) { + dev_warn(&pdev->dev, + "64-bit DMA enable failed\n"); + goto setmask_err; + } + } + } + + pci_set_master(pdev); + + if (pci_enable_msi(pdev)) { + dev_warn(&pdev->dev, + "Unable to enable MSI interrupt.\n"); + goto block_initialize_err; + } + + /* Copy the info we may need later into the private data structure. */ + dd->major = mtip_major; + dd->instance = instance; + dd->pdev = pdev; + + /* Initialize the block layer. */ + rv = mtip_block_initialize(dd); + if (rv < 0) { + dev_err(&pdev->dev, + "Unable to initialize block layer\n"); + goto block_initialize_err; + } + + /* + * Increment the instance count so that each device has a unique + * instance number. + */ + instance++; + if (rv != MTIP_FTL_REBUILD_MAGIC) + set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); + goto done; + +block_initialize_err: + pci_disable_msi(pdev); + +setmask_err: + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); + +iomap_err: + kfree(dd); + pci_set_drvdata(pdev, NULL); + return rv; +done: + return rv; +} + +/* + * Called for each probed device when the device is removed or the + * driver is unloaded. + * + * return value + * None + */ +static void mtip_pci_remove(struct pci_dev *pdev) +{ + struct driver_data *dd = pci_get_drvdata(pdev); + int counter = 0; + + set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); + + if (mtip_check_surprise_removal(pdev)) { + while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { + counter++; + msleep(20); + if (counter == 10) { + /* Cleanup the outstanding commands */ + mtip_command_cleanup(dd); + break; + } + } + } + + /* Clean up the block layer. */ + mtip_block_remove(dd); + + pci_disable_msi(pdev); + + kfree(dd); + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); +} + +/* + * Called for each probed device when the device is suspended. + * + * return value + * 0 Success + * <0 Error + */ +static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) +{ + int rv = 0; + struct driver_data *dd = pci_get_drvdata(pdev); + + if (!dd) { + dev_err(&pdev->dev, + "Driver private datastructure is NULL\n"); + return -EFAULT; + } + + set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); + + /* Disable ports & interrupts then send standby immediate */ + rv = mtip_block_suspend(dd); + if (rv < 0) { + dev_err(&pdev->dev, + "Failed to suspend controller\n"); + return rv; + } + + /* + * Save the pci config space to pdev structure & + * disable the device + */ + pci_save_state(pdev); + pci_disable_device(pdev); + + /* Move to Low power state*/ + pci_set_power_state(pdev, PCI_D3hot); + + return rv; +} + +/* + * Called for each probed device when the device is resumed. + * + * return value + * 0 Success + * <0 Error + */ +static int mtip_pci_resume(struct pci_dev *pdev) +{ + int rv = 0; + struct driver_data *dd; + + dd = pci_get_drvdata(pdev); + if (!dd) { + dev_err(&pdev->dev, + "Driver private datastructure is NULL\n"); + return -EFAULT; + } + + /* Move the device to active State */ + pci_set_power_state(pdev, PCI_D0); + + /* Restore PCI configuration space */ + pci_restore_state(pdev); + + /* Enable the PCI device*/ + rv = pcim_enable_device(pdev); + if (rv < 0) { + dev_err(&pdev->dev, + "Failed to enable card during resume\n"); + goto err; + } + pci_set_master(pdev); + + /* + * Calls hbaReset, initPort, & startPort function + * then enables interrupts + */ + rv = mtip_block_resume(dd); + if (rv < 0) + dev_err(&pdev->dev, "Unable to resume\n"); + +err: + clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); + + return rv; +} + +/* + * Shutdown routine + * + * return value + * None + */ +static void mtip_pci_shutdown(struct pci_dev *pdev) +{ + struct driver_data *dd = pci_get_drvdata(pdev); + if (dd) + mtip_block_shutdown(dd); +} + +/* Table of device ids supported by this driver. */ +static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = { + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) }, + { 0 } +}; + +/* Structure that describes the PCI driver functions. */ +static struct pci_driver mtip_pci_driver = { + .name = MTIP_DRV_NAME, + .id_table = mtip_pci_tbl, + .probe = mtip_pci_probe, + .remove = mtip_pci_remove, + .suspend = mtip_pci_suspend, + .resume = mtip_pci_resume, + .shutdown = mtip_pci_shutdown, +}; + +MODULE_DEVICE_TABLE(pci, mtip_pci_tbl); + +/* + * Module initialization function. + * + * Called once when the module is loaded. This function allocates a major + * block device number to the Cyclone devices and registers the PCI layer + * of the driver. + * + * Return value + * 0 on success else error code. + */ +static int __init mtip_init(void) +{ + int error; + + pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); + + /* Allocate a major block device number to use with this driver. */ + error = register_blkdev(0, MTIP_DRV_NAME); + if (error <= 0) { + pr_err("Unable to register block device (%d)\n", + error); + return -EBUSY; + } + mtip_major = error; + + if (!dfs_parent) { + dfs_parent = debugfs_create_dir("rssd", NULL); + if (IS_ERR_OR_NULL(dfs_parent)) { + pr_warn("Error creating debugfs parent\n"); + dfs_parent = NULL; + } + } + + /* Register our PCI operations. */ + error = pci_register_driver(&mtip_pci_driver); + if (error) { + debugfs_remove(dfs_parent); + unregister_blkdev(mtip_major, MTIP_DRV_NAME); + } + + return error; +} + +/* + * Module de-initialization function. + * + * Called once when the module is unloaded. This function deallocates + * the major block device number allocated by mtip_init() and + * unregisters the PCI layer of the driver. + * + * Return value + * none + */ +static void __exit mtip_exit(void) +{ + debugfs_remove_recursive(dfs_parent); + + /* Release the allocated major block device number. */ + unregister_blkdev(mtip_major, MTIP_DRV_NAME); + + /* Unregister the PCI driver. */ + pci_unregister_driver(&mtip_pci_driver); +} + +MODULE_AUTHOR("Micron Technology, Inc"); +MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(MTIP_DRV_VERSION); + +module_init(mtip_init); +module_exit(mtip_exit); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h new file mode 100644 index 000000000000..18627a1d04c5 --- /dev/null +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -0,0 +1,463 @@ +/* + * mtip32xx.h - Header file for the P320 SSD Block Driver + * Copyright (C) 2011 Micron Technology, Inc. + * + * Portions of this code were derived from works subjected to the + * following copyright: + * Copyright (C) 2009 Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __MTIP32XX_H__ +#define __MTIP32XX_H__ + +#include <linux/spinlock.h> +#include <linux/rwsem.h> +#include <linux/ata.h> +#include <linux/interrupt.h> +#include <linux/genhd.h> + +/* Offset of Subsystem Device ID in pci confoguration space */ +#define PCI_SUBSYSTEM_DEVICEID 0x2E + +/* offset of Device Control register in PCIe extended capabilites space */ +#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48 + +/* # of times to retry timed out/failed IOs */ +#define MTIP_MAX_RETRIES 2 + +/* Various timeout values in ms */ +#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 +#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000 +#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000 + +/* check for timeouts every 500ms */ +#define MTIP_TIMEOUT_CHECK_PERIOD 500 + +/* ftl rebuild */ +#define MTIP_FTL_REBUILD_OFFSET 142 +#define MTIP_FTL_REBUILD_MAGIC 0xED51 +#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 + +/* Macro to extract the tag bit number from a tag value. */ +#define MTIP_TAG_BIT(tag) (tag & 0x1F) + +/* + * Macro to extract the tag index from a tag value. The index + * is used to access the correct s_active/Command Issue register based + * on the tag value. + */ +#define MTIP_TAG_INDEX(tag) (tag >> 5) + +/* + * Maximum number of scatter gather entries + * a single command may have. + */ +#define MTIP_MAX_SG 128 + +/* + * Maximum number of slot groups (Command Issue & s_active registers) + * NOTE: This is the driver maximum; check dd->slot_groups for actual value. + */ +#define MTIP_MAX_SLOT_GROUPS 8 + +/* Internal command tag. */ +#define MTIP_TAG_INTERNAL 0 + +/* Micron Vendor ID & P320x SSD Device ID */ +#define PCI_VENDOR_ID_MICRON 0x1344 +#define P320H_DEVICE_ID 0x5150 +#define P320M_DEVICE_ID 0x5151 +#define P320S_DEVICE_ID 0x5152 +#define P325M_DEVICE_ID 0x5153 +#define P420H_DEVICE_ID 0x5160 +#define P420M_DEVICE_ID 0x5161 +#define P425M_DEVICE_ID 0x5163 + +/* Driver name and version strings */ +#define MTIP_DRV_NAME "mtip32xx" +#define MTIP_DRV_VERSION "1.2.6os3" + +/* Maximum number of minor device numbers per device. */ +#define MTIP_MAX_MINORS 16 + +/* Maximum number of supported command slots. */ +#define MTIP_MAX_COMMAND_SLOTS (MTIP_MAX_SLOT_GROUPS * 32) + +/* + * Per-tag bitfield size in longs. + * Linux bit manipulation functions + * (i.e. test_and_set_bit, find_next_zero_bit) + * manipulate memory in longs, so we try to make the math work. + * take the slot groups and find the number of longs, rounding up. + * Careful! i386 and x86_64 use different size longs! + */ +#define U32_PER_LONG (sizeof(long) / sizeof(u32)) +#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \ + (U32_PER_LONG-1))/U32_PER_LONG) + +/* BAR number used to access the HBA registers. */ +#define MTIP_ABAR 5 + +#ifdef DEBUG + #define dbg_printk(format, arg...) \ + printk(pr_fmt(format), ##arg); +#else + #define dbg_printk(format, arg...) +#endif + +#define MTIP_DFS_MAX_BUF_SIZE 1024 + +#define __force_bit2int (unsigned int __force) + +enum { + /* below are bit numbers in 'flags' defined in mtip_port */ + MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */ + MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */ + MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */ + MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */ + MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | \ + (1 << MTIP_PF_EH_ACTIVE_BIT) | \ + (1 << MTIP_PF_SE_ACTIVE_BIT) | \ + (1 << MTIP_PF_DM_ACTIVE_BIT)), + + MTIP_PF_SVC_THD_ACTIVE_BIT = 4, + MTIP_PF_ISSUE_CMDS_BIT = 5, + MTIP_PF_REBUILD_BIT = 6, + MTIP_PF_SVC_THD_STOP_BIT = 8, + + /* below are bit numbers in 'dd_flag' defined in driver_data */ + MTIP_DDF_SEC_LOCK_BIT = 0, + MTIP_DDF_REMOVE_PENDING_BIT = 1, + MTIP_DDF_OVER_TEMP_BIT = 2, + MTIP_DDF_WRITE_PROTECT_BIT = 3, + MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \ + (1 << MTIP_DDF_SEC_LOCK_BIT) | \ + (1 << MTIP_DDF_OVER_TEMP_BIT) | \ + (1 << MTIP_DDF_WRITE_PROTECT_BIT)), + + MTIP_DDF_CLEANUP_BIT = 5, + MTIP_DDF_RESUME_BIT = 6, + MTIP_DDF_INIT_DONE_BIT = 7, + MTIP_DDF_REBUILD_FAILED_BIT = 8, +}; + +__packed struct smart_attr{ + u8 attr_id; + u16 flags; + u8 cur; + u8 worst; + u32 data; + u8 res[3]; +}; + +/* Register Frame Information Structure (FIS), host to device. */ +struct host_to_dev_fis { + /* + * FIS type. + * - 27h Register FIS, host to device. + * - 34h Register FIS, device to host. + * - 39h DMA Activate FIS, device to host. + * - 41h DMA Setup FIS, bi-directional. + * - 46h Data FIS, bi-directional. + * - 58h BIST Activate FIS, bi-directional. + * - 5Fh PIO Setup FIS, device to host. + * - A1h Set Device Bits FIS, device to host. + */ + unsigned char type; + unsigned char opts; + unsigned char command; + unsigned char features; + + union { + unsigned char lba_low; + unsigned char sector; + }; + union { + unsigned char lba_mid; + unsigned char cyl_low; + }; + union { + unsigned char lba_hi; + unsigned char cyl_hi; + }; + union { + unsigned char device; + unsigned char head; + }; + + union { + unsigned char lba_low_ex; + unsigned char sector_ex; + }; + union { + unsigned char lba_mid_ex; + unsigned char cyl_low_ex; + }; + union { + unsigned char lba_hi_ex; + unsigned char cyl_hi_ex; + }; + unsigned char features_ex; + + unsigned char sect_count; + unsigned char sect_cnt_ex; + unsigned char res2; + unsigned char control; + + unsigned int res3; +}; + +/* Command header structure. */ +struct mtip_cmd_hdr { + /* + * Command options. + * - Bits 31:16 Number of PRD entries. + * - Bits 15:8 Unused in this implementation. + * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries. + * - Bit 6 Write bit, should be set when writing data to the device. + * - Bit 5 Unused in this implementation. + * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes). + */ + unsigned int opts; + /* This field is unsed when using NCQ. */ + union { + unsigned int byte_count; + unsigned int status; + }; + /* + * Lower 32 bits of the command table address associated with this + * header. The command table addresses must be 128 byte aligned. + */ + unsigned int ctba; + /* + * If 64 bit addressing is used this field is the upper 32 bits + * of the command table address associated with this command. + */ + unsigned int ctbau; + /* Reserved and unused. */ + unsigned int res[4]; +}; + +/* Command scatter gather structure (PRD). */ +struct mtip_cmd_sg { + /* + * Low 32 bits of the data buffer address. For P320 this + * address must be 8 byte aligned signified by bits 2:0 being + * set to 0. + */ + unsigned int dba; + /* + * When 64 bit addressing is used this field is the upper + * 32 bits of the data buffer address. + */ + unsigned int dba_upper; + /* Unused. */ + unsigned int reserved; + /* + * Bit 31: interrupt when this data block has been transferred. + * Bits 30..22: reserved + * Bits 21..0: byte count (minus 1). For P320 the byte count must be + * 8 byte aligned signified by bits 2:0 being set to 1. + */ + unsigned int info; +}; +struct mtip_port; + +/* Structure used to describe a command. */ +struct mtip_cmd { + + struct mtip_cmd_hdr *command_header; /* ptr to command header entry */ + + dma_addr_t command_header_dma; /* corresponding physical address */ + + void *command; /* ptr to command table entry */ + + dma_addr_t command_dma; /* corresponding physical address */ + + void *comp_data; /* data passed to completion function comp_func() */ + /* + * Completion function called by the ISR upon completion of + * a command. + */ + void (*comp_func)(struct mtip_port *port, + int tag, + void *data, + int status); + /* Additional callback function that may be called by comp_func() */ + void (*async_callback)(void *data, int status); + + void *async_data; /* Addl. data passed to async_callback() */ + + int scatter_ents; /* Number of scatter list entries used */ + + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + + int retries; /* The number of retries left for this command. */ + + int direction; /* Data transfer direction */ + + unsigned long comp_time; /* command completion time, in jiffies */ + + atomic_t active; /* declares if this command sent to the drive. */ +}; + +/* Structure used to describe a port. */ +struct mtip_port { + /* Pointer back to the driver data for this port. */ + struct driver_data *dd; + /* + * Used to determine if the data pointed to by the + * identify field is valid. + */ + unsigned long identify_valid; + /* Base address of the memory mapped IO for the port. */ + void __iomem *mmio; + /* Array of pointers to the memory mapped s_active registers. */ + void __iomem *s_active[MTIP_MAX_SLOT_GROUPS]; + /* Array of pointers to the memory mapped completed registers. */ + void __iomem *completed[MTIP_MAX_SLOT_GROUPS]; + /* Array of pointers to the memory mapped Command Issue registers. */ + void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS]; + /* + * Pointer to the beginning of the command header memory as used + * by the driver. + */ + void *command_list; + /* + * Pointer to the beginning of the command header memory as used + * by the DMA. + */ + dma_addr_t command_list_dma; + /* + * Pointer to the beginning of the RX FIS memory as used + * by the driver. + */ + void *rxfis; + /* + * Pointer to the beginning of the RX FIS memory as used + * by the DMA. + */ + dma_addr_t rxfis_dma; + /* + * Pointer to the beginning of the command table memory as used + * by the driver. + */ + void *command_table; + /* + * Pointer to the beginning of the command table memory as used + * by the DMA. + */ + dma_addr_t command_tbl_dma; + /* + * Pointer to the beginning of the identify data memory as used + * by the driver. + */ + u16 *identify; + /* + * Pointer to the beginning of the identify data memory as used + * by the DMA. + */ + dma_addr_t identify_dma; + /* + * Pointer to the beginning of a sector buffer that is used + * by the driver when issuing internal commands. + */ + u16 *sector_buffer; + /* + * Pointer to the beginning of a sector buffer that is used + * by the DMA when the driver issues internal commands. + */ + dma_addr_t sector_buffer_dma; + /* + * Bit significant, used to determine if a command slot has + * been allocated. i.e. the slot is in use. Bits are cleared + * when the command slot and all associated data structures + * are no longer needed. + */ + u16 *log_buf; + dma_addr_t log_buf_dma; + + u8 *smart_buf; + dma_addr_t smart_buf_dma; + + unsigned long allocated[SLOTBITS_IN_LONGS]; + /* + * used to queue commands when an internal command is in progress + * or error handling is active + */ + unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; + /* + * Array of command slots. Structure includes pointers to the + * command header and command table, and completion function and data + * pointers. + */ + struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS]; + /* Used by mtip_service_thread to wait for an event */ + wait_queue_head_t svc_wait; + /* + * indicates the state of the port. Also, helps the service thread + * to determine its action on wake up. + */ + unsigned long flags; + /* + * Timer used to complete commands that have been active for too long. + */ + struct timer_list cmd_timer; + unsigned long ic_pause_timer; + /* + * Semaphore used to block threads if there are no + * command slots available. + */ + struct semaphore cmd_slot; + /* Spinlock for working around command-issue bug. */ + spinlock_t cmd_issue_lock; +}; + +/* + * Driver private data structure. + * + * One structure is allocated per probed device. + */ +struct driver_data { + void __iomem *mmio; /* Base address of the HBA registers. */ + + int major; /* Major device number. */ + + int instance; /* Instance number. First device probed is 0, ... */ + + struct gendisk *disk; /* Pointer to our gendisk structure. */ + + struct pci_dev *pdev; /* Pointer to the PCI device structure. */ + + struct request_queue *queue; /* Our request queue. */ + + struct mtip_port *port; /* Pointer to the port data structure. */ + + /* Tasklet used to process the bottom half of the ISR. */ + struct tasklet_struct tasklet; + + unsigned product_type; /* magic value declaring the product type */ + + unsigned slot_groups; /* number of slot groups the product supports */ + + unsigned long index; /* Index to determine the disk name */ + + unsigned long dd_flag; /* NOTE: use atomic bit operations on this */ + + struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ + + struct dentry *dfs_node; +}; + +#endif diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f533f3375e24..0c03411c59eb 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -34,12 +34,11 @@ #include <linux/kthread.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <asm/types.h> #include <linux/nbd.h> -#define LO_MAGIC 0x68797548 +#define NBD_MAGIC 0x68797548 #ifdef NDEBUG #define dprintk(flags, fmt...) @@ -116,7 +115,7 @@ static void nbd_end_request(struct request *req) spin_unlock_irqrestore(q->queue_lock, flags); } -static void sock_shutdown(struct nbd_device *lo, int lock) +static void sock_shutdown(struct nbd_device *nbd, int lock) { /* Forcibly shutdown the socket causing all listeners * to error @@ -125,15 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock) * there should be a more generic interface rather than * calling socket ops directly here */ if (lock) - mutex_lock(&lo->tx_lock); - if (lo->sock) { - printk(KERN_WARNING "%s: shutting down socket\n", - lo->disk->disk_name); - kernel_sock_shutdown(lo->sock, SHUT_RDWR); - lo->sock = NULL; + mutex_lock(&nbd->tx_lock); + if (nbd->sock) { + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; } if (lock) - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); } static void nbd_xmit_timeout(unsigned long arg) @@ -148,18 +146,20 @@ static void nbd_xmit_timeout(unsigned long arg) /* * Send or receive packet. */ -static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, +static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, int msg_flags) { - struct socket *sock = lo->sock; + struct socket *sock = nbd->sock; int result; struct msghdr msg; struct kvec iov; sigset_t blocked, oldset; + unsigned long pflags = current->flags; if (unlikely(!sock)) { - printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n", - lo->disk->disk_name, (send ? "send" : "recv")); + dev_err(disk_to_dev(nbd->disk), + "Attempted %s on closed socket in sock_xmit\n", + (send ? "send" : "recv")); return -EINVAL; } @@ -168,8 +168,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, siginitsetinv(&blocked, sigmask(SIGKILL)); sigprocmask(SIG_SETMASK, &blocked, &oldset); + current->flags |= PF_MEMALLOC; do { - sock->sk->sk_allocation = GFP_NOIO; + sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; iov.iov_base = buf; iov.iov_len = size; msg.msg_name = NULL; @@ -181,15 +182,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, if (send) { struct timer_list ti; - if (lo->xmit_timeout) { + if (nbd->xmit_timeout) { init_timer(&ti); ti.function = nbd_xmit_timeout; ti.data = (unsigned long)current; - ti.expires = jiffies + lo->xmit_timeout; + ti.expires = jiffies + nbd->xmit_timeout; add_timer(&ti); } result = kernel_sendmsg(sock, &msg, &iov, 1, size); - if (lo->xmit_timeout) + if (nbd->xmit_timeout) del_timer_sync(&ti); } else result = kernel_recvmsg(sock, &msg, &iov, 1, size, @@ -201,7 +202,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, task_pid_nr(current), current->comm, dequeue_signal_lock(current, ¤t->blocked, &info)); result = -EINTR; - sock_shutdown(lo, !send); + sock_shutdown(nbd, !send); break; } @@ -215,22 +216,24 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, } while (size > 0); sigprocmask(SIG_SETMASK, &oldset, NULL); + tsk_restore_flags(current, pflags, PF_MEMALLOC); return result; } -static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec, +static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, int flags) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags); + result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset, + bvec->bv_len, flags); kunmap(bvec->bv_page); return result; } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *lo, struct request *req) +static int nbd_send_req(struct nbd_device *nbd, struct request *req) { int result, flags; struct nbd_request request; @@ -243,15 +246,15 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) memcpy(request.handle, &req, sizeof(req)); dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", - lo->disk->disk_name, req, + nbd->disk->disk_name, req, nbdcmd_to_ascii(nbd_cmd(req)), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); - result = sock_xmit(lo, 1, &request, sizeof(request), + result = sock_xmit(nbd, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { - printk(KERN_ERR "%s: Send control failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(nbd->disk), + "Send control failed (result %d)\n", result); goto error_out; } @@ -267,11 +270,12 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) if (!rq_iter_last(req, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); - result = sock_send_bvec(lo, bvec, flags); + nbd->disk->disk_name, req, bvec->bv_len); + result = sock_send_bvec(nbd, bvec, flags); if (result <= 0) { - printk(KERN_ERR "%s: Send data failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(nbd->disk), + "Send data failed (result %d)\n", + result); goto error_out; } } @@ -282,25 +286,25 @@ error_out: return -EIO; } -static struct request *nbd_find_request(struct nbd_device *lo, +static struct request *nbd_find_request(struct nbd_device *nbd, struct request *xreq) { struct request *req, *tmp; int err; - err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq); + err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); if (unlikely(err)) goto out; - spin_lock(&lo->queue_lock); - list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) { + spin_lock(&nbd->queue_lock); + list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { if (req != xreq) continue; list_del_init(&req->queuelist); - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); return req; } - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); err = -ENOENT; @@ -308,79 +312,78 @@ out: return ERR_PTR(err); } -static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec) +static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len, + result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len, MSG_WAITALL); kunmap(bvec->bv_page); return result; } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *lo) +static struct request *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; struct request *req; reply.magic = 0; - result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL); + result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) { - printk(KERN_ERR "%s: Receive control failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(nbd->disk), + "Receive control failed (result %d)\n", result); goto harderror; } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { - printk(KERN_ERR "%s: Wrong magic (0x%lx)\n", - lo->disk->disk_name, + dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); result = -EPROTO; goto harderror; } - req = nbd_find_request(lo, *(struct request **)reply.handle); + req = nbd_find_request(nbd, *(struct request **)reply.handle); if (IS_ERR(req)) { result = PTR_ERR(req); if (result != -ENOENT) goto harderror; - printk(KERN_ERR "%s: Unexpected reply (%p)\n", - lo->disk->disk_name, reply.handle); + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", + reply.handle); result = -EBADR; goto harderror; } if (ntohl(reply.error)) { - printk(KERN_ERR "%s: Other side returned error (%d)\n", - lo->disk->disk_name, ntohl(reply.error)); + dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", + ntohl(reply.error)); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got reply\n", - lo->disk->disk_name, req); + nbd->disk->disk_name, req); if (nbd_cmd(req) == NBD_CMD_READ) { struct req_iterator iter; struct bio_vec *bvec; rq_for_each_segment(bvec, req, iter) { - result = sock_recv_bvec(lo, bvec); + result = sock_recv_bvec(nbd, bvec); if (result <= 0) { - printk(KERN_ERR "%s: Receive data failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", + result); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); + nbd->disk->disk_name, req, bvec->bv_len); } } return req; harderror: - lo->harderror = result; + nbd->harderror = result; return NULL; } @@ -398,48 +401,57 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_do_it(struct nbd_device *lo) +static int nbd_do_it(struct nbd_device *nbd) { struct request *req; int ret; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - lo->pid = current->pid; - ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr); + sk_set_memalloc(nbd->sock->sk); + nbd->pid = task_pid_nr(current); + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { - printk(KERN_ERR "nbd: sysfs_create_file failed!"); - lo->pid = 0; + dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); + nbd->pid = 0; return ret; } - while ((req = nbd_read_stat(lo)) != NULL) + while ((req = nbd_read_stat(nbd)) != NULL) nbd_end_request(req); - sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr); - lo->pid = 0; + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->pid = 0; return 0; } -static void nbd_clear_que(struct nbd_device *lo) +static void nbd_clear_que(struct nbd_device *nbd) { struct request *req; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* - * Because we have set lo->sock to NULL under the tx_lock, all + * Because we have set nbd->sock to NULL under the tx_lock, all * modifications to the list must have completed by now. For * the same reason, the active_req must be NULL. * * As a consequence, we don't need to take the spin lock while * purging the list here. */ - BUG_ON(lo->sock); - BUG_ON(lo->active_req); + BUG_ON(nbd->sock); + BUG_ON(nbd->active_req); - while (!list_empty(&lo->queue_head)) { - req = list_entry(lo->queue_head.next, struct request, + while (!list_empty(&nbd->queue_head)) { + req = list_entry(nbd->queue_head.next, struct request, + queuelist); + list_del_init(&req->queuelist); + req->errors++; + nbd_end_request(req); + } + + while (!list_empty(&nbd->waiting_queue)) { + req = list_entry(nbd->waiting_queue.next, struct request, queuelist); list_del_init(&req->queuelist); req->errors++; @@ -448,7 +460,7 @@ static void nbd_clear_que(struct nbd_device *lo) } -static void nbd_handle_req(struct nbd_device *lo, struct request *req) +static void nbd_handle_req(struct nbd_device *nbd, struct request *req) { if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -456,39 +468,38 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) nbd_cmd(req) = NBD_CMD_READ; if (rq_data_dir(req) == WRITE) { nbd_cmd(req) = NBD_CMD_WRITE; - if (lo->flags & NBD_READ_ONLY) { - printk(KERN_ERR "%s: Write on read-only\n", - lo->disk->disk_name); + if (nbd->flags & NBD_READ_ONLY) { + dev_err(disk_to_dev(nbd->disk), + "Write on read-only\n"); goto error_out; } } req->errors = 0; - mutex_lock(&lo->tx_lock); - if (unlikely(!lo->sock)) { - mutex_unlock(&lo->tx_lock); - printk(KERN_ERR "%s: Attempted send on closed socket\n", - lo->disk->disk_name); + mutex_lock(&nbd->tx_lock); + if (unlikely(!nbd->sock)) { + mutex_unlock(&nbd->tx_lock); + dev_err(disk_to_dev(nbd->disk), + "Attempted send on closed socket\n"); goto error_out; } - lo->active_req = req; + nbd->active_req = req; - if (nbd_send_req(lo, req) != 0) { - printk(KERN_ERR "%s: Request send failed\n", - lo->disk->disk_name); + if (nbd_send_req(nbd, req) != 0) { + dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; nbd_end_request(req); } else { - spin_lock(&lo->queue_lock); - list_add(&req->queuelist, &lo->queue_head); - spin_unlock(&lo->queue_lock); + spin_lock(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->queue_head); + spin_unlock(&nbd->queue_lock); } - lo->active_req = NULL; - mutex_unlock(&lo->tx_lock); - wake_up_all(&lo->active_wq); + nbd->active_req = NULL; + mutex_unlock(&nbd->tx_lock); + wake_up_all(&nbd->active_wq); return; @@ -499,28 +510,28 @@ error_out: static int nbd_thread(void *data) { - struct nbd_device *lo = data; + struct nbd_device *nbd = data; struct request *req; set_user_nice(current, -20); - while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) { + while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ - wait_event_interruptible(lo->waiting_wq, + wait_event_interruptible(nbd->waiting_wq, kthread_should_stop() || - !list_empty(&lo->waiting_queue)); + !list_empty(&nbd->waiting_queue)); /* extract request */ - if (list_empty(&lo->waiting_queue)) + if (list_empty(&nbd->waiting_queue)) continue; - spin_lock_irq(&lo->queue_lock); - req = list_entry(lo->waiting_queue.next, struct request, + spin_lock_irq(&nbd->queue_lock); + req = list_entry(nbd->waiting_queue.next, struct request, queuelist); list_del_init(&req->queuelist); - spin_unlock_irq(&lo->queue_lock); + spin_unlock_irq(&nbd->queue_lock); /* handle request */ - nbd_handle_req(lo, req); + nbd_handle_req(nbd, req); } return 0; } @@ -528,7 +539,7 @@ static int nbd_thread(void *data) /* * We always wait for result of write, for now. It would be nice to make it optional * in future - * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) + * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ @@ -537,31 +548,31 @@ static void do_nbd_request(struct request_queue *q) struct request *req; while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *lo; + struct nbd_device *nbd; spin_unlock_irq(q->queue_lock); dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n", req->rq_disk->disk_name, req, req->cmd_type); - lo = req->rq_disk->private_data; + nbd = req->rq_disk->private_data; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - if (unlikely(!lo->sock)) { - printk(KERN_ERR "%s: Attempted send on closed socket\n", - lo->disk->disk_name); + if (unlikely(!nbd->sock)) { + dev_err(disk_to_dev(nbd->disk), + "Attempted send on closed socket\n"); req->errors++; nbd_end_request(req); spin_lock_irq(q->queue_lock); continue; } - spin_lock_irq(&lo->queue_lock); - list_add_tail(&req->queuelist, &lo->waiting_queue); - spin_unlock_irq(&lo->queue_lock); + spin_lock_irq(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->waiting_queue); + spin_unlock_irq(&nbd->queue_lock); - wake_up(&lo->waiting_wq); + wake_up(&nbd->waiting_wq); spin_lock_irq(q->queue_lock); } @@ -569,32 +580,33 @@ static void do_nbd_request(struct request_queue *q) /* Must be called with tx_lock held */ -static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, +static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { switch (cmd) { case NBD_DISCONNECT: { struct request sreq; - printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name); + dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; - if (!lo->sock) + if (!nbd->sock) return -EINVAL; - nbd_send_req(lo, &sreq); + nbd_send_req(nbd, &sreq); return 0; } case NBD_CLEAR_SOCK: { struct file *file; - lo->sock = NULL; - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - BUG_ON(!list_empty(&lo->queue_head)); + nbd->sock = NULL; + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + BUG_ON(!list_empty(&nbd->queue_head)); + BUG_ON(!list_empty(&nbd->waiting_queue)); if (file) fput(file); return 0; @@ -602,14 +614,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, case NBD_SET_SOCK: { struct file *file; - if (lo->file) + if (nbd->file) return -EBUSY; file = fget(arg); if (file) { struct inode *inode = file->f_path.dentry->d_inode; if (S_ISSOCK(inode->i_mode)) { - lo->file = file; - lo->sock = SOCKET_I(inode); + nbd->file = file; + nbd->sock = SOCKET_I(inode); if (max_part > 0) bdev->bd_invalidated = 1; return 0; @@ -621,29 +633,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, } case NBD_SET_BLKSIZE: - lo->blksize = arg; - lo->bytesize &= ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->blksize = arg; + nbd->bytesize &= ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_SIZE: - lo->bytesize = arg & ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = arg & ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_TIMEOUT: - lo->xmit_timeout = arg * HZ; + nbd->xmit_timeout = arg * HZ; return 0; case NBD_SET_SIZE_BLOCKS: - lo->bytesize = ((u64) arg) * lo->blksize; - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = ((u64) arg) * nbd->blksize; + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_DO_IT: { @@ -651,38 +663,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, struct file *file; int error; - if (lo->pid) + if (nbd->pid) return -EBUSY; - if (!lo->file) + if (!nbd->file) return -EINVAL; - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); - thread = kthread_create(nbd_thread, lo, lo->disk->disk_name); + thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); if (IS_ERR(thread)) { - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); return PTR_ERR(thread); } wake_up_process(thread); - error = nbd_do_it(lo); + error = nbd_do_it(nbd); kthread_stop(thread); - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); if (error) return error; - sock_shutdown(lo, 0); - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); + sock_shutdown(nbd, 0); + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); if (file) fput(file); - lo->bytesize = 0; + nbd->bytesize = 0; bdev->bd_inode->i_size = 0; - set_capacity(lo->disk, 0); + set_capacity(nbd->disk, 0); if (max_part > 0) ioctl_by_bdev(bdev, BLKRRPART, 0); - return lo->harderror; + return nbd->harderror; } case NBD_CLEAR_QUE: @@ -690,14 +702,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, * This is for compatibility only. The queue is always cleared * by NBD_DO_IT or NBD_CLEAR_SOCK. */ - BUG_ON(!lo->sock && !list_empty(&lo->queue_head)); + BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head)); return 0; case NBD_PRINT_DEBUG: - printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", - bdev->bd_disk->disk_name, - lo->queue_head.next, lo->queue_head.prev, - &lo->queue_head); + dev_info(disk_to_dev(nbd->disk), + "next = %p, prev = %p, head = %p\n", + nbd->queue_head.next, nbd->queue_head.prev, + &nbd->queue_head); return 0; } return -ENOTTY; @@ -706,21 +718,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, static int nbd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nbd_device *lo = bdev->bd_disk->private_data; + struct nbd_device *nbd = bdev->bd_disk->private_data; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* Anyone capable of this syscall can do *real bad* things */ dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n", - lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); + nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); - mutex_lock(&lo->tx_lock); - error = __nbd_ioctl(bdev, lo, cmd, arg); - mutex_unlock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); + error = __nbd_ioctl(bdev, nbd, cmd, arg); + mutex_unlock(&nbd->tx_lock); return error; } @@ -745,7 +757,7 @@ static int __init nbd_init(void) BUILD_BUG_ON(sizeof(struct nbd_request) != 28); if (max_part < 0) { - printk(KERN_CRIT "nbd: max_part must be >= 0\n"); + printk(KERN_ERR "nbd: max_part must be >= 0\n"); return -EINVAL; } @@ -806,7 +818,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].file = NULL; - nbd_dev[i].magic = LO_MAGIC; + nbd_dev[i].magic = NBD_MAGIC; nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f9ad514c9227..ad16c68c8645 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -39,7 +39,8 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/version.h> + +#include <asm-generic/io-64-nonatomic-lo-hi.h> #define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -614,11 +615,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } -/* - * NB: return value of non-zero would mean that we were a stacking driver. - * make_request must always succeed. - */ -static int nvme_make_request(struct request_queue *q, struct bio *bio) +static void nvme_make_request(struct request_queue *q, struct bio *bio) { struct nvme_ns *ns = q->queuedata; struct nvme_queue *nvmeq = get_nvmeq(ns->dev); @@ -635,8 +632,6 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio) spin_unlock_irq(&nvmeq->q_lock); put_nvmeq(nvmeq); - - return 0; } static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c index ad124525ac23..ec64e7f5d1ce 100644 --- a/drivers/block/paride/bpck6.c +++ b/drivers/block/paride/bpck6.c @@ -20,9 +20,6 @@ */ -/* PARAMETERS */ -static int verbose; /* set this to 1 to see debugging messages and whatnot */ - #define BACKPACK_VERSION "2.0.2" #include <linux/module.h> @@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */ #include "ppc6lnx.c" #include "paride.h" +/* PARAMETERS */ +static bool verbose; /* set this to 1 to see debugging messages and whatnot */ #define PPCSTRUCT(pi) ((Interface *)(pi->private)) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 46b8136c31bb..ba2b6b5e5910 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -144,7 +144,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; static DEFINE_MUTEX(pcd_mutex); static DEFINE_SPINLOCK(pcd_lock); -module_param(verbose, bool, 0644); +module_param(verbose, int, 0644); module_param(major, int, 0); module_param(name, charp, 0); module_param(nice, int, 0); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 869e7676d46f..831e3ac156e6 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -124,8 +124,9 @@ by default. */ +#include <linux/types.h> -static int verbose = 0; +static bool verbose = 0; static int major = PD_MAJOR; static char *name = PD_NAME; static int cluster = 64; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f21b520ef419..ec8f9ed6326e 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -118,13 +118,15 @@ #define PF_NAME "pf" #define PF_UNITS 4 +#include <linux/types.h> + /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is off by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PF_MAJOR; static char *name = PF_NAME; static int cluster = 64; diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 6b9a2000d56a..4a27b1de5fcb 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -130,13 +130,14 @@ #define PI_PG 4 #endif +#include <linux/types.h> /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is 0 by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PG_MAJOR; static char *name = PG_NAME; static int disable = 0; @@ -630,6 +631,7 @@ static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t if (dev->status & 0x10) return -ETIME; + memset(&hdr, 0, sizeof(hdr)); hdr.magic = PG_MAGIC; hdr.dlen = dev->dlen; copy = 0; diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 7179f79d7468..2596042eb987 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -109,13 +109,15 @@ #define PT_NAME "pt" #define PT_UNITS 4 +#include <linux/types.h> + /* Here are things one can override from the insmod command. Most are autoprobed by paride unless set here. Verbose is on by default. */ -static int verbose = 0; +static bool verbose = 0; static int major = PT_MAJOR; static char *name = PT_NAME; static int disable = 0; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index e133f094ab08..ba66e4445f41 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -987,14 +987,14 @@ static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct pag while (copy_size > 0) { struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg); - void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) + + void *vfrom = kmap_atomic(src_bvl->bv_page) + src_bvl->bv_offset + offs; void *vto = page_address(dst_page) + dst_offs; int len = min_t(int, copy_size, src_bvl->bv_len - offs); BUG_ON(len < 0); memcpy(vto, vfrom, len); - kunmap_atomic(vfrom, KM_USER0); + kunmap_atomic(vfrom); seg++; offs = 0; @@ -1019,10 +1019,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec) offs = 0; for (f = 0; f < pkt->frames; f++) { if (bvec[f].bv_page != pkt->pages[p]) { - void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset; + void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset; void *vto = page_address(pkt->pages[p]) + offs; memcpy(vto, vfrom, CD_FRAMESIZE); - kunmap_atomic(vfrom, KM_USER0); + kunmap_atomic(vfrom); bvec[f].bv_page = pkt->pages[p]; bvec[f].bv_offset = offs; } else { @@ -2444,7 +2444,7 @@ static void pkt_end_io_read_cloned(struct bio *bio, int err) pkt_bio_finished(pd); } -static int pkt_make_request(struct request_queue *q, struct bio *bio) +static void pkt_make_request(struct request_queue *q, struct bio *bio) { struct pktcdvd_device *pd; char b[BDEVNAME_SIZE]; @@ -2473,7 +2473,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio->bi_size >> 9; pkt_queue_bio(pd, cloned_bio); - return 0; + return; } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { @@ -2509,7 +2509,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) pkt_make_request(q, &bp->bio1); pkt_make_request(q, &bp->bio2); bio_pair_release(bp); - return 0; + return; } } @@ -2533,7 +2533,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) } spin_unlock(&pkt->lock); spin_unlock(&pd->cdrw.active_list_lock); - return 0; + return; } else { blocked_bio = 1; } @@ -2584,10 +2584,9 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) */ wake_up(&pd->wqueue); } - return 0; + return; end_io: bio_io_error(bio); - return 0; } @@ -2818,7 +2817,7 @@ static const struct block_device_operations pktcdvd_ops = { .check_events = pkt_check_events, }; -static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode) +static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode) { return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name); } diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 8e1ce2e2916a..da0abc1838c1 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -21,6 +21,7 @@ #include <linux/ata.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/module.h> #include <asm/lv1call.h> #include <asm/ps3stor.h> diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index b3bdb8af89cf..f58cdcfb305f 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -10,6 +10,7 @@ #include <linux/blkdev.h> #include <linux/delay.h> +#include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -596,7 +597,7 @@ out: return next; } -static int ps3vram_make_request(struct request_queue *q, struct bio *bio) +static void ps3vram_make_request(struct request_queue *q, struct bio *bio) { struct ps3_system_bus_device *dev = q->queuedata; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -610,13 +611,11 @@ static int ps3vram_make_request(struct request_queue *q, struct bio *bio) spin_unlock_irq(&priv->lock); if (busy) - return 0; + return; do { bio = ps3vram_do_bio(dev, bio); } while (bio); - - return 0; } static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 15f65b5f3fc7..54a55f03115d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,19 +41,33 @@ #include "rbd_types.h" -#define DRV_NAME "rbd" -#define DRV_NAME_LONG "rbd (rados block device)" +/* + * The basic unit of block I/O is a sector. It is interpreted in a + * number of contexts in Linux (blk, bio, genhd), but the default is + * universally 512 bytes. These symbols are just slightly more + * meaningful than the bare numbers they represent. + */ +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) + +#define RBD_DRV_NAME "rbd" +#define RBD_DRV_NAME_LONG "rbd (rados block device)" #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) -#define RBD_MAX_POOL_NAME_LEN 64 #define RBD_MAX_SNAP_NAME_LEN 32 #define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" +/* + * An RBD device name will be "rbd#", where the "rbd" comes from + * RBD_DRV_NAME above, and # is a unique integer identifier. + * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big + * enough to hold all possible device names. + */ #define DEV_NAME_LEN 32 +#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 @@ -62,14 +76,12 @@ */ struct rbd_image_header { u64 image_size; - char block_name[32]; + char *object_prefix; __u8 obj_order; __u8 crypt_type; __u8 comp_type; - struct rw_semaphore snap_rwsem; struct ceph_snap_context *snapc; size_t snap_names_len; - u64 snap_seq; u32 total_snaps; char *snap_names; @@ -83,7 +95,7 @@ struct rbd_options { }; /* - * an instance of the client. multiple devices may share a client. + * an instance of the client. multiple devices may share an rbd client. */ struct rbd_client { struct ceph_client *client; @@ -92,20 +104,9 @@ struct rbd_client { struct list_head node; }; -struct rbd_req_coll; - /* - * a single io request + * a request completion status */ -struct rbd_request { - struct request *rq; /* blk layer request */ - struct bio *bio; /* cloned bio */ - struct page **pages; /* list of used pages */ - u64 len; - int coll_index; - struct rbd_req_coll *coll; -}; - struct rbd_req_status { int done; int rc; @@ -122,10 +123,22 @@ struct rbd_req_coll { struct rbd_req_status status[0]; }; +/* + * a single io request + */ +struct rbd_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct page **pages; /* list of used pages */ + u64 len; + int coll_index; + struct rbd_req_coll *coll; +}; + struct rbd_snap { struct device dev; const char *name; - size_t size; + u64 size; struct list_head node; u64 id; }; @@ -134,13 +147,12 @@ struct rbd_snap { * a single device */ struct rbd_device { - int id; /* blkdev unique id */ + int dev_id; /* blkdev unique id */ int major; /* blkdev assigned major */ struct gendisk *disk; /* blkdev's gendisk and rq */ struct request_queue *q; - struct ceph_client *client; struct rbd_client *rbd_client; char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ @@ -148,19 +160,24 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ - int obj_len; - char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ - char pool_name[RBD_MAX_POOL_NAME_LEN]; - int poolid; + char *image_name; + size_t image_name_len; + char *header_name; + char *pool_name; + int pool_id; struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; - char snap_name[RBD_MAX_SNAP_NAME_LEN]; - u32 cur_snap; /* index+1 of current snapshot within snap context - 0 - for the head */ - int read_only; + /* protects updating the header */ + struct rw_semaphore header_rwsem; + /* name of the snapshot this device reads from */ + char *snap_name; + /* id of the snapshot this device reads from */ + u64 snap_id; /* current snapshot id */ + /* whether the snap_id this device reads from still exists */ + bool snap_exists; + int read_only; struct list_head node; @@ -171,35 +188,48 @@ struct rbd_device { struct device dev; }; -static struct bus_type rbd_bus_type = { - .name = "rbd", -}; - -static spinlock_t node_lock; /* protects client get/put */ - static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ + static LIST_HEAD(rbd_dev_list); /* devices */ -static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(rbd_dev_list_lock); + +static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(rbd_client_list_lock); static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); -static ssize_t rbd_snap_rollback(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t size); static ssize_t rbd_snap_add(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); -static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap);; +static void __rbd_remove_snap_dev(struct rbd_snap *snap); +static ssize_t rbd_add(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove(struct bus_type *bus, const char *buf, + size_t count); -static struct rbd_device *dev_to_rbd(struct device *dev) +static struct bus_attribute rbd_bus_attrs[] = { + __ATTR(add, S_IWUSR, NULL, rbd_add), + __ATTR(remove, S_IWUSR, NULL, rbd_remove), + __ATTR_NULL +}; + +static struct bus_type rbd_bus_type = { + .name = "rbd", + .bus_attrs = rbd_bus_attrs, +}; + +static void rbd_root_dev_release(struct device *dev) { - return container_of(dev, struct rbd_device, dev); } +static struct device rbd_root_dev = { + .init_name = "rbd", + .release = rbd_root_dev_release, +}; + + static struct device *rbd_get_dev(struct rbd_device *rbd_dev) { return get_device(&rbd_dev->dev); @@ -210,20 +240,18 @@ static void rbd_put_dev(struct rbd_device *rbd_dev) put_device(&rbd_dev->dev); } -static int __rbd_update_snaps(struct rbd_device *rbd_dev); +static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { - struct gendisk *disk = bdev->bd_disk; - struct rbd_device *rbd_dev = disk->private_data; - - rbd_get_dev(rbd_dev); - - set_device_ro(bdev, rbd_dev->read_only); + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; if ((mode & FMODE_WRITE) && rbd_dev->read_only) return -EROFS; + rbd_get_dev(rbd_dev); + set_device_ro(bdev, rbd_dev->read_only); + return 0; } @@ -244,9 +272,9 @@ static const struct block_device_operations rbd_bd_ops = { /* * Initialize an rbd client instance. - * We own *opt. + * We own *ceph_opts. */ -static struct rbd_client *rbd_client_create(struct ceph_options *opt, +static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, struct rbd_options *rbd_opts) { struct rbd_client *rbdc; @@ -260,10 +288,12 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - rbdc->client = ceph_create_client(opt, rbdc); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); if (IS_ERR(rbdc->client)) - goto out_rbdc; - opt = NULL; /* Now rbdc->client is responsible for opt */ + goto out_mutex; + ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ ret = ceph_open_session(rbdc->client); if (ret < 0) @@ -271,35 +301,38 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, rbdc->rbd_opts = rbd_opts; - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); list_add_tail(&rbdc->node, &rbd_client_list); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); + + mutex_unlock(&ctl_mutex); dout("rbd_client_create created %p\n", rbdc); return rbdc; out_err: ceph_destroy_client(rbdc->client); -out_rbdc: +out_mutex: + mutex_unlock(&ctl_mutex); kfree(rbdc); out_opt: - if (opt) - ceph_destroy_options(opt); + if (ceph_opts) + ceph_destroy_options(ceph_opts); return ERR_PTR(ret); } /* * Find a ceph client with specific addr and configuration. */ -static struct rbd_client *__rbd_client_find(struct ceph_options *opt) +static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) { struct rbd_client *client_node; - if (opt->flags & CEPH_OPT_NOSHARE) + if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; list_for_each_entry(client_node, &rbd_client_list, node) - if (ceph_compare_options(opt, client_node->client) == 0) + if (!ceph_compare_options(ceph_opts, client_node->client)) return client_node; return NULL; } @@ -315,7 +348,7 @@ enum { /* string args above */ }; -static match_table_t rbdopt_tokens = { +static match_table_t rbd_opts_tokens = { {Opt_notify_timeout, "notify_timeout=%d"}, /* int args above */ /* string args above */ @@ -324,11 +357,11 @@ static match_table_t rbdopt_tokens = { static int parse_rbd_opts_token(char *c, void *private) { - struct rbd_options *rbdopt = private; + struct rbd_options *rbd_opts = private; substring_t argstr[MAX_OPT_ARGS]; int token, intval, ret; - token = match_token((char *)c, rbdopt_tokens, argstr); + token = match_token(c, rbd_opts_tokens, argstr); if (token < 0) return -EINVAL; @@ -349,7 +382,7 @@ static int parse_rbd_opts_token(char *c, void *private) switch (token) { case Opt_notify_timeout: - rbdopt->notify_timeout = intval; + rbd_opts->notify_timeout = intval; break; default: BUG_ON(token); @@ -361,64 +394,63 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, - char *options) +static struct rbd_client *rbd_get_client(const char *mon_addr, + size_t mon_addr_len, + char *options) { struct rbd_client *rbdc; - struct ceph_options *opt; - int ret; + struct ceph_options *ceph_opts; struct rbd_options *rbd_opts; rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); if (!rbd_opts) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; - ret = ceph_parse_options(&opt, options, mon_addr, - mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); - if (ret < 0) - goto done_err; + ceph_opts = ceph_parse_options(options, mon_addr, + mon_addr + mon_addr_len, + parse_rbd_opts_token, rbd_opts); + if (IS_ERR(ceph_opts)) { + kfree(rbd_opts); + return ERR_CAST(ceph_opts); + } - spin_lock(&node_lock); - rbdc = __rbd_client_find(opt); + spin_lock(&rbd_client_list_lock); + rbdc = __rbd_client_find(ceph_opts); if (rbdc) { - ceph_destroy_options(opt); - /* using an existing client */ kref_get(&rbdc->kref); - rbd_dev->rbd_client = rbdc; - rbd_dev->client = rbdc->client; - spin_unlock(&node_lock); - return 0; - } - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); - rbdc = rbd_client_create(opt, rbd_opts); - if (IS_ERR(rbdc)) { - ret = PTR_ERR(rbdc); - goto done_err; + ceph_destroy_options(ceph_opts); + kfree(rbd_opts); + + return rbdc; } + spin_unlock(&rbd_client_list_lock); - rbd_dev->rbd_client = rbdc; - rbd_dev->client = rbdc->client; - return 0; -done_err: - kfree(rbd_opts); - return ret; + rbdc = rbd_client_create(ceph_opts, rbd_opts); + + if (IS_ERR(rbdc)) + kfree(rbd_opts); + + return rbdc; } /* * Destroy ceph client + * + * Caller must hold rbd_client_list_lock. */ static void rbd_client_release(struct kref *kref) { struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); dout("rbd_release_client %p\n", rbdc); - spin_lock(&node_lock); + spin_lock(&rbd_client_list_lock); list_del(&rbdc->node); - spin_unlock(&node_lock); + spin_unlock(&rbd_client_list_lock); ceph_destroy_client(rbdc->client); kfree(rbdc->rbd_opts); @@ -433,7 +465,6 @@ static void rbd_put_client(struct rbd_device *rbd_dev) { kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); rbd_dev->rbd_client = NULL; - rbd_dev->client = NULL; } /* @@ -448,28 +479,37 @@ static void rbd_coll_release(struct kref *kref) kfree(coll); } +static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) +{ + return !memcmp(&ondisk->text, + RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); +} + /* * Create a new header structure, translate header format from the on-disk * header. */ static int rbd_header_from_disk(struct rbd_image_header *header, struct rbd_image_header_ondisk *ondisk, - int allocated_snaps, - gfp_t gfp_flags) + u32 allocated_snaps) { - int i; - u32 snap_count = le32_to_cpu(ondisk->snap_count); - int ret = -ENOMEM; + u32 snap_count; - init_rwsem(&header->snap_rwsem); - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + if (!rbd_dev_ondisk_valid(ondisk)) + return -ENXIO; + + snap_count = le32_to_cpu(ondisk->snap_count); + if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) + / sizeof (u64)) + return -EINVAL; header->snapc = kmalloc(sizeof(struct ceph_snap_context) + - snap_count * - sizeof(struct rbd_image_snap_ondisk), - gfp_flags); + snap_count * sizeof(u64), + GFP_KERNEL); if (!header->snapc) return -ENOMEM; + if (snap_count) { + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); header->snap_names = kmalloc(header->snap_names_len, GFP_KERNEL); if (!header->snap_names) @@ -479,11 +519,20 @@ static int rbd_header_from_disk(struct rbd_image_header *header, if (!header->snap_sizes) goto err_names; } else { + WARN_ON(ondisk->snap_names_len); + header->snap_names_len = 0; header->snap_names = NULL; header->snap_sizes = NULL; } - memcpy(header->block_name, ondisk->block_name, + + header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, + GFP_KERNEL); + if (!header->object_prefix) + goto err_sizes; + + memcpy(header->object_prefix, ondisk->block_name, sizeof(ondisk->block_name)); + header->object_prefix[sizeof (ondisk->block_name)] = '\0'; header->image_size = le64_to_cpu(ondisk->image_size); header->obj_order = ondisk->options.order; @@ -491,12 +540,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->comp_type = ondisk->options.comp_type; atomic_set(&header->snapc->nref, 1); - header->snap_seq = le64_to_cpu(ondisk->snap_seq); + header->snapc->seq = le64_to_cpu(ondisk->snap_seq); header->snapc->num_snaps = snap_count; header->total_snaps = snap_count; - if (snap_count && - allocated_snaps == snap_count) { + if (snap_count && allocated_snaps == snap_count) { + int i; + for (i = 0; i < snap_count; i++) { header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); @@ -505,32 +555,23 @@ static int rbd_header_from_disk(struct rbd_image_header *header, } /* copy snapshot names */ - memcpy(header->snap_names, &ondisk->snaps[i], + memcpy(header->snap_names, &ondisk->snaps[snap_count], header->snap_names_len); } return 0; +err_sizes: + kfree(header->snap_sizes); + header->snap_sizes = NULL; err_names: kfree(header->snap_names); + header->snap_names = NULL; err_snapc: kfree(header->snapc); - return ret; -} - -static int snap_index(struct rbd_image_header *header, int snap_num) -{ - return header->total_snaps - snap_num; -} - -static u64 cur_snap_id(struct rbd_device *rbd_dev) -{ - struct rbd_image_header *header = &rbd_dev->header; - - if (!rbd_dev->cur_snap) - return 0; + header->snapc = NULL; - return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; + return -ENOMEM; } static int snap_by_name(struct rbd_image_header *header, const char *snap_name, @@ -539,70 +580,66 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, int i; char *p = header->snap_names; - for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { - if (strcmp(snap_name, p) == 0) - break; - } - if (i == header->total_snaps) - return -ENOENT; - if (seq) - *seq = header->snapc->snaps[i]; + for (i = 0; i < header->total_snaps; i++) { + if (!strcmp(snap_name, p)) { - if (size) - *size = header->snap_sizes[i]; + /* Found it. Pass back its id and/or size */ - return i; + if (seq) + *seq = header->snapc->snaps[i]; + if (size) + *size = header->snap_sizes[i]; + return i; + } + p += strlen(p) + 1; /* Skip ahead to the next name */ + } + return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *dev, - const char *snap_name, - u64 *size) -{ - struct rbd_image_header *header = &dev->header; - struct ceph_snap_context *snapc = header->snapc; - int ret = -ENOENT; - - down_write(&header->snap_rwsem); - - if (!snap_name || - !*snap_name || - strcmp(snap_name, "-") == 0 || - strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { - if (header->total_snaps) - snapc->seq = header->snap_seq; - else - snapc->seq = 0; - dev->cur_snap = 0; - dev->read_only = 0; +static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) +{ + int ret; + + down_write(&rbd_dev->header_rwsem); + + if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME))) { + rbd_dev->snap_id = CEPH_NOSNAP; + rbd_dev->snap_exists = false; + rbd_dev->read_only = 0; if (size) - *size = header->image_size; + *size = rbd_dev->header.image_size; } else { - ret = snap_by_name(header, snap_name, &snapc->seq, size); + u64 snap_id = 0; + + ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name, + &snap_id, size); if (ret < 0) goto done; - - dev->cur_snap = header->total_snaps - ret; - dev->read_only = 1; + rbd_dev->snap_id = snap_id; + rbd_dev->snap_exists = true; + rbd_dev->read_only = 1; } ret = 0; done: - up_write(&header->snap_rwsem); + up_write(&rbd_dev->header_rwsem); return ret; } static void rbd_header_free(struct rbd_image_header *header) { - kfree(header->snapc); - kfree(header->snap_names); + kfree(header->object_prefix); kfree(header->snap_sizes); + kfree(header->snap_names); + ceph_put_snap_context(header->snapc); } /* * get the actual striped segment name, offset and length */ static u64 rbd_get_segment(struct rbd_image_header *header, - const char *block_name, + const char *object_prefix, u64 ofs, u64 len, char *seg_name, u64 *segofs) { @@ -610,7 +647,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header, if (seg_name) snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, - "%s.%012llx", block_name, seg); + "%s.%012llx", object_prefix, seg); ofs = ofs & ((1 << header->obj_order) - 1); len = min_t(u64, len, (1 << header->obj_order) - ofs); @@ -708,13 +745,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, * split_bio will BUG_ON if this is not the case */ dout("bio_chain_clone split! total=%d remaining=%d" - "bi_size=%d\n", - (int)total, (int)len-total, - (int)old_chain->bi_size); + "bi_size=%u\n", + total, len - total, old_chain->bi_size); /* split the bio. We'll release it either in the next call, or it will have to be released outside */ - bp = bio_split(old_chain, (len - total) / 512ULL); + bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); if (!bp) goto err_out; @@ -759,22 +795,24 @@ err_out: /* * helpers for osd request op vectors. */ -static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, - int num_ops, - int opcode, - u32 payload_len) -{ - *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), - GFP_NOIO); - if (!*ops) - return -ENOMEM; - (*ops)[0].op = opcode; +static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, + int opcode, u32 payload_len) +{ + struct ceph_osd_req_op *ops; + + ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); + if (!ops) + return NULL; + + ops[0].op = opcode; + /* * op extent offset and length will be set later on * in calc_raw_layout() */ - (*ops)[0].payload_len = payload_len; - return 0; + ops[0].payload_len = payload_len; + + return ops; } static void rbd_destroy_ops(struct ceph_osd_req_op *ops) @@ -790,8 +828,8 @@ static void rbd_coll_end_req_index(struct request *rq, struct request_queue *q; int min, max, i; - dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", - coll, index, ret, len); + dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", + coll, index, ret, (unsigned long long) len); if (!rq) return; @@ -830,16 +868,15 @@ static void rbd_coll_end_req(struct rbd_request *req, * Send ceph osd request */ static int rbd_do_request(struct request *rq, - struct rbd_device *dev, + struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - const char *obj, u64 ofs, u64 len, + const char *object_name, u64 ofs, u64 len, struct bio *bio, struct page **pages, int num_pages, int flags, struct ceph_osd_req_op *ops, - int num_reply, struct rbd_req_coll *coll, int coll_index, void (*rbd_cb)(struct ceph_osd_request *req, @@ -854,7 +891,7 @@ static int rbd_do_request(struct request *rq, struct timespec mtime = CURRENT_TIME; struct rbd_request *req_data; struct ceph_osd_request_head *reqhead; - struct rbd_image_header *header = &dev->header; + struct ceph_osd_client *osdc; req_data = kzalloc(sizeof(*req_data), GFP_NOIO); if (!req_data) { @@ -869,17 +906,13 @@ static int rbd_do_request(struct request *rq, req_data->coll_index = coll_index; } - dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); + dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, + (unsigned long long) ofs, (unsigned long long) len); - down_read(&header->snap_rwsem); - - req = ceph_osdc_alloc_request(&dev->client->osdc, flags, - snapc, - ops, - false, - GFP_NOIO, pages, bio); + osdc = &rbd_dev->rbd_client->client->osdc; + req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + false, GFP_NOIO, pages, bio); if (!req) { - up_read(&header->snap_rwsem); ret = -ENOMEM; goto done_pages; } @@ -896,7 +929,7 @@ static int rbd_do_request(struct request *rq, reqhead = req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(req->r_oid, obj, sizeof(req->r_oid)); + strncpy(req->r_oid, object_name, sizeof(req->r_oid)); req->r_oid_len = strlen(req->r_oid); layout = &req->r_file_layout; @@ -904,33 +937,32 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_preferred = cpu_to_le32(-1); - layout->fl_pg_pool = cpu_to_le32(dev->poolid); - ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, - ofs, &len, &bno, req, ops); + layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); + ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, + req, ops); ceph_osdc_build_request(req, ofs, &len, ops, snapc, &mtime, req->r_oid, req->r_oid_len); - up_read(&header->snap_rwsem); if (linger_req) { - ceph_osdc_set_request_linger(&dev->client->osdc, req); + ceph_osdc_set_request_linger(osdc, req); *linger_req = req; } - ret = ceph_osdc_start_request(&dev->client->osdc, req, false); + ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) goto done_err; if (!rbd_cb) { - ret = ceph_osdc_wait_request(&dev->client->osdc, req); + ret = ceph_osdc_wait_request(osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%lld\n", - le64_to_cpu(req->r_reassert_version.version)); + dout("reassert_ver=%llu\n", + (unsigned long long) + le64_to_cpu(req->r_reassert_version.version)); ceph_osdc_put_request(req); } return ret; @@ -962,9 +994,10 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) op = (void *)(replyhead + 1); rc = le32_to_cpu(replyhead->result); bytes = le64_to_cpu(op->extent.length); - read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ); + read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); - dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); + dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", + (unsigned long long) bytes, read_op, (int) rc); if (rc == -ENOENT && read_op) { zero_bio_chain(req_data->bio, 0); @@ -991,14 +1024,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg /* * Do a synchronous ceph osd operation */ -static int rbd_req_sync_op(struct rbd_device *dev, +static int rbd_req_sync_op(struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, - struct ceph_osd_req_op *orig_ops, - int num_reply, - const char *obj, + struct ceph_osd_req_op *ops, + const char *object_name, u64 ofs, u64 len, char *buf, struct ceph_osd_request **linger_req, @@ -1007,45 +1038,28 @@ static int rbd_req_sync_op(struct rbd_device *dev, int ret; struct page **pages; int num_pages; - struct ceph_osd_req_op *ops = orig_ops; - u32 payload_len; + + BUG_ON(ops == NULL); num_pages = calc_pages_for(ofs , len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); - if (!orig_ops) { - payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); - ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); - if (ret < 0) - goto done; - - if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { - ret = ceph_copy_to_page_vector(pages, buf, ofs, len); - if (ret < 0) - goto done_ops; - } - } - - ret = rbd_do_request(NULL, dev, snapc, snapid, - obj, ofs, len, NULL, + ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, + object_name, ofs, len, NULL, pages, num_pages, flags, ops, - 2, NULL, 0, NULL, linger_req, ver); if (ret < 0) - goto done_ops; + goto done; if ((flags & CEPH_OSD_FLAG_READ) && buf) ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); -done_ops: - if (!orig_ops) - rbd_destroy_ops(ops); done: ceph_release_page_vector(pages, num_pages); return ret; @@ -1055,10 +1069,10 @@ done: * Do an asynchronous ceph osd operation */ static int rbd_do_op(struct request *rq, - struct rbd_device *rbd_dev , + struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, int num_reply, + int opcode, int flags, u64 ofs, u64 len, struct bio *bio, struct rbd_req_coll *coll, @@ -1076,14 +1090,15 @@ static int rbd_do_op(struct request *rq, return -ENOMEM; seg_len = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.block_name, + rbd_dev->header.object_prefix, ofs, len, seg_name, &seg_ofs); payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); - ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); - if (ret < 0) + ret = -ENOMEM; + ops = rbd_create_rw_ops(1, opcode, payload_len); + if (!ops) goto done; /* we've taken care of segment sizes earlier when we @@ -1097,7 +1112,6 @@ static int rbd_do_op(struct request *rq, NULL, 0, flags, ops, - num_reply, coll, coll_index, rbd_req_cb, 0, NULL); @@ -1121,7 +1135,6 @@ static int rbd_req_write(struct request *rq, return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 2, ofs, len, bio, coll, coll_index); } @@ -1137,58 +1150,61 @@ static int rbd_req_read(struct request *rq, int coll_index) { return rbd_do_op(rq, rbd_dev, NULL, - (snapid ? snapid : CEPH_NOSNAP), + snapid, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - 2, ofs, len, bio, coll, coll_index); } /* * Request sync osd read */ -static int rbd_req_sync_read(struct rbd_device *dev, - struct ceph_snap_context *snapc, +static int rbd_req_sync_read(struct rbd_device *rbd_dev, u64 snapid, - const char *obj, + const char *object_name, u64 ofs, u64 len, char *buf, u64 *ver) { - return rbd_req_sync_op(dev, NULL, - (snapid ? snapid : CEPH_NOSNAP), - CEPH_OSD_OP_READ, + struct ceph_osd_req_op *ops; + int ret; + + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); + if (!ops) + return -ENOMEM; + + ret = rbd_req_sync_op(rbd_dev, NULL, + snapid, CEPH_OSD_FLAG_READ, - NULL, - 1, obj, ofs, len, buf, NULL, ver); + ops, object_name, ofs, len, buf, NULL, ver); + rbd_destroy_ops(ops); + + return ret; } /* * Request sync osd watch */ -static int rbd_req_sync_notify_ack(struct rbd_device *dev, +static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, u64 ver, - u64 notify_id, - const char *obj) + u64 notify_id) { struct ceph_osd_req_op *ops; - struct page **pages = NULL; int ret; - ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); + if (!ops) + return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); + ops[0].watch.ver = cpu_to_le64(ver); ops[0].watch.cookie = notify_id; ops[0].watch.flag = 0; - ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP, - obj, 0, 0, NULL, - pages, 0, + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, + rbd_dev->header_name, 0, 0, NULL, + NULL, 0, CEPH_OSD_FLAG_READ, ops, - 1, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1198,54 +1214,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { - struct rbd_device *dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = (struct rbd_device *)data; + u64 hver; int rc; - if (!dev) + if (!rbd_dev) return; - dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, - notify_id, (int)opcode); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rc = __rbd_update_snaps(dev); - mutex_unlock(&ctl_mutex); + dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); + rc = rbd_refresh_header(rbd_dev, &hver); if (rc) - pr_warning(DRV_NAME "%d got notification but failed to update" - " snaps: %d\n", dev->major, rc); + pr_warning(RBD_DRV_NAME "%d got notification but failed to " + " update snaps: %d\n", rbd_dev->major, rc); - rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); + rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } /* * Request sync osd watch */ -static int rbd_req_sync_watch(struct rbd_device *dev, - const char *obj, - u64 ver) +static int rbd_req_sync_watch(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->client->osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); + if (!ops) + return -ENOMEM; ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)dev, &dev->watch_event); + (void *)rbd_dev, &rbd_dev->watch_event); if (ret < 0) goto fail; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); + ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); + ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 1; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, - &dev->watch_request, NULL); + rbd_dev->header_name, + 0, 0, NULL, + &rbd_dev->watch_request, NULL); if (ret < 0) goto fail_event; @@ -1254,8 +1269,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev, return 0; fail_event: - ceph_osdc_cancel_event(dev->watch_event); - dev->watch_event = NULL; + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; fail: rbd_destroy_ops(ops); return ret; @@ -1264,64 +1279,65 @@ fail: /* * Request sync osd unwatch */ -static int rbd_req_sync_unwatch(struct rbd_device *dev, - const char *obj) +static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; + int ret; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); + if (!ops) + return -ENOMEM; ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); + ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 0; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, NULL); + rbd_dev->header_name, + 0, 0, NULL, NULL, NULL); + rbd_destroy_ops(ops); - ceph_osdc_cancel_event(dev->watch_event); - dev->watch_event = NULL; + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; return ret; } struct rbd_notify_info { - struct rbd_device *dev; + struct rbd_device *rbd_dev; }; static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { - struct rbd_device *dev = (struct rbd_device *)data; - if (!dev) + struct rbd_device *rbd_dev = (struct rbd_device *)data; + if (!rbd_dev) return; - dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, - notify_id, (int)opcode); + dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); } /* * Request sync osd notify */ -static int rbd_req_sync_notify(struct rbd_device *dev, - const char *obj) +static int rbd_req_sync_notify(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->client->osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_event *event; struct rbd_notify_info info; int payload_len = sizeof(u32) + sizeof(u32); int ret; - ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len); + if (!ops) + return -ENOMEM; - info.dev = dev; + info.rbd_dev = rbd_dev; ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, (void *)&info, &event); @@ -1334,12 +1350,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev, ops[0].watch.prot_ver = RADOS_NOTIFY_VER; ops[0].watch.timeout = 12; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, NULL); + rbd_dev->header_name, + 0, 0, NULL, NULL, NULL); if (ret < 0) goto fail_event; @@ -1356,64 +1372,39 @@ fail: } /* - * Request sync osd rollback - */ -static int rbd_req_sync_rollback_obj(struct rbd_device *dev, - u64 snapid, - const char *obj) -{ - struct ceph_osd_req_op *ops; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0); - if (ret < 0) - return ret; - - ops[0].snap.snapid = snapid; - - ret = rbd_req_sync_op(dev, NULL, - CEPH_NOSNAP, - 0, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, - 1, obj, 0, 0, NULL, NULL, NULL); - - rbd_destroy_ops(ops); - - return ret; -} - -/* * Request sync osd read */ -static int rbd_req_sync_exec(struct rbd_device *dev, - const char *obj, - const char *cls, - const char *method, +static int rbd_req_sync_exec(struct rbd_device *rbd_dev, + const char *object_name, + const char *class_name, + const char *method_name, const char *data, int len, u64 *ver) { struct ceph_osd_req_op *ops; - int cls_len = strlen(cls); - int method_len = strlen(method); - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, - cls_len + method_len + len); - if (ret < 0) - return ret; + int class_name_len = strlen(class_name); + int method_name_len = strlen(method_name); + int ret; - ops[0].cls.class_name = cls; - ops[0].cls.class_len = (__u8)cls_len; - ops[0].cls.method_name = method; - ops[0].cls.method_len = (__u8)method_len; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, + class_name_len + method_name_len + len); + if (!ops) + return -ENOMEM; + + ops[0].cls.class_name = class_name; + ops[0].cls.class_len = (__u8) class_name_len; + ops[0].cls.method_name = method_name; + ops[0].cls.method_len = (__u8) method_name_len; ops[0].cls.argc = 0; ops[0].cls.indata = data; ops[0].cls.indata_len = len; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, ver); + object_name, 0, 0, NULL, NULL, ver); rbd_destroy_ops(ops); @@ -1444,16 +1435,16 @@ static void rbd_rq_fn(struct request_queue *q) struct request *rq; struct bio_pair *bp = NULL; - rq = blk_fetch_request(q); - - while (1) { + while ((rq = blk_fetch_request(q))) { struct bio *bio; struct bio *rq_bio, *next_bio = NULL; bool do_write; - int size, op_size = 0; + unsigned int size; + u64 op_size = 0; u64 ofs; int num_segs, cur_seg = 0; struct rbd_req_coll *coll; + struct ceph_snap_context *snapc; /* peek at request from block layer */ if (!rq) @@ -1464,39 +1455,54 @@ static void rbd_rq_fn(struct request_queue *q) /* filter out block requests we don't understand */ if ((rq->cmd_type != REQ_TYPE_FS)) { __blk_end_request_all(rq, 0); - goto next; + continue; } /* deduce our operation (read, write) */ do_write = (rq_data_dir(rq) == WRITE); size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * 512ULL; + ofs = blk_rq_pos(rq) * SECTOR_SIZE; rq_bio = rq->bio; if (do_write && rbd_dev->read_only) { __blk_end_request_all(rq, -EROFS); - goto next; + continue; } spin_unlock_irq(q->queue_lock); + down_read(&rbd_dev->header_rwsem); + + if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { + up_read(&rbd_dev->header_rwsem); + dout("request for non-existent snapshot"); + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENXIO); + continue; + } + + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + + up_read(&rbd_dev->header_rwsem); + dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", - size, blk_rq_pos(rq) * 512ULL); + size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); coll = rbd_alloc_coll(num_segs); if (!coll) { spin_lock_irq(q->queue_lock); __blk_end_request_all(rq, -ENOMEM); - goto next; + ceph_put_snap_context(snapc); + continue; } do { /* a bio clone to be passed down to OSD req */ - dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); + dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); op_size = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.block_name, + rbd_dev->header.object_prefix, ofs, size, NULL, NULL); kref_get(&coll->kref); @@ -1512,13 +1518,13 @@ static void rbd_rq_fn(struct request_queue *q) /* init OSD command: write or read */ if (do_write) rbd_req_write(rq, rbd_dev, - rbd_dev->header.snapc, + snapc, ofs, op_size, bio, coll, cur_seg); else rbd_req_read(rq, rbd_dev, - cur_snap_id(rbd_dev), + rbd_dev->snap_id, ofs, op_size, bio, coll, cur_seg); @@ -1535,8 +1541,8 @@ next_seg: if (bp) bio_pair_release(bp); spin_lock_irq(q->queue_lock); -next: - rq = blk_fetch_request(q); + + ceph_put_snap_context(snapc); } } @@ -1549,13 +1555,17 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct rbd_device *rbd_dev = q->queuedata; - unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9); - sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); - unsigned int bio_sectors = bmd->bi_size >> 9; + unsigned int chunk_sectors; + sector_t sector; + unsigned int bio_sectors; int max; + chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); + sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); + bio_sectors = bmd->bi_size >> SECTOR_SHIFT; + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << 9; + + bio_sectors)) << SECTOR_SHIFT; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ if (max <= bvec->bv_len && bio_sectors == 0) @@ -1587,40 +1597,48 @@ static int rbd_read_header(struct rbd_device *rbd_dev, { ssize_t rc; struct rbd_image_header_ondisk *dh; - int snap_count = 0; - u64 snap_names_len = 0; + u32 snap_count = 0; u64 ver; + size_t len; + /* + * First reads the fixed-size header to determine the number + * of snapshots, then re-reads it, along with all snapshot + * records as well as their stored names. + */ + len = sizeof (*dh); while (1) { - int len = sizeof(*dh) + - snap_count * sizeof(struct rbd_image_snap_ondisk) + - snap_names_len; - - rc = -ENOMEM; dh = kmalloc(len, GFP_KERNEL); if (!dh) return -ENOMEM; rc = rbd_req_sync_read(rbd_dev, - NULL, CEPH_NOSNAP, - rbd_dev->obj_md_name, + CEPH_NOSNAP, + rbd_dev->header_name, 0, len, (char *)dh, &ver); if (rc < 0) goto out_dh; - rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); - if (rc < 0) + rc = rbd_header_from_disk(header, dh, snap_count); + if (rc < 0) { + if (rc == -ENXIO) + pr_warning("unrecognized header format" + " for image %s\n", + rbd_dev->image_name); goto out_dh; - - if (snap_count != header->total_snaps) { - snap_count = header->total_snaps; - snap_names_len = header->snap_names_len; - rbd_header_free(header); - kfree(dh); - continue; } - break; + + if (snap_count == header->total_snaps) + break; + + snap_count = header->total_snaps; + len = sizeof (*dh) + + snap_count * sizeof(struct rbd_image_snap_ondisk) + + header->snap_names_len; + + rbd_header_free(header); + kfree(dh); } header->obj_version = ver; @@ -1632,7 +1650,7 @@ out_dh: /* * create a snapshot */ -static int rbd_header_add_snap(struct rbd_device *dev, +static int rbd_header_add_snap(struct rbd_device *rbd_dev, const char *snap_name, gfp_t gfp_flags) { @@ -1640,15 +1658,15 @@ static int rbd_header_add_snap(struct rbd_device *dev, u64 new_snapid; int ret; void *data, *p, *e; - u64 ver; + struct ceph_mon_client *monc; /* we should create a snapshot only if we're pointing at the head */ - if (dev->cur_snap) + if (rbd_dev->snap_id != CEPH_NOSNAP) return -EINVAL; - ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, - &new_snapid); - dout("created snapid=%lld\n", new_snapid); + monc = &rbd_dev->rbd_client->client->monc; + ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); + dout("created snapid=%llu\n", (unsigned long long) new_snapid); if (ret < 0) return ret; @@ -1662,17 +1680,13 @@ static int rbd_header_add_snap(struct rbd_device *dev, ceph_encode_string_safe(&p, e, snap_name, name_len, bad); ceph_encode_64_safe(&p, e, new_snapid, bad); - ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", - data, p - data, &ver); + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "snap_add", + data, p - data, NULL); kfree(data); - if (ret < 0) - return ret; - - dev->header.snapc->seq = new_snapid; - - return 0; + return ret < 0 ? ret : 0; bad: return -ERANGE; } @@ -1680,56 +1694,67 @@ bad: static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) { struct rbd_snap *snap; + struct rbd_snap *next; - while (!list_empty(&rbd_dev->snaps)) { - snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node); - __rbd_remove_snap_dev(rbd_dev, snap); - } + list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) + __rbd_remove_snap_dev(snap); } /* * only read the first part of the ondisk header, without the snaps info */ -static int __rbd_update_snaps(struct rbd_device *rbd_dev) +static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) { int ret; struct rbd_image_header h; - u64 snap_seq; - int follow_seq = 0; ret = rbd_read_header(rbd_dev, &h); if (ret < 0) return ret; - /* resized? */ - set_capacity(rbd_dev->disk, h.image_size / 512ULL); + down_write(&rbd_dev->header_rwsem); - down_write(&rbd_dev->header.snap_rwsem); + /* resized? */ + if (rbd_dev->snap_id == CEPH_NOSNAP) { + sector_t size = (sector_t) h.image_size / SECTOR_SIZE; - snap_seq = rbd_dev->header.snapc->seq; - if (rbd_dev->header.total_snaps && - rbd_dev->header.snapc->snaps[0] == snap_seq) - /* pointing at the head, will need to follow that - if head moves */ - follow_seq = 1; + dout("setting size to %llu sectors", (unsigned long long) size); + set_capacity(rbd_dev->disk, size); + } - kfree(rbd_dev->header.snapc); - kfree(rbd_dev->header.snap_names); + /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); + kfree(rbd_dev->header.snap_names); + /* osd requests may still refer to snapc */ + ceph_put_snap_context(rbd_dev->header.snapc); + if (hver) + *hver = h.obj_version; + rbd_dev->header.obj_version = h.obj_version; + rbd_dev->header.image_size = h.image_size; rbd_dev->header.total_snaps = h.total_snaps; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_names_len = h.snap_names_len; rbd_dev->header.snap_sizes = h.snap_sizes; - if (follow_seq) - rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0]; - else - rbd_dev->header.snapc->seq = snap_seq; + /* Free the extra copy of the object prefix */ + WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); + kfree(h.object_prefix); ret = __rbd_init_snaps_header(rbd_dev); - up_write(&rbd_dev->header.snap_rwsem); + up_write(&rbd_dev->header_rwsem); + + return ret; +} + +static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +{ + int ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + ret = __rbd_refresh_header(rbd_dev, hver); + mutex_unlock(&ctl_mutex); return ret; } @@ -1739,6 +1764,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct gendisk *disk; struct request_queue *q; int rc; + u64 segment_size; u64 total_size = 0; /* contact OSD, request size info about the object being mapped */ @@ -1751,7 +1777,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (rc) return rc; - rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size); + rc = rbd_header_set_snap(rbd_dev, &total_size); if (rc) return rc; @@ -1761,8 +1787,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!disk) goto out; - snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d", - rbd_dev->id); + snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", + rbd_dev->dev_id); disk->major = rbd_dev->major; disk->first_minor = 0; disk->fops = &rbd_bd_ops; @@ -1774,11 +1800,15 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!q) goto out_disk; + /* We use the default size, but let's be explicit about it. */ + blk_queue_physical_block_size(q, SECTOR_SIZE); + /* set io sizes to object size */ - blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL); - blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header)); - blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header)); - blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header)); + segment_size = rbd_obj_bytes(&rbd_dev->header); + blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); + blk_queue_max_segment_size(q, segment_size); + blk_queue_io_min(q, segment_size); + blk_queue_io_opt(q, segment_size); blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; @@ -1789,7 +1819,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->q = q; /* finally, announce the disk to the world */ - set_capacity(disk, total_size / 512ULL); + set_capacity(disk, total_size / SECTOR_SIZE); add_disk(disk); pr_info("%s: added with size 0x%llx\n", @@ -1806,18 +1836,28 @@ out: sysfs */ +static struct rbd_device *dev_to_rbd_dev(struct device *dev) +{ + return container_of(dev, struct rbd_device, dev); +} + static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + sector_t size; + + down_read(&rbd_dev->header_rwsem); + size = get_capacity(rbd_dev->disk); + up_read(&rbd_dev->header_rwsem); - return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); + return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); } static ssize_t rbd_major_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%d\n", rbd_dev->major); } @@ -1825,32 +1865,41 @@ static ssize_t rbd_major_show(struct device *dev, static ssize_t rbd_client_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client)); + return sprintf(buf, "client%lld\n", + ceph_client_id(rbd_dev->rbd_client->client)); } static ssize_t rbd_pool_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->pool_name); } +static ssize_t rbd_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%d\n", rbd_dev->pool_id); +} + static ssize_t rbd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->obj); + return sprintf(buf, "%s\n", rbd_dev->image_name); } static ssize_t rbd_snap_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%s\n", rbd_dev->snap_name); } @@ -1860,40 +1909,34 @@ static ssize_t rbd_image_refresh(struct device *dev, const char *buf, size_t size) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); - int rc; - int ret = size; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + int ret; - rc = __rbd_update_snaps(rbd_dev); - if (rc < 0) - ret = rc; + ret = rbd_refresh_header(rbd_dev, NULL); - mutex_unlock(&ctl_mutex); - return ret; + return ret < 0 ? ret : size; } static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); +static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add); -static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback); static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, &dev_attr_major.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, + &dev_attr_pool_id.attr, &dev_attr_name.attr, &dev_attr_current_snap.attr, &dev_attr_refresh.attr, &dev_attr_create_snap.attr, - &dev_attr_rollback_snap.attr, NULL }; @@ -1927,7 +1970,7 @@ static ssize_t rbd_snap_size_show(struct device *dev, { struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - return sprintf(buf, "%lld\n", (long long)snap->size); + return sprintf(buf, "%llu\n", (unsigned long long)snap->size); } static ssize_t rbd_snap_id_show(struct device *dev, @@ -1936,7 +1979,7 @@ static ssize_t rbd_snap_id_show(struct device *dev, { struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - return sprintf(buf, "%lld\n", (long long)snap->id); + return sprintf(buf, "%llu\n", (unsigned long long)snap->id); } static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); @@ -1969,15 +2012,13 @@ static struct device_type rbd_snap_device_type = { .release = rbd_snap_dev_release, }; -static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap) +static void __rbd_remove_snap_dev(struct rbd_snap *snap) { list_del(&snap->node); device_unregister(&snap->dev); } -static int rbd_register_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap, +static int rbd_register_snap_dev(struct rbd_snap *snap, struct device *parent) { struct device *dev = &snap->dev; @@ -1992,29 +2033,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev, return ret; } -static int __rbd_add_snap_dev(struct rbd_device *rbd_dev, - int i, const char *name, - struct rbd_snap **snapp) +static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, + int i, const char *name) { + struct rbd_snap *snap; int ret; - struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL); + + snap = kzalloc(sizeof (*snap), GFP_KERNEL); if (!snap) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; snap->name = kstrdup(name, GFP_KERNEL); + if (!snap->name) + goto err; + snap->size = rbd_dev->header.snap_sizes[i]; snap->id = rbd_dev->header.snapc->snaps[i]; if (device_is_registered(&rbd_dev->dev)) { - ret = rbd_register_snap_dev(rbd_dev, snap, - &rbd_dev->dev); + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); if (ret < 0) goto err; } - *snapp = snap; - return 0; + + return snap; + err: kfree(snap->name); kfree(snap); - return ret; + + return ERR_PTR(ret); } /* @@ -2047,7 +2095,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) const char *name, *first_name; int i = rbd_dev->header.total_snaps; struct rbd_snap *snap, *old_snap = NULL; - int ret; struct list_head *p, *n; first_name = rbd_dev->header.snap_names; @@ -2062,8 +2109,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) cur_id = rbd_dev->header.snapc->snaps[i - 1]; if (!i || old_snap->id < cur_id) { - /* old_snap->id was skipped, thus was removed */ - __rbd_remove_snap_dev(rbd_dev, old_snap); + /* + * old_snap->id was skipped, thus was + * removed. If this rbd_dev is mapped to + * the removed snapshot, record that it no + * longer exists, to prevent further I/O. + */ + if (rbd_dev->snap_id == old_snap->id) + rbd_dev->snap_exists = false; + __rbd_remove_snap_dev(old_snap); continue; } if (old_snap->id == cur_id) { @@ -2083,9 +2137,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) if (cur_id >= old_snap->id) break; /* a new snapshot */ - ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); - if (ret < 0) - return ret; + snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); + if (IS_ERR(snap)) + return PTR_ERR(snap); /* note that we add it backward so using n and not p */ list_add(&snap->node, n); @@ -2099,28 +2153,18 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) WARN_ON(1); return -EINVAL; } - ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); - if (ret < 0) - return ret; + snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); + if (IS_ERR(snap)) + return PTR_ERR(snap); list_add(&snap->node, &rbd_dev->snaps); } return 0; } - -static void rbd_root_dev_release(struct device *dev) -{ -} - -static struct device rbd_root_dev = { - .init_name = "rbd", - .release = rbd_root_dev_release, -}; - static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { - int ret = -ENOMEM; + int ret; struct device *dev; struct rbd_snap *snap; @@ -2131,21 +2175,17 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev->type = &rbd_device_type; dev->parent = &rbd_root_dev; dev->release = rbd_dev_release; - dev_set_name(dev, "%d", rbd_dev->id); + dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); if (ret < 0) - goto done_free; + goto out; list_for_each_entry(snap, &rbd_dev->snaps, node) { - ret = rbd_register_snap_dev(rbd_dev, snap, - &rbd_dev->dev); + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); if (ret < 0) break; } - - mutex_unlock(&ctl_mutex); - return 0; -done_free: +out: mutex_unlock(&ctl_mutex); return ret; } @@ -2160,12 +2200,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) int ret, rc; do { - ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, - rbd_dev->header.obj_version); + ret = rbd_req_sync_watch(rbd_dev); if (ret == -ERANGE) { - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rc = __rbd_update_snaps(rbd_dev); - mutex_unlock(&ctl_mutex); + rc = rbd_refresh_header(rbd_dev, NULL); if (rc < 0) return rc; } @@ -2174,102 +2211,305 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) return ret; } +static atomic64_t rbd_id_max = ATOMIC64_INIT(0); + +/* + * Get a unique rbd identifier for the given new rbd_dev, and add + * the rbd_dev to the global list. The minimum rbd id is 1. + */ +static void rbd_id_get(struct rbd_device *rbd_dev) +{ + rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); + + spin_lock(&rbd_dev_list_lock); + list_add_tail(&rbd_dev->node, &rbd_dev_list); + spin_unlock(&rbd_dev_list_lock); +} + +/* + * Remove an rbd_dev from the global list, and record that its + * identifier is no longer in use. + */ +static void rbd_id_put(struct rbd_device *rbd_dev) +{ + struct list_head *tmp; + int rbd_id = rbd_dev->dev_id; + int max_id; + + BUG_ON(rbd_id < 1); + + spin_lock(&rbd_dev_list_lock); + list_del_init(&rbd_dev->node); + + /* + * If the id being "put" is not the current maximum, there + * is nothing special we need to do. + */ + if (rbd_id != atomic64_read(&rbd_id_max)) { + spin_unlock(&rbd_dev_list_lock); + return; + } + + /* + * We need to update the current maximum id. Search the + * list to find out what it is. We're more likely to find + * the maximum at the end, so search the list backward. + */ + max_id = 0; + list_for_each_prev(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_id > max_id) + max_id = rbd_id; + } + spin_unlock(&rbd_dev_list_lock); + + /* + * The max id could have been updated by rbd_id_get(), in + * which case it now accurately reflects the new maximum. + * Be careful not to overwrite the maximum value in that + * case. + */ + atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); +} + +/* + * Skips over white space at *buf, and updates *buf to point to the + * first found non-space character (if any). Returns the length of + * the token (string of non-white space characters) found. Note + * that *buf must be terminated with '\0'. + */ +static inline size_t next_token(const char **buf) +{ + /* + * These are the characters that produce nonzero for + * isspace() in the "C" and "POSIX" locales. + */ + const char *spaces = " \f\n\r\t\v"; + + *buf += strspn(*buf, spaces); /* Find start of token */ + + return strcspn(*buf, spaces); /* Return token length */ +} + +/* + * Finds the next token in *buf, and if the provided token buffer is + * big enough, copies the found token into it. The result, if + * copied, is guaranteed to be terminated with '\0'. Note that *buf + * must be terminated with '\0' on entry. + * + * Returns the length of the token found (not including the '\0'). + * Return value will be 0 if no token is found, and it will be >= + * token_size if the token would not fit. + * + * The *buf pointer will be updated to point beyond the end of the + * found token. Note that this occurs even if the token buffer is + * too small to hold it. + */ +static inline size_t copy_token(const char **buf, + char *token, + size_t token_size) +{ + size_t len; + + len = next_token(buf); + if (len < token_size) { + memcpy(token, *buf, len); + *(token + len) = '\0'; + } + *buf += len; + + return len; +} + +/* + * Finds the next token in *buf, dynamically allocates a buffer big + * enough to hold a copy of it, and copies the token into the new + * buffer. The copy is guaranteed to be terminated with '\0'. Note + * that a duplicate buffer is created even for a zero-length token. + * + * Returns a pointer to the newly-allocated duplicate, or a null + * pointer if memory for the duplicate was not available. If + * the lenp argument is a non-null pointer, the length of the token + * (not including the '\0') is returned in *lenp. + * + * If successful, the *buf pointer will be updated to point beyond + * the end of the found token. + * + * Note: uses GFP_KERNEL for allocation. + */ +static inline char *dup_token(const char **buf, size_t *lenp) +{ + char *dup; + size_t len; + + len = next_token(buf); + dup = kmalloc(len + 1, GFP_KERNEL); + if (!dup) + return NULL; + + memcpy(dup, *buf, len); + *(dup + len) = '\0'; + *buf += len; + + if (lenp) + *lenp = len; + + return dup; +} + +/* + * This fills in the pool_name, image_name, image_name_len, snap_name, + * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based + * on the list of monitor addresses and other options provided via + * /sys/bus/rbd/add. + * + * Note: rbd_dev is assumed to have been initially zero-filled. + */ +static int rbd_add_parse_args(struct rbd_device *rbd_dev, + const char *buf, + const char **mon_addrs, + size_t *mon_addrs_size, + char *options, + size_t options_size) +{ + size_t len; + int ret; + + /* The first four tokens are required */ + + len = next_token(&buf); + if (!len) + return -EINVAL; + *mon_addrs_size = len + 1; + *mon_addrs = buf; + + buf += len; + + len = copy_token(&buf, options, options_size); + if (!len || len >= options_size) + return -EINVAL; + + ret = -ENOMEM; + rbd_dev->pool_name = dup_token(&buf, NULL); + if (!rbd_dev->pool_name) + goto out_err; + + rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); + if (!rbd_dev->image_name) + goto out_err; + + /* Create the name of the header object */ + + rbd_dev->header_name = kmalloc(rbd_dev->image_name_len + + sizeof (RBD_SUFFIX), + GFP_KERNEL); + if (!rbd_dev->header_name) + goto out_err; + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); + + /* + * The snapshot name is optional. If none is is supplied, + * we use the default value. + */ + rbd_dev->snap_name = dup_token(&buf, &len); + if (!rbd_dev->snap_name) + goto out_err; + if (!len) { + /* Replace the empty name with the default */ + kfree(rbd_dev->snap_name); + rbd_dev->snap_name + = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); + if (!rbd_dev->snap_name) + goto out_err; + + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + sizeof (RBD_SNAP_HEAD_NAME)); + } + + return 0; + +out_err: + kfree(rbd_dev->header_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); + rbd_dev->pool_name = NULL; + + return ret; +} + static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { - struct ceph_osd_client *osdc; - struct rbd_device *rbd_dev; - ssize_t rc = -ENOMEM; - int irc, new_id = 0; - struct list_head *tmp; - char *mon_dev_name; char *options; + struct rbd_device *rbd_dev = NULL; + const char *mon_addrs = NULL; + size_t mon_addrs_size = 0; + struct ceph_osd_client *osdc; + int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) return -ENODEV; - mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); - if (!mon_dev_name) - goto err_out_mod; - - options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + options = kmalloc(count, GFP_KERNEL); if (!options) - goto err_mon_dev; - - /* new rbd_device object */ + goto err_nomem; rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) - goto err_out_opt; + goto err_nomem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); + init_rwsem(&rbd_dev->header_rwsem); /* generate unique id: find highest unique id, add one */ - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - list_for_each(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; + rbd_id_get(rbd_dev); - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id >= new_id) - new_id = rbd_dev->id + 1; - } - - rbd_dev->id = new_id; - - /* add to global list */ - list_add_tail(&rbd_dev->node, &rbd_dev_list); + /* Fill in the device name, now that we have its id. */ + BUILD_BUG_ON(DEV_NAME_LEN + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); /* parse add command */ - if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_OPT_LEN) "s " - "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " - "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" - "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", - mon_dev_name, options, rbd_dev->pool_name, - rbd_dev->obj, rbd_dev->snap_name) < 4) { - rc = -EINVAL; - goto err_out_slot; - } - - if (rbd_dev->snap_name[0] == 0) - rbd_dev->snap_name[0] = '-'; - - rbd_dev->obj_len = strlen(rbd_dev->obj); - snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", - rbd_dev->obj, RBD_SUFFIX); - - /* initialize rest of new object */ - snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); - rc = rbd_get_client(rbd_dev, mon_dev_name, options); - if (rc < 0) - goto err_out_slot; + rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, + options, count); + if (rc) + goto err_put_id; - mutex_unlock(&ctl_mutex); + rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1, + options); + if (IS_ERR(rbd_dev->rbd_client)) { + rc = PTR_ERR(rbd_dev->rbd_client); + goto err_put_id; + } /* pick the pool */ - osdc = &rbd_dev->client->osdc; + osdc = &rbd_dev->rbd_client->client->osdc; rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); if (rc < 0) goto err_out_client; - rbd_dev->poolid = rc; + rbd_dev->pool_id = rc; /* register our block device */ - irc = register_blkdev(0, rbd_dev->name); - if (irc < 0) { - rc = irc; + rc = register_blkdev(0, rbd_dev->name); + if (rc < 0) goto err_out_client; - } - rbd_dev->major = irc; + rbd_dev->major = rc; rc = rbd_bus_add_dev(rbd_dev); if (rc) goto err_out_blkdev; - /* set up and announce blkdev mapping */ + /* + * At this point cleanup in the event of an error is the job + * of the sysfs code (initiated by rbd_bus_del_dev()). + * + * Set up and announce blkdev mapping. + */ rc = rbd_init_disk(rbd_dev); if (rc) goto err_out_bus; @@ -2281,66 +2521,76 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_bus: - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - list_del_init(&rbd_dev->node); - mutex_unlock(&ctl_mutex); - /* this will also clean up rest of rbd_dev stuff */ rbd_bus_del_dev(rbd_dev); kfree(options); - kfree(mon_dev_name); return rc; err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_client: rbd_put_client(rbd_dev); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); -err_out_slot: - list_del_init(&rbd_dev->node); - mutex_unlock(&ctl_mutex); - +err_put_id: + if (rbd_dev->pool_name) { + kfree(rbd_dev->snap_name); + kfree(rbd_dev->header_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); + } + rbd_id_put(rbd_dev); +err_nomem: kfree(rbd_dev); -err_out_opt: kfree(options); -err_mon_dev: - kfree(mon_dev_name); -err_out_mod: + dout("Error adding device %s\n", buf); module_put(THIS_MODULE); - return rc; + + return (ssize_t) rc; } -static struct rbd_device *__rbd_get_dev(unsigned long id) +static struct rbd_device *__rbd_get_dev(unsigned long dev_id) { struct list_head *tmp; struct rbd_device *rbd_dev; + spin_lock(&rbd_dev_list_lock); list_for_each(tmp, &rbd_dev_list) { rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id == id) + if (rbd_dev->dev_id == dev_id) { + spin_unlock(&rbd_dev_list_lock); return rbd_dev; + } } + spin_unlock(&rbd_dev_list_lock); return NULL; } static void rbd_dev_release(struct device *dev) { - struct rbd_device *rbd_dev = - container_of(dev, struct rbd_device, dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + if (rbd_dev->watch_request) { + struct ceph_client *client = rbd_dev->rbd_client->client; - if (rbd_dev->watch_request) - ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, + ceph_osdc_unregister_linger_request(&client->osdc, rbd_dev->watch_request); + } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); + rbd_req_sync_unwatch(rbd_dev); rbd_put_client(rbd_dev); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); + + /* done with the id, and with the rbd_dev */ + kfree(rbd_dev->snap_name); + kfree(rbd_dev->header_name); + kfree(rbd_dev->pool_name); + kfree(rbd_dev->image_name); + rbd_id_put(rbd_dev); kfree(rbd_dev); /* release module ref */ @@ -2373,8 +2623,6 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - list_del_init(&rbd_dev->node); - __rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); @@ -2388,7 +2636,7 @@ static ssize_t rbd_snap_add(struct device *dev, const char *buf, size_t count) { - struct rbd_device *rbd_dev = dev_to_rbd(dev); + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int ret; char *name = kmalloc(count + 1, GFP_KERNEL); if (!name) @@ -2403,7 +2651,7 @@ static ssize_t rbd_snap_add(struct device *dev, if (ret < 0) goto err_unlock; - ret = __rbd_update_snaps(rbd_dev); + ret = __rbd_refresh_header(rbd_dev, NULL); if (ret < 0) goto err_unlock; @@ -2412,7 +2660,7 @@ static ssize_t rbd_snap_add(struct device *dev, mutex_unlock(&ctl_mutex); /* make a best effort, don't error if failed */ - rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); + rbd_req_sync_notify(rbd_dev); ret = count; kfree(name); @@ -2424,70 +2672,6 @@ err_unlock: return ret; } -static ssize_t rbd_snap_rollback(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - struct rbd_device *rbd_dev = dev_to_rbd(dev); - int ret; - u64 snapid; - u64 cur_ofs; - char *seg_name = NULL; - char *snap_name = kmalloc(count + 1, GFP_KERNEL); - ret = -ENOMEM; - if (!snap_name) - return ret; - - /* parse snaps add command */ - snprintf(snap_name, count, "%s", buf); - seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); - if (!seg_name) - goto done; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL); - if (ret < 0) - goto done_unlock; - - dout("snapid=%lld\n", snapid); - - cur_ofs = 0; - while (cur_ofs < rbd_dev->header.image_size) { - cur_ofs += rbd_get_segment(&rbd_dev->header, - rbd_dev->obj, - cur_ofs, (u64)-1, - seg_name, NULL); - dout("seg_name=%s\n", seg_name); - - ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name); - if (ret < 0) - pr_warning("could not roll back obj %s err=%d\n", - seg_name, ret); - } - - ret = __rbd_update_snaps(rbd_dev); - if (ret < 0) - goto done_unlock; - - ret = count; - -done_unlock: - mutex_unlock(&ctl_mutex); -done: - kfree(seg_name); - kfree(snap_name); - - return ret; -} - -static struct bus_attribute rbd_bus_attrs[] = { - __ATTR(add, S_IWUSR, NULL, rbd_add), - __ATTR(remove, S_IWUSR, NULL, rbd_remove), - __ATTR_NULL -}; - /* * create control files in sysfs * /sys/bus/rbd/... @@ -2496,21 +2680,21 @@ static int rbd_sysfs_init(void) { int ret; - rbd_bus_type.bus_attrs = rbd_bus_attrs; - - ret = bus_register(&rbd_bus_type); - if (ret < 0) + ret = device_register(&rbd_root_dev); + if (ret < 0) return ret; - ret = device_register(&rbd_root_dev); + ret = bus_register(&rbd_bus_type); + if (ret < 0) + device_unregister(&rbd_root_dev); return ret; } static void rbd_sysfs_cleanup(void) { - device_unregister(&rbd_root_dev); bus_unregister(&rbd_bus_type); + device_unregister(&rbd_root_dev); } int __init rbd_init(void) @@ -2520,8 +2704,7 @@ int __init rbd_init(void) rc = rbd_sysfs_init(); if (rc) return rc; - spin_lock_init(&node_lock); - pr_info("loaded " DRV_NAME_LONG "\n"); + pr_info("loaded " RBD_DRV_NAME_LONG "\n"); return 0; } diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index fc6c678aa2cb..0924e9e41a60 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -31,7 +31,6 @@ #define RBD_MIN_OBJ_ORDER 16 #define RBD_MAX_OBJ_ORDER 30 -#define RBD_MAX_OBJ_NAME_LEN 96 #define RBD_MAX_SEG_NAME_LEN 128 #define RBD_COMP_NONE 0 @@ -41,10 +40,6 @@ #define RBD_HEADER_SIGNATURE "RBD" #define RBD_HEADER_VERSION "001.005" -struct rbd_info { - __le64 max_id; -} __attribute__ ((packed)); - struct rbd_image_snap_ondisk { __le64 id; __le64 image_size; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 48e8fee9f2d4..9dcf76a10bb6 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -839,10 +839,7 @@ static struct vio_driver vdc_port_driver = { .id_table = vdc_port_match, .probe = vdc_port_probe, .remove = vdc_port_remove, - .driver = { - .name = "vdc_port", - .owner = THIS_MODULE, - } + .name = "vdc_port", }; static int __init vdc_init(void) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index fd5adcd55944..6d5a914b9619 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -26,7 +26,6 @@ #include <linux/delay.h> #include <linux/platform_device.h> -#include <asm/macintosh.h> #include <asm/mac_via.h> #define CARDNAME "swim" diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index ae3e167e17ad..89ddab127e33 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -16,6 +16,8 @@ * handle GCR disks */ +#undef DEBUG + #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -36,13 +38,11 @@ #include <asm/machdep.h> #include <asm/pmac_feature.h> -static DEFINE_MUTEX(swim3_mutex); -static struct request_queue *swim3_queue; -static struct gendisk *disks[2]; -static struct request *fd_req; - #define MAX_FLOPPIES 2 +static DEFINE_MUTEX(swim3_mutex); +static struct gendisk *disks[MAX_FLOPPIES]; + enum swim_state { idle, locating, @@ -177,7 +177,6 @@ struct swim3 { struct floppy_state { enum swim_state state; - spinlock_t lock; struct swim3 __iomem *swim3; /* hardware registers */ struct dbdma_regs __iomem *dma; /* DMA controller registers */ int swim3_intr; /* interrupt number for SWIM3 */ @@ -204,8 +203,20 @@ struct floppy_state { int wanted; struct macio_dev *mdev; char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)]; + int index; + struct request *cur_req; }; +#define swim3_err(fmt, arg...) dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) +#define swim3_warn(fmt, arg...) dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) +#define swim3_info(fmt, arg...) dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) + +#ifdef DEBUG +#define swim3_dbg(fmt, arg...) dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) +#else +#define swim3_dbg(fmt, arg...) do { } while(0) +#endif + static struct floppy_state floppy_states[MAX_FLOPPIES]; static int floppy_count = 0; static DEFINE_SPINLOCK(swim3_lock); @@ -224,17 +235,8 @@ static unsigned short write_postamble[] = { 0, 0, 0, 0, 0, 0 }; -static void swim3_select(struct floppy_state *fs, int sel); -static void swim3_action(struct floppy_state *fs, int action); -static int swim3_readbit(struct floppy_state *fs, int bit); -static void do_fd_request(struct request_queue * q); -static void start_request(struct floppy_state *fs); -static void set_timeout(struct floppy_state *fs, int nticks, - void (*proc)(unsigned long)); -static void scan_track(struct floppy_state *fs); static void seek_track(struct floppy_state *fs, int n); static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count); -static void setup_transfer(struct floppy_state *fs); static void act(struct floppy_state *fs); static void scan_timeout(unsigned long data); static void seek_timeout(unsigned long data); @@ -254,18 +256,21 @@ static unsigned int floppy_check_events(struct gendisk *disk, unsigned int clearing); static int floppy_revalidate(struct gendisk *disk); -static bool swim3_end_request(int err, unsigned int nr_bytes) +static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes) { - if (__blk_end_request(fd_req, err, nr_bytes)) - return true; + struct request *req = fs->cur_req; + int rc; - fd_req = NULL; - return false; -} + swim3_dbg(" end request, err=%d nr_bytes=%d, cur_req=%p\n", + err, nr_bytes, req); -static bool swim3_end_request_cur(int err) -{ - return swim3_end_request(err, blk_rq_cur_bytes(fd_req)); + if (err) + nr_bytes = blk_rq_cur_bytes(req); + rc = __blk_end_request(req, err, nr_bytes); + if (rc) + return true; + fs->cur_req = NULL; + return false; } static void swim3_select(struct floppy_state *fs, int sel) @@ -303,50 +308,53 @@ static int swim3_readbit(struct floppy_state *fs, int bit) return (stat & DATA) == 0; } -static void do_fd_request(struct request_queue * q) -{ - int i; - - for(i=0; i<floppy_count; i++) { - struct floppy_state *fs = &floppy_states[i]; - if (fs->mdev->media_bay && - check_media_bay(fs->mdev->media_bay) != MB_FD) - continue; - start_request(fs); - } -} - static void start_request(struct floppy_state *fs) { struct request *req; unsigned long x; + swim3_dbg("start request, initial state=%d\n", fs->state); + if (fs->state == idle && fs->wanted) { fs->state = available; wake_up(&fs->wait); return; } while (fs->state == idle) { - if (!fd_req) { - fd_req = blk_fetch_request(swim3_queue); - if (!fd_req) + swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req); + if (!fs->cur_req) { + fs->cur_req = blk_fetch_request(disks[fs->index]->queue); + swim3_dbg(" fetched request %p\n", fs->cur_req); + if (!fs->cur_req) break; } - req = fd_req; -#if 0 - printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", - req->rq_disk->disk_name, req->cmd, - (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer); - printk(" errors=%d current_nr_sectors=%u\n", - req->errors, blk_rq_cur_sectors(req)); + req = fs->cur_req; + + if (fs->mdev->media_bay && + check_media_bay(fs->mdev->media_bay) != MB_FD) { + swim3_dbg("%s", " media bay absent, dropping req\n"); + swim3_end_request(fs, -ENODEV, 0); + continue; + } + +#if 0 /* This is really too verbose */ + swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", + req->rq_disk->disk_name, req->cmd, + (long)blk_rq_pos(req), blk_rq_sectors(req), + req->buffer); + swim3_dbg(" errors=%d current_nr_sectors=%u\n", + req->errors, blk_rq_cur_sectors(req)); #endif if (blk_rq_pos(req) >= fs->total_secs) { - swim3_end_request_cur(-EIO); + swim3_dbg(" pos out of bounds (%ld, max is %ld)\n", + (long)blk_rq_pos(req), (long)fs->total_secs); + swim3_end_request(fs, -EIO, 0); continue; } if (fs->ejected) { - swim3_end_request_cur(-EIO); + swim3_dbg("%s", " disk ejected\n"); + swim3_end_request(fs, -EIO, 0); continue; } @@ -354,7 +362,8 @@ static void start_request(struct floppy_state *fs) if (fs->write_prot < 0) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) { - swim3_end_request_cur(-EIO); + swim3_dbg("%s", " try to write, disk write protected\n"); + swim3_end_request(fs, -EIO, 0); continue; } } @@ -369,7 +378,6 @@ static void start_request(struct floppy_state *fs) x = ((long)blk_rq_pos(req)) % fs->secpercyl; fs->head = x / fs->secpertrack; fs->req_sector = x % fs->secpertrack + 1; - fd_req = req; fs->state = do_transfer; fs->retries = 0; @@ -377,12 +385,14 @@ static void start_request(struct floppy_state *fs) } } +static void do_fd_request(struct request_queue * q) +{ + start_request(q->queuedata); +} + static void set_timeout(struct floppy_state *fs, int nticks, void (*proc)(unsigned long)) { - unsigned long flags; - - spin_lock_irqsave(&fs->lock, flags); if (fs->timeout_pending) del_timer(&fs->timeout); fs->timeout.expires = jiffies + nticks; @@ -390,7 +400,6 @@ static void set_timeout(struct floppy_state *fs, int nticks, fs->timeout.data = (unsigned long) fs; add_timer(&fs->timeout); fs->timeout_pending = 1; - spin_unlock_irqrestore(&fs->lock, flags); } static inline void scan_track(struct floppy_state *fs) @@ -442,40 +451,45 @@ static inline void setup_transfer(struct floppy_state *fs) struct swim3 __iomem *sw = fs->swim3; struct dbdma_cmd *cp = fs->dma_cmd; struct dbdma_regs __iomem *dr = fs->dma; + struct request *req = fs->cur_req; - if (blk_rq_cur_sectors(fd_req) <= 0) { - printk(KERN_ERR "swim3: transfer 0 sectors?\n"); + if (blk_rq_cur_sectors(req) <= 0) { + swim3_warn("%s", "Transfer 0 sectors ?\n"); return; } - if (rq_data_dir(fd_req) == WRITE) + if (rq_data_dir(req) == WRITE) n = 1; else { n = fs->secpertrack - fs->req_sector + 1; - if (n > blk_rq_cur_sectors(fd_req)) - n = blk_rq_cur_sectors(fd_req); + if (n > blk_rq_cur_sectors(req)) + n = blk_rq_cur_sectors(req); } + + swim3_dbg(" setup xfer at sect %d (of %d) head %d for %d\n", + fs->req_sector, fs->secpertrack, fs->head, n); + fs->scount = n; swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0); out_8(&sw->sector, fs->req_sector); out_8(&sw->nsect, n); out_8(&sw->gap3, 0); out_le32(&dr->cmdptr, virt_to_bus(cp)); - if (rq_data_dir(fd_req) == WRITE) { + if (rq_data_dir(req) == WRITE) { /* Set up 3 dma commands: write preamble, data, postamble */ init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); ++cp; - init_dma(cp, OUTPUT_MORE, fd_req->buffer, 512); + init_dma(cp, OUTPUT_MORE, req->buffer, 512); ++cp; init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); } else { - init_dma(cp, INPUT_LAST, fd_req->buffer, n * 512); + init_dma(cp, INPUT_LAST, req->buffer, n * 512); } ++cp; out_le16(&cp->command, DBDMA_STOP); out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); in_8(&sw->error); out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); - if (rq_data_dir(fd_req) == WRITE) + if (rq_data_dir(req) == WRITE) out_8(&sw->control_bis, WRITE_SECTORS); in_8(&sw->intr); out_le32(&dr->control, (RUN << 16) | RUN); @@ -488,12 +502,16 @@ static inline void setup_transfer(struct floppy_state *fs) static void act(struct floppy_state *fs) { for (;;) { + swim3_dbg(" act loop, state=%d, req_cyl=%d, cur_cyl=%d\n", + fs->state, fs->req_cyl, fs->cur_cyl); + switch (fs->state) { case idle: return; /* XXX shouldn't get here */ case locating: if (swim3_readbit(fs, TRACK_ZERO)) { + swim3_dbg("%s", " locate track 0\n"); fs->cur_cyl = 0; if (fs->req_cyl == 0) fs->state = do_transfer; @@ -511,7 +529,7 @@ static void act(struct floppy_state *fs) break; } if (fs->req_cyl == fs->cur_cyl) { - printk("whoops, seeking 0\n"); + swim3_warn("%s", "Whoops, seeking 0\n"); fs->state = do_transfer; break; } @@ -527,7 +545,9 @@ static void act(struct floppy_state *fs) case do_transfer: if (fs->cur_cyl != fs->req_cyl) { if (fs->retries > 5) { - swim3_end_request_cur(-EIO); + swim3_err("Wrong cylinder in transfer, want: %d got %d\n", + fs->req_cyl, fs->cur_cyl); + swim3_end_request(fs, -EIO, 0); fs->state = idle; return; } @@ -542,7 +562,7 @@ static void act(struct floppy_state *fs) return; default: - printk(KERN_ERR"swim3: unknown state %d\n", fs->state); + swim3_err("Unknown state %d\n", fs->state); return; } } @@ -552,59 +572,75 @@ static void scan_timeout(unsigned long data) { struct floppy_state *fs = (struct floppy_state *) data; struct swim3 __iomem *sw = fs->swim3; + unsigned long flags; + + swim3_dbg("* scan timeout, state=%d\n", fs->state); + spin_lock_irqsave(&swim3_lock, flags); fs->timeout_pending = 0; out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); fs->cur_cyl = -1; if (fs->retries > 5) { - swim3_end_request_cur(-EIO); + swim3_end_request(fs, -EIO, 0); fs->state = idle; start_request(fs); } else { fs->state = jogging; act(fs); } + spin_unlock_irqrestore(&swim3_lock, flags); } static void seek_timeout(unsigned long data) { struct floppy_state *fs = (struct floppy_state *) data; struct swim3 __iomem *sw = fs->swim3; + unsigned long flags; + + swim3_dbg("* seek timeout, state=%d\n", fs->state); + spin_lock_irqsave(&swim3_lock, flags); fs->timeout_pending = 0; out_8(&sw->control_bic, DO_SEEK); out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); - printk(KERN_ERR "swim3: seek timeout\n"); - swim3_end_request_cur(-EIO); + swim3_err("%s", "Seek timeout\n"); + swim3_end_request(fs, -EIO, 0); fs->state = idle; start_request(fs); + spin_unlock_irqrestore(&swim3_lock, flags); } static void settle_timeout(unsigned long data) { struct floppy_state *fs = (struct floppy_state *) data; struct swim3 __iomem *sw = fs->swim3; + unsigned long flags; + + swim3_dbg("* settle timeout, state=%d\n", fs->state); + spin_lock_irqsave(&swim3_lock, flags); fs->timeout_pending = 0; if (swim3_readbit(fs, SEEK_COMPLETE)) { out_8(&sw->select, RELAX); fs->state = locating; act(fs); - return; + goto unlock; } out_8(&sw->select, RELAX); if (fs->settle_time < 2*HZ) { ++fs->settle_time; set_timeout(fs, 1, settle_timeout); - return; + goto unlock; } - printk(KERN_ERR "swim3: seek settle timeout\n"); - swim3_end_request_cur(-EIO); + swim3_err("%s", "Seek settle timeout\n"); + swim3_end_request(fs, -EIO, 0); fs->state = idle; start_request(fs); + unlock: + spin_unlock_irqrestore(&swim3_lock, flags); } static void xfer_timeout(unsigned long data) @@ -612,8 +648,12 @@ static void xfer_timeout(unsigned long data) struct floppy_state *fs = (struct floppy_state *) data; struct swim3 __iomem *sw = fs->swim3; struct dbdma_regs __iomem *dr = fs->dma; + unsigned long flags; int n; + swim3_dbg("* xfer timeout, state=%d\n", fs->state); + + spin_lock_irqsave(&swim3_lock, flags); fs->timeout_pending = 0; out_le32(&dr->control, RUN << 16); /* We must wait a bit for dbdma to stop */ @@ -622,12 +662,13 @@ static void xfer_timeout(unsigned long data) out_8(&sw->intr_enable, 0); out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION); out_8(&sw->select, RELAX); - printk(KERN_ERR "swim3: timeout %sing sector %ld\n", - (rq_data_dir(fd_req)==WRITE? "writ": "read"), - (long)blk_rq_pos(fd_req)); - swim3_end_request_cur(-EIO); + swim3_err("Timeout %sing sector %ld\n", + (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"), + (long)blk_rq_pos(fs->cur_req)); + swim3_end_request(fs, -EIO, 0); fs->state = idle; start_request(fs); + spin_unlock_irqrestore(&swim3_lock, flags); } static irqreturn_t swim3_interrupt(int irq, void *dev_id) @@ -638,12 +679,17 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) int stat, resid; struct dbdma_regs __iomem *dr; struct dbdma_cmd *cp; + unsigned long flags; + struct request *req = fs->cur_req; + + swim3_dbg("* interrupt, state=%d\n", fs->state); + spin_lock_irqsave(&swim3_lock, flags); intr = in_8(&sw->intr); err = (intr & ERROR_INTR)? in_8(&sw->error): 0; if ((intr & ERROR_INTR) && fs->state != do_transfer) - printk(KERN_ERR "swim3_interrupt, state=%d, dir=%x, intr=%x, err=%x\n", - fs->state, rq_data_dir(fd_req), intr, err); + swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n", + fs->state, rq_data_dir(req), intr, err); switch (fs->state) { case locating: if (intr & SEEN_SECTOR) { @@ -653,10 +699,10 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) del_timer(&fs->timeout); fs->timeout_pending = 0; if (sw->ctrack == 0xff) { - printk(KERN_ERR "swim3: seen sector but cyl=ff?\n"); + swim3_err("%s", "Seen sector but cyl=ff?\n"); fs->cur_cyl = -1; if (fs->retries > 5) { - swim3_end_request_cur(-EIO); + swim3_end_request(fs, -EIO, 0); fs->state = idle; start_request(fs); } else { @@ -668,8 +714,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) fs->cur_cyl = sw->ctrack; fs->cur_sector = sw->csect; if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl) - printk(KERN_ERR "swim3: expected cyl %d, got %d\n", - fs->expect_cyl, fs->cur_cyl); + swim3_err("Expected cyl %d, got %d\n", + fs->expect_cyl, fs->cur_cyl); fs->state = do_transfer; act(fs); } @@ -704,7 +750,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) fs->timeout_pending = 0; dr = fs->dma; cp = fs->dma_cmd; - if (rq_data_dir(fd_req) == WRITE) + if (rq_data_dir(req) == WRITE) ++cp; /* * Check that the main data transfer has finished. @@ -729,31 +775,32 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) if (intr & ERROR_INTR) { n = fs->scount - 1 - resid / 512; if (n > 0) { - blk_update_request(fd_req, 0, n << 9); + blk_update_request(req, 0, n << 9); fs->req_sector += n; } if (fs->retries < 5) { ++fs->retries; act(fs); } else { - printk("swim3: error %sing block %ld (err=%x)\n", - rq_data_dir(fd_req) == WRITE? "writ": "read", - (long)blk_rq_pos(fd_req), err); - swim3_end_request_cur(-EIO); + swim3_err("Error %sing block %ld (err=%x)\n", + rq_data_dir(req) == WRITE? "writ": "read", + (long)blk_rq_pos(req), err); + swim3_end_request(fs, -EIO, 0); fs->state = idle; } } else { if ((stat & ACTIVE) == 0 || resid != 0) { /* musta been an error */ - printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid); - printk(KERN_ERR " state=%d, dir=%x, intr=%x, err=%x\n", - fs->state, rq_data_dir(fd_req), intr, err); - swim3_end_request_cur(-EIO); + swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid); + swim3_err(" state=%d, dir=%x, intr=%x, err=%x\n", + fs->state, rq_data_dir(req), intr, err); + swim3_end_request(fs, -EIO, 0); fs->state = idle; start_request(fs); break; } - if (swim3_end_request(0, fs->scount << 9)) { + fs->retries = 0; + if (swim3_end_request(fs, 0, fs->scount << 9)) { fs->req_sector += fs->scount; if (fs->req_sector > fs->secpertrack) { fs->req_sector -= fs->secpertrack; @@ -770,8 +817,9 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) start_request(fs); break; default: - printk(KERN_ERR "swim3: don't know what to do in state %d\n", fs->state); + swim3_err("Don't know what to do in state %d\n", fs->state); } + spin_unlock_irqrestore(&swim3_lock, flags); return IRQ_HANDLED; } @@ -781,26 +829,31 @@ static void fd_dma_interrupt(int irq, void *dev_id) } */ +/* Called under the mutex to grab exclusive access to a drive */ static int grab_drive(struct floppy_state *fs, enum swim_state state, int interruptible) { unsigned long flags; - spin_lock_irqsave(&fs->lock, flags); - if (fs->state != idle) { + swim3_dbg("%s", "-> grab drive\n"); + + spin_lock_irqsave(&swim3_lock, flags); + if (fs->state != idle && fs->state != available) { ++fs->wanted; while (fs->state != available) { + spin_unlock_irqrestore(&swim3_lock, flags); if (interruptible && signal_pending(current)) { --fs->wanted; - spin_unlock_irqrestore(&fs->lock, flags); return -EINTR; } interruptible_sleep_on(&fs->wait); + spin_lock_irqsave(&swim3_lock, flags); } --fs->wanted; } fs->state = state; - spin_unlock_irqrestore(&fs->lock, flags); + spin_unlock_irqrestore(&swim3_lock, flags); + return 0; } @@ -808,10 +861,12 @@ static void release_drive(struct floppy_state *fs) { unsigned long flags; - spin_lock_irqsave(&fs->lock, flags); + swim3_dbg("%s", "-> release drive\n"); + + spin_lock_irqsave(&swim3_lock, flags); fs->state = idle; start_request(fs); - spin_unlock_irqrestore(&fs->lock, flags); + spin_unlock_irqrestore(&swim3_lock, flags); } static int fd_eject(struct floppy_state *fs) @@ -966,6 +1021,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) { struct floppy_state *fs = disk->private_data; struct swim3 __iomem *sw = fs->swim3; + mutex_lock(&swim3_mutex); if (fs->ref_count > 0 && --fs->ref_count == 0) { swim3_action(fs, MOTOR_OFF); @@ -1031,30 +1087,48 @@ static const struct block_device_operations floppy_fops = { .revalidate_disk= floppy_revalidate, }; +static void swim3_mb_event(struct macio_dev* mdev, int mb_state) +{ + struct floppy_state *fs = macio_get_drvdata(mdev); + struct swim3 __iomem *sw = fs->swim3; + + if (!fs) + return; + if (mb_state != MB_FD) + return; + + /* Clear state */ + out_8(&sw->intr_enable, 0); + in_8(&sw->intr); + in_8(&sw->error); +} + static int swim3_add_device(struct macio_dev *mdev, int index) { struct device_node *swim = mdev->ofdev.dev.of_node; struct floppy_state *fs = &floppy_states[index]; int rc = -EBUSY; + /* Do this first for message macros */ + memset(fs, 0, sizeof(*fs)); + fs->mdev = mdev; + fs->index = index; + /* Check & Request resources */ if (macio_resource_count(mdev) < 2) { - printk(KERN_WARNING "ifd%d: no address for %s\n", - index, swim->full_name); + swim3_err("%s", "No address in device-tree\n"); return -ENXIO; } - if (macio_irq_count(mdev) < 2) { - printk(KERN_WARNING "fd%d: no intrs for device %s\n", - index, swim->full_name); + if (macio_irq_count(mdev) < 1) { + swim3_err("%s", "No interrupt in device-tree\n"); + return -ENXIO; } if (macio_request_resource(mdev, 0, "swim3 (mmio)")) { - printk(KERN_ERR "fd%d: can't request mmio resource for %s\n", - index, swim->full_name); + swim3_err("%s", "Can't request mmio resource\n"); return -EBUSY; } if (macio_request_resource(mdev, 1, "swim3 (dma)")) { - printk(KERN_ERR "fd%d: can't request dma resource for %s\n", - index, swim->full_name); + swim3_err("%s", "Can't request dma resource\n"); macio_release_resource(mdev, 0); return -EBUSY; } @@ -1063,22 +1137,18 @@ static int swim3_add_device(struct macio_dev *mdev, int index) if (mdev->media_bay == NULL) pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1); - memset(fs, 0, sizeof(*fs)); - spin_lock_init(&fs->lock); fs->state = idle; fs->swim3 = (struct swim3 __iomem *) ioremap(macio_resource_start(mdev, 0), 0x200); if (fs->swim3 == NULL) { - printk("fd%d: couldn't map registers for %s\n", - index, swim->full_name); + swim3_err("%s", "Couldn't map mmio registers\n"); rc = -ENOMEM; goto out_release; } fs->dma = (struct dbdma_regs __iomem *) ioremap(macio_resource_start(mdev, 1), 0x200); if (fs->dma == NULL) { - printk("fd%d: couldn't map DMA for %s\n", - index, swim->full_name); + swim3_err("%s", "Couldn't map dma registers\n"); iounmap(fs->swim3); rc = -ENOMEM; goto out_release; @@ -1090,31 +1160,25 @@ static int swim3_add_device(struct macio_dev *mdev, int index) fs->secpercyl = 36; fs->secpertrack = 18; fs->total_secs = 2880; - fs->mdev = mdev; init_waitqueue_head(&fs->wait); fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space); memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd)); st_le16(&fs->dma_cmd[1].command, DBDMA_STOP); + if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD) + swim3_mb_event(mdev, MB_FD); + if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) { - printk(KERN_ERR "fd%d: couldn't request irq %d for %s\n", - index, fs->swim3_intr, swim->full_name); + swim3_err("%s", "Couldn't request interrupt\n"); pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0); goto out_unmap; return -EBUSY; } -/* - if (request_irq(fs->dma_intr, fd_dma_interrupt, 0, "SWIM3-dma", fs)) { - printk(KERN_ERR "Couldn't get irq %d for SWIM3 DMA", - fs->dma_intr); - return -EBUSY; - } -*/ init_timer(&fs->timeout); - printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count, + swim3_info("SWIM3 floppy controller %s\n", mdev->media_bay ? "in media bay" : ""); return 0; @@ -1132,41 +1196,42 @@ static int swim3_add_device(struct macio_dev *mdev, int index) static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device_id *match) { - int i, rc; struct gendisk *disk; + int index, rc; + + index = floppy_count++; + if (index >= MAX_FLOPPIES) + return -ENXIO; /* Add the drive */ - rc = swim3_add_device(mdev, floppy_count); + rc = swim3_add_device(mdev, index); if (rc) return rc; + /* Now register that disk. Same comment about failure handling */ + disk = disks[index] = alloc_disk(1); + if (disk == NULL) + return -ENOMEM; + disk->queue = blk_init_queue(do_fd_request, &swim3_lock); + if (disk->queue == NULL) { + put_disk(disk); + return -ENOMEM; + } + disk->queue->queuedata = &floppy_states[index]; - /* Now create the queue if not there yet */ - if (swim3_queue == NULL) { + if (index == 0) { /* If we failed, there isn't much we can do as the driver is still * too dumb to remove the device, just bail out */ if (register_blkdev(FLOPPY_MAJOR, "fd")) return 0; - swim3_queue = blk_init_queue(do_fd_request, &swim3_lock); - if (swim3_queue == NULL) { - unregister_blkdev(FLOPPY_MAJOR, "fd"); - return 0; - } } - /* Now register that disk. Same comment about failure handling */ - i = floppy_count++; - disk = disks[i] = alloc_disk(1); - if (disk == NULL) - return 0; - disk->major = FLOPPY_MAJOR; - disk->first_minor = i; + disk->first_minor = index; disk->fops = &floppy_fops; - disk->private_data = &floppy_states[i]; - disk->queue = swim3_queue; + disk->private_data = &floppy_states[index]; disk->flags |= GENHD_FL_REMOVABLE; - sprintf(disk->disk_name, "fd%d", i); + sprintf(disk->disk_name, "fd%d", index); set_capacity(disk, 2880); add_disk(disk); @@ -1194,6 +1259,9 @@ static struct macio_driver swim3_driver = .of_match_table = swim3_match, }, .probe = swim3_attach, +#ifdef CONFIG_PMAC_MEDIABAY + .mediabay_event = swim3_mb_event, +#endif #if 0 .suspend = swim3_suspend, .resume = swim3_resume, diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index b70f0fca9a42..3fb6ab4c8b4e 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) host->state == HST_DEV_SCAN); spin_unlock_irq(&host->lock); - DPRINTK("blk_insert_request, tag == %u\n", idx); - blk_insert_request(host->oob_q, crq->rq, 1, crq); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); + crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->special = crq; + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); return 0; @@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) BUG_ON(rc < 0); crq->msg_bucket = (u32) rc; - DPRINTK("blk_insert_request, tag == %u\n", idx); - blk_insert_request(host->oob_q, crq->rq, 1, crq); + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); + crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->special = crq; + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); return 0; } @@ -1116,7 +1120,7 @@ static inline void carm_handle_resp(struct carm_host *host, break; case MISC_GET_FW_VER: { struct carm_fw_ver *ver = (struct carm_fw_ver *) - mem + sizeof(struct carm_msg_get_fw_ver); + (mem + sizeof(struct carm_msg_get_fw_ver)); if (!error) { host->fw_ver = le32_to_cpu(ver->version); host->flags |= (ver->features & FL_FW_VER_MASK); diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 0e376d46bdd1..fcec0225ac76 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -119,43 +119,6 @@ /* */ - -/* command block wrapper */ -struct bulk_cb_wrap { - __le32 Signature; /* contains 'USBC' */ - u32 Tag; /* unique per command id */ - __le32 DataTransferLength; /* size of data */ - u8 Flags; /* direction in bit 0 */ - u8 Lun; /* LUN */ - u8 Length; /* of of the CDB */ - u8 CDB[UB_MAX_CDB_SIZE]; /* max command */ -}; - -#define US_BULK_CB_WRAP_LEN 31 -#define US_BULK_CB_SIGN 0x43425355 /*spells out USBC */ -#define US_BULK_FLAG_IN 1 -#define US_BULK_FLAG_OUT 0 - -/* command status wrapper */ -struct bulk_cs_wrap { - __le32 Signature; /* should = 'USBS' */ - u32 Tag; /* same as original command */ - __le32 Residue; /* amount not transferred */ - u8 Status; /* see below */ -}; - -#define US_BULK_CS_WRAP_LEN 13 -#define US_BULK_CS_SIGN 0x53425355 /* spells out 'USBS' */ -#define US_BULK_STAT_OK 0 -#define US_BULK_STAT_FAIL 1 -#define US_BULK_STAT_PHASE 2 - -/* bulk-only class specific requests */ -#define US_BULK_RESET_REQUEST 0xff -#define US_BULK_GET_MAX_LUN 0xfe - -/* - */ struct ub_dev; #define UB_MAX_REQ_SG 9 /* cdrecord requires 32KB and maybe a header */ @@ -1744,12 +1707,11 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode) static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct gendisk *disk = bdev->bd_disk; void __user *usermem = (void __user *) arg; int ret; mutex_lock(&ub_mutex); - ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem); + ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem); mutex_unlock(&ub_mutex); return ret; @@ -2478,6 +2440,8 @@ static int __init ub_init(void) int rc; int i; + pr_info("'Low Performance USB Block' driver is deprecated. " + "Please switch to usb-storage\n"); for (i = 0; i < UB_QLOCK_NUM; i++) spin_lock_init(&ub_qlockv[i]); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 031ca720d926..eb0d8216f557 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -513,7 +513,22 @@ static void process_page(unsigned long data) } } -static int mm_make_request(struct request_queue *q, struct bio *bio) +static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct cardinfo *card = cb->data; + + spin_lock_irq(&card->lock); + activate(card); + spin_unlock_irq(&card->lock); + kfree(cb); +} + +static int mm_check_plugged(struct cardinfo *card) +{ + return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); +} + +static void mm_make_request(struct request_queue *q, struct bio *bio) { struct cardinfo *card = q->queuedata; pr_debug("mm_make_request %llu %u\n", @@ -523,9 +538,11 @@ static int mm_make_request(struct request_queue *q, struct bio *bio) *card->biotail = bio; bio->bi_next = NULL; card->biotail = &bio->bi_next; + if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card)) + activate(card); spin_unlock_irq(&card->lock); - return 0; + return; } static irqreturn_t mm_interrupt(int irq, void *__card) diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c deleted file mode 100644 index 9a5b2a2d616d..000000000000 --- a/drivers/block/viodasd.c +++ /dev/null @@ -1,809 +0,0 @@ -/* -*- linux-c -*- - * viodasd.c - * Authors: Dave Boutcher <boutcher@us.ibm.com> - * Ryan Arnold <ryanarn@us.ibm.com> - * Colin Devilbiss <devilbis@us.ibm.com> - * Stephen Rothwell - * - * (C) Copyright 2000-2004 IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * This routine provides access to disk space (termed "DASD" in historical - * IBM terms) owned and managed by an OS/400 partition running on the - * same box as this Linux partition. - * - * All disk operations are performed by sending messages back and forth to - * the OS/400 partition. - */ - -#define pr_fmt(fmt) "viod: " fmt - -#include <linux/major.h> -#include <linux/fs.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/blkdev.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/mutex.h> -#include <linux/dma-mapping.h> -#include <linux/completion.h> -#include <linux/device.h> -#include <linux/scatterlist.h> - -#include <asm/uaccess.h> -#include <asm/vio.h> -#include <asm/iseries/hv_types.h> -#include <asm/iseries/hv_lp_event.h> -#include <asm/iseries/hv_lp_config.h> -#include <asm/iseries/vio.h> -#include <asm/firmware.h> - -MODULE_DESCRIPTION("iSeries Virtual DASD"); -MODULE_AUTHOR("Dave Boutcher"); -MODULE_LICENSE("GPL"); - -/* - * We only support 7 partitions per physical disk....so with minor - * numbers 0-255 we get a maximum of 32 disks. - */ -#define VIOD_GENHD_NAME "iseries/vd" - -#define VIOD_VERS "1.64" - -enum { - PARTITION_SHIFT = 3, - MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS, - MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name) -}; - -static DEFINE_MUTEX(viodasd_mutex); -static DEFINE_SPINLOCK(viodasd_spinlock); - -#define VIOMAXREQ 16 - -#define DEVICE_NO(cell) ((struct viodasd_device *)(cell) - &viodasd_devices[0]) - -struct viodasd_waitevent { - struct completion com; - int rc; - u16 sub_result; - int max_disk; /* open */ -}; - -static const struct vio_error_entry viodasd_err_table[] = { - { 0x0201, EINVAL, "Invalid Range" }, - { 0x0202, EINVAL, "Invalid Token" }, - { 0x0203, EIO, "DMA Error" }, - { 0x0204, EIO, "Use Error" }, - { 0x0205, EIO, "Release Error" }, - { 0x0206, EINVAL, "Invalid Disk" }, - { 0x0207, EBUSY, "Can't Lock" }, - { 0x0208, EIO, "Already Locked" }, - { 0x0209, EIO, "Already Unlocked" }, - { 0x020A, EIO, "Invalid Arg" }, - { 0x020B, EIO, "Bad IFS File" }, - { 0x020C, EROFS, "Read Only Device" }, - { 0x02FF, EIO, "Internal Error" }, - { 0x0000, 0, NULL }, -}; - -/* - * Figure out the biggest I/O request (in sectors) we can accept - */ -#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA) - -/* - * Number of disk I/O requests we've sent to OS/400 - */ -static int num_req_outstanding; - -/* - * This is our internal structure for keeping track of disk devices - */ -struct viodasd_device { - u16 cylinders; - u16 tracks; - u16 sectors; - u16 bytes_per_sector; - u64 size; - int read_only; - spinlock_t q_lock; - struct gendisk *disk; - struct device *dev; -} viodasd_devices[MAX_DISKNO]; - -/* - * External open entry point. - */ -static int viodasd_open(struct block_device *bdev, fmode_t mode) -{ - struct viodasd_device *d = bdev->bd_disk->private_data; - HvLpEvent_Rc hvrc; - struct viodasd_waitevent we; - u16 flags = 0; - - if (d->read_only) { - if (mode & FMODE_WRITE) - return -EROFS; - flags = vioblockflags_ro; - } - - init_completion(&we.com); - - /* Send the open event to OS/400 */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockopen, - HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - (u64)(unsigned long)&we, VIOVERSION << 16, - ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32), - 0, 0, 0); - if (hvrc != 0) { - pr_warning("HV open failed %d\n", (int)hvrc); - return -EIO; - } - - wait_for_completion(&we.com); - - /* Check the return code */ - if (we.rc != 0) { - const struct vio_error_entry *err = - vio_lookup_rc(viodasd_err_table, we.sub_result); - - pr_warning("bad rc opening disk: %d:0x%04x (%s)\n", - (int)we.rc, we.sub_result, err->msg); - return -EIO; - } - - return 0; -} - -static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode) -{ - int ret; - - mutex_lock(&viodasd_mutex); - ret = viodasd_open(bdev, mode); - mutex_unlock(&viodasd_mutex); - - return ret; -} - - -/* - * External release entry point. - */ -static int viodasd_release(struct gendisk *disk, fmode_t mode) -{ - struct viodasd_device *d = disk->private_data; - HvLpEvent_Rc hvrc; - - mutex_lock(&viodasd_mutex); - /* Send the event to OS/400. We DON'T expect a response */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockclose, - HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - 0, VIOVERSION << 16, - ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */, - 0, 0, 0); - if (hvrc != 0) - pr_warning("HV close call failed %d\n", (int)hvrc); - - mutex_unlock(&viodasd_mutex); - - return 0; -} - - -/* External ioctl entry point. - */ -static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct gendisk *disk = bdev->bd_disk; - struct viodasd_device *d = disk->private_data; - - geo->sectors = d->sectors ? d->sectors : 32; - geo->heads = d->tracks ? d->tracks : 64; - geo->cylinders = d->cylinders ? d->cylinders : - get_capacity(disk) / (geo->sectors * geo->heads); - - return 0; -} - -/* - * Our file operations table - */ -static const struct block_device_operations viodasd_fops = { - .owner = THIS_MODULE, - .open = viodasd_unlocked_open, - .release = viodasd_release, - .getgeo = viodasd_getgeo, -}; - -/* - * End a request - */ -static void viodasd_end_request(struct request *req, int error, - int num_sectors) -{ - __blk_end_request(req, error, num_sectors << 9); -} - -/* - * Send an actual I/O request to OS/400 - */ -static int send_request(struct request *req) -{ - u64 start; - int direction; - int nsg; - u16 viocmd; - HvLpEvent_Rc hvrc; - struct vioblocklpevent *bevent; - struct HvLpEvent *hev; - struct scatterlist sg[VIOMAXBLOCKDMA]; - int sgindex; - struct viodasd_device *d; - unsigned long flags; - - start = (u64)blk_rq_pos(req) << 9; - - if (rq_data_dir(req) == READ) { - direction = DMA_FROM_DEVICE; - viocmd = viomajorsubtype_blockio | vioblockread; - } else { - direction = DMA_TO_DEVICE; - viocmd = viomajorsubtype_blockio | vioblockwrite; - } - - d = req->rq_disk->private_data; - - /* Now build the scatter-gather list */ - sg_init_table(sg, VIOMAXBLOCKDMA); - nsg = blk_rq_map_sg(req->q, req, sg); - nsg = dma_map_sg(d->dev, sg, nsg, direction); - - spin_lock_irqsave(&viodasd_spinlock, flags); - num_req_outstanding++; - - /* This optimization handles a single DMA block */ - if (nsg == 1) - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, viocmd, - HvLpEvent_AckInd_DoAck, - HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - (u64)(unsigned long)req, VIOVERSION << 16, - ((u64)DEVICE_NO(d) << 48), start, - ((u64)sg_dma_address(&sg[0])) << 32, - sg_dma_len(&sg[0])); - else { - bevent = (struct vioblocklpevent *) - vio_get_event_buffer(viomajorsubtype_blockio); - if (bevent == NULL) { - pr_warning("error allocating disk event buffer\n"); - goto error_ret; - } - - /* - * Now build up the actual request. Note that we store - * the pointer to the request in the correlation - * token so we can match the response up later - */ - memset(bevent, 0, sizeof(struct vioblocklpevent)); - hev = &bevent->event; - hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK | - HV_LP_EVENT_INT; - hev->xType = HvLpEvent_Type_VirtualIo; - hev->xSubtype = viocmd; - hev->xSourceLp = HvLpConfig_getLpIndex(); - hev->xTargetLp = viopath_hostLp; - hev->xSizeMinus1 = - offsetof(struct vioblocklpevent, u.rw_data.dma_info) + - (sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1; - hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp); - hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp); - hev->xCorrelationToken = (u64)req; - bevent->version = VIOVERSION; - bevent->disk = DEVICE_NO(d); - bevent->u.rw_data.offset = start; - - /* - * Copy just the dma information from the sg list - * into the request - */ - for (sgindex = 0; sgindex < nsg; sgindex++) { - bevent->u.rw_data.dma_info[sgindex].token = - sg_dma_address(&sg[sgindex]); - bevent->u.rw_data.dma_info[sgindex].len = - sg_dma_len(&sg[sgindex]); - } - - /* Send the request */ - hvrc = HvCallEvent_signalLpEvent(&bevent->event); - vio_free_event_buffer(viomajorsubtype_blockio, bevent); - } - - if (hvrc != HvLpEvent_Rc_Good) { - pr_warning("error sending disk event to OS/400 (rc %d)\n", - (int)hvrc); - goto error_ret; - } - spin_unlock_irqrestore(&viodasd_spinlock, flags); - return 0; - -error_ret: - num_req_outstanding--; - spin_unlock_irqrestore(&viodasd_spinlock, flags); - dma_unmap_sg(d->dev, sg, nsg, direction); - return -1; -} - -/* - * This is the external request processing routine - */ -static void do_viodasd_request(struct request_queue *q) -{ - struct request *req; - - /* - * If we already have the maximum number of requests - * outstanding to OS/400 just bail out. We'll come - * back later. - */ - while (num_req_outstanding < VIOMAXREQ) { - req = blk_fetch_request(q); - if (req == NULL) - return; - /* check that request contains a valid command */ - if (req->cmd_type != REQ_TYPE_FS) { - viodasd_end_request(req, -EIO, blk_rq_sectors(req)); - continue; - } - /* Try sending the request */ - if (send_request(req) != 0) - viodasd_end_request(req, -EIO, blk_rq_sectors(req)); - } -} - -/* - * Probe a single disk and fill in the viodasd_device structure - * for it. - */ -static int probe_disk(struct viodasd_device *d) -{ - HvLpEvent_Rc hvrc; - struct viodasd_waitevent we; - int dev_no = DEVICE_NO(d); - struct gendisk *g; - struct request_queue *q; - u16 flags = 0; - -retry: - init_completion(&we.com); - - /* Send the open event to OS/400 */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockopen, - HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - (u64)(unsigned long)&we, VIOVERSION << 16, - ((u64)dev_no << 48) | ((u64)flags<< 32), - 0, 0, 0); - if (hvrc != 0) { - pr_warning("bad rc on HV open %d\n", (int)hvrc); - return 0; - } - - wait_for_completion(&we.com); - - if (we.rc != 0) { - if (flags != 0) - return 0; - /* try again with read only flag set */ - flags = vioblockflags_ro; - goto retry; - } - if (we.max_disk > (MAX_DISKNO - 1)) { - printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"), - MAX_DISKNO, we.max_disk + 1); - } - - /* Send the close event to OS/400. We DON'T expect a response */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockclose, - HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - 0, VIOVERSION << 16, - ((u64)dev_no << 48) | ((u64)flags << 32), - 0, 0, 0); - if (hvrc != 0) { - pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc); - return 0; - } - - if (d->dev == NULL) { - /* this is when we reprobe for new disks */ - if (vio_create_viodasd(dev_no) == NULL) { - pr_warning("cannot allocate virtual device for disk %d\n", - dev_no); - return 0; - } - /* - * The vio_create_viodasd will have recursed into this - * routine with d->dev set to the new vio device and - * will finish the setup of the disk below. - */ - return 1; - } - - /* create the request queue for the disk */ - spin_lock_init(&d->q_lock); - q = blk_init_queue(do_viodasd_request, &d->q_lock); - if (q == NULL) { - pr_warning("cannot allocate queue for disk %d\n", dev_no); - return 0; - } - g = alloc_disk(1 << PARTITION_SHIFT); - if (g == NULL) { - pr_warning("cannot allocate disk structure for disk %d\n", - dev_no); - blk_cleanup_queue(q); - return 0; - } - - d->disk = g; - blk_queue_max_segments(q, VIOMAXBLOCKDMA); - blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS); - g->major = VIODASD_MAJOR; - g->first_minor = dev_no << PARTITION_SHIFT; - if (dev_no >= 26) - snprintf(g->disk_name, sizeof(g->disk_name), - VIOD_GENHD_NAME "%c%c", - 'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26)); - else - snprintf(g->disk_name, sizeof(g->disk_name), - VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26)); - g->fops = &viodasd_fops; - g->queue = q; - g->private_data = d; - g->driverfs_dev = d->dev; - set_capacity(g, d->size >> 9); - - pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n", - dev_no, (unsigned long)(d->size >> 9), - (unsigned long)(d->size >> 20), - (int)d->cylinders, (int)d->tracks, - (int)d->sectors, (int)d->bytes_per_sector, - d->read_only ? " (RO)" : ""); - - /* register us in the global list */ - add_disk(g); - return 1; -} - -/* returns the total number of scatterlist elements converted */ -static int block_event_to_scatterlist(const struct vioblocklpevent *bevent, - struct scatterlist *sg, int *total_len) -{ - int i, numsg; - const struct rw_data *rw_data = &bevent->u.rw_data; - static const int offset = - offsetof(struct vioblocklpevent, u.rw_data.dma_info); - static const int element_size = sizeof(rw_data->dma_info[0]); - - numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size; - if (numsg > VIOMAXBLOCKDMA) - numsg = VIOMAXBLOCKDMA; - - *total_len = 0; - sg_init_table(sg, VIOMAXBLOCKDMA); - for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) { - sg_dma_address(&sg[i]) = rw_data->dma_info[i].token; - sg_dma_len(&sg[i]) = rw_data->dma_info[i].len; - *total_len += rw_data->dma_info[i].len; - } - return i; -} - -/* - * Restart all queues, starting with the one _after_ the disk given, - * thus reducing the chance of starvation of higher numbered disks. - */ -static void viodasd_restart_all_queues_starting_from(int first_index) -{ - int i; - - for (i = first_index + 1; i < MAX_DISKNO; ++i) - if (viodasd_devices[i].disk) - blk_run_queue(viodasd_devices[i].disk->queue); - for (i = 0; i <= first_index; ++i) - if (viodasd_devices[i].disk) - blk_run_queue(viodasd_devices[i].disk->queue); -} - -/* - * For read and write requests, decrement the number of outstanding requests, - * Free the DMA buffers we allocated. - */ -static int viodasd_handle_read_write(struct vioblocklpevent *bevent) -{ - int num_sg, num_sect, pci_direction, total_len; - struct request *req; - struct scatterlist sg[VIOMAXBLOCKDMA]; - struct HvLpEvent *event = &bevent->event; - unsigned long irq_flags; - struct viodasd_device *d; - int error; - spinlock_t *qlock; - - num_sg = block_event_to_scatterlist(bevent, sg, &total_len); - num_sect = total_len >> 9; - if (event->xSubtype == (viomajorsubtype_blockio | vioblockread)) - pci_direction = DMA_FROM_DEVICE; - else - pci_direction = DMA_TO_DEVICE; - req = (struct request *)bevent->event.xCorrelationToken; - d = req->rq_disk->private_data; - - dma_unmap_sg(d->dev, sg, num_sg, pci_direction); - - /* - * Since this is running in interrupt mode, we need to make sure - * we're not stepping on any global I/O operations - */ - spin_lock_irqsave(&viodasd_spinlock, irq_flags); - num_req_outstanding--; - spin_unlock_irqrestore(&viodasd_spinlock, irq_flags); - - error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO; - if (error) { - const struct vio_error_entry *err; - err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); - pr_warning("read/write error %d:0x%04x (%s)\n", - event->xRc, bevent->sub_result, err->msg); - num_sect = blk_rq_sectors(req); - } - qlock = req->q->queue_lock; - spin_lock_irqsave(qlock, irq_flags); - viodasd_end_request(req, error, num_sect); - spin_unlock_irqrestore(qlock, irq_flags); - - /* Finally, try to get more requests off of this device's queue */ - viodasd_restart_all_queues_starting_from(DEVICE_NO(d)); - - return 0; -} - -/* This routine handles incoming block LP events */ -static void handle_block_event(struct HvLpEvent *event) -{ - struct vioblocklpevent *bevent = (struct vioblocklpevent *)event; - struct viodasd_waitevent *pwe; - - if (event == NULL) - /* Notification that a partition went away! */ - return; - /* First, we should NEVER get an int here...only acks */ - if (hvlpevent_is_int(event)) { - pr_warning("Yikes! got an int in viodasd event handler!\n"); - if (hvlpevent_need_ack(event)) { - event->xRc = HvLpEvent_Rc_InvalidSubtype; - HvCallEvent_ackLpEvent(event); - } - } - - switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) { - case vioblockopen: - /* - * Handle a response to an open request. We get all the - * disk information in the response, so update it. The - * correlation token contains a pointer to a waitevent - * structure that has a completion in it. update the - * return code in the waitevent structure and post the - * completion to wake up the guy who sent the request - */ - pwe = (struct viodasd_waitevent *)event->xCorrelationToken; - pwe->rc = event->xRc; - pwe->sub_result = bevent->sub_result; - if (event->xRc == HvLpEvent_Rc_Good) { - const struct open_data *data = &bevent->u.open_data; - struct viodasd_device *device = - &viodasd_devices[bevent->disk]; - device->read_only = - bevent->flags & vioblockflags_ro; - device->size = data->disk_size; - device->cylinders = data->cylinders; - device->tracks = data->tracks; - device->sectors = data->sectors; - device->bytes_per_sector = data->bytes_per_sector; - pwe->max_disk = data->max_disk; - } - complete(&pwe->com); - break; - case vioblockclose: - break; - case vioblockread: - case vioblockwrite: - viodasd_handle_read_write(bevent); - break; - - default: - pr_warning("invalid subtype!"); - if (hvlpevent_need_ack(event)) { - event->xRc = HvLpEvent_Rc_InvalidSubtype; - HvCallEvent_ackLpEvent(event); - } - } -} - -/* - * Get the driver to reprobe for more disks. - */ -static ssize_t probe_disks(struct device_driver *drv, const char *buf, - size_t count) -{ - struct viodasd_device *d; - - for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) { - if (d->disk == NULL) - probe_disk(d); - } - return count; -} -static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks); - -static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id) -{ - struct viodasd_device *d = &viodasd_devices[vdev->unit_address]; - - d->dev = &vdev->dev; - if (!probe_disk(d)) - return -ENODEV; - return 0; -} - -static int viodasd_remove(struct vio_dev *vdev) -{ - struct viodasd_device *d; - - d = &viodasd_devices[vdev->unit_address]; - if (d->disk) { - del_gendisk(d->disk); - blk_cleanup_queue(d->disk->queue); - put_disk(d->disk); - d->disk = NULL; - } - d->dev = NULL; - return 0; -} - -/** - * viodasd_device_table: Used by vio.c to match devices that we - * support. - */ -static struct vio_device_id viodasd_device_table[] __devinitdata = { - { "block", "IBM,iSeries-viodasd" }, - { "", "" } -}; -MODULE_DEVICE_TABLE(vio, viodasd_device_table); - -static struct vio_driver viodasd_driver = { - .id_table = viodasd_device_table, - .probe = viodasd_probe, - .remove = viodasd_remove, - .driver = { - .name = "viodasd", - .owner = THIS_MODULE, - } -}; - -static int need_delete_probe; - -/* - * Initialize the whole device driver. Handle module and non-module - * versions - */ -static int __init viodasd_init(void) -{ - int rc; - - if (!firmware_has_feature(FW_FEATURE_ISERIES)) { - rc = -ENODEV; - goto early_fail; - } - - /* Try to open to our host lp */ - if (viopath_hostLp == HvLpIndexInvalid) - vio_set_hostlp(); - - if (viopath_hostLp == HvLpIndexInvalid) { - pr_warning("invalid hosting partition\n"); - rc = -EIO; - goto early_fail; - } - - pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp); - - /* register the block device */ - rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); - if (rc) { - pr_warning("Unable to get major number %d for %s\n", - VIODASD_MAJOR, VIOD_GENHD_NAME); - goto early_fail; - } - /* Actually open the path to the hosting partition */ - rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio, - VIOMAXREQ + 2); - if (rc) { - pr_warning("error opening path to host partition %d\n", - viopath_hostLp); - goto unregister_blk; - } - - /* Initialize our request handler */ - vio_setHandler(viomajorsubtype_blockio, handle_block_event); - - rc = vio_register_driver(&viodasd_driver); - if (rc) { - pr_warning("vio_register_driver failed\n"); - goto unset_handler; - } - - /* - * If this call fails, it just means that we cannot dynamically - * add virtual disks, but the driver will still work fine for - * all existing disk, so ignore the failure. - */ - if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe)) - need_delete_probe = 1; - - return 0; - -unset_handler: - vio_clearHandler(viomajorsubtype_blockio); - viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2); -unregister_blk: - unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); -early_fail: - return rc; -} -module_init(viodasd_init); - -void __exit viodasd_exit(void) -{ - if (need_delete_probe) - driver_remove_file(&viodasd_driver.driver, &driver_attr_probe); - vio_unregister_driver(&viodasd_driver); - vio_clearHandler(viomajorsubtype_blockio); - viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2); - unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); -} -module_exit(viodasd_exit); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 079c08808d8a..c0bbeb470754 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -3,45 +3,53 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/hdreg.h> +#include <linux/module.h> +#include <linux/mutex.h> #include <linux/virtio.h> #include <linux/virtio_blk.h> #include <linux/scatterlist.h> #include <linux/string_helpers.h> #include <scsi/scsi_cmnd.h> +#include <linux/idr.h> #define PART_BITS 4 -static int major, index; +static int major; +static DEFINE_IDA(vd_index_ida); + struct workqueue_struct *virtblk_wq; struct virtio_blk { - spinlock_t lock; - struct virtio_device *vdev; struct virtqueue *vq; /* The disk structure for the kernel. */ struct gendisk *disk; - /* Request tracking. */ - struct list_head reqs; - mempool_t *pool; /* Process context for config space updates */ struct work_struct config_work; + /* Lock for config space updates */ + struct mutex config_lock; + + /* enable config space updates */ + bool config_enable; + /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; + /* Ida index - used to track minor number allocations. */ + int index; + /* Scatterlist: can be too big for stack. */ struct scatterlist sg[/*sg_elems*/]; }; struct virtblk_req { - struct list_head list; struct request *req; struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; @@ -55,7 +63,7 @@ static void blk_done(struct virtqueue *vq) unsigned int len; unsigned long flags; - spin_lock_irqsave(&vblk->lock, flags); + spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { int error; @@ -85,12 +93,11 @@ static void blk_done(struct virtqueue *vq) } __blk_end_request_all(vbr->req, error); - list_del(&vbr->list); mempool_free(vbr, vblk->pool); } /* In case queue is stopped waiting for more buffers. */ blk_start_queue(vblk->disk->queue); - spin_unlock_irqrestore(&vblk->lock, flags); + spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); } static bool do_req(struct request_queue *q, struct virtio_blk *vblk, @@ -165,12 +172,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) { + if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) { mempool_free(vbr, vblk->pool); return false; } - list_add_tail(&vbr->list, &vblk->reqs); return true; } @@ -236,8 +242,8 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) return -ENOTTY; - return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, - (void __user *)data); + return scsi_cmd_blk_ioctl(bdev, mode, cmd, + (void __user *)data); } /* We provide getgeo only to please some old bootloader/partitioning tools */ @@ -276,6 +282,11 @@ static int index_to_minor(int index) return index << PART_BITS; } +static int minor_to_index(int minor) +{ + return minor >> PART_BITS; +} + static ssize_t virtblk_serial_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -306,6 +317,10 @@ static void virtblk_config_changed_work(struct work_struct *work) char cap_str_2[10], cap_str_10[10]; u64 capacity, size; + mutex_lock(&vblk->config_lock); + if (!vblk->config_enable) + goto done; + /* Host must always specify the capacity. */ vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), &capacity, sizeof(capacity)); @@ -328,6 +343,9 @@ static void virtblk_config_changed_work(struct work_struct *work) cap_str_10, cap_str_2); set_capacity(vblk->disk, capacity); + revalidate_disk(vblk->disk); +done: + mutex_unlock(&vblk->config_lock); } static void virtblk_config_changed(struct virtio_device *vdev) @@ -337,18 +355,138 @@ static void virtblk_config_changed(struct virtio_device *vdev) queue_work(virtblk_wq, &vblk->config_work); } +static int init_vq(struct virtio_blk *vblk) +{ + int err = 0; + + /* We expect one virtqueue, for output. */ + vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests"); + if (IS_ERR(vblk->vq)) + err = PTR_ERR(vblk->vq); + + return err; +} + +/* + * Legacy naming scheme used for virtio devices. We are stuck with it for + * virtio blk but don't ever use it for any new driver. + */ +static int virtblk_name_format(char *prefix, int index, char *buf, int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + +static int virtblk_get_cache_mode(struct virtio_device *vdev) +{ + u8 writeback; + int err; + + err = virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE, + offsetof(struct virtio_blk_config, wce), + &writeback); + if (err) + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); + + return writeback; +} + +static void virtblk_update_cache_mode(struct virtio_device *vdev) +{ + u8 writeback = virtblk_get_cache_mode(vdev); + struct virtio_blk *vblk = vdev->priv; + + if (writeback) + blk_queue_flush(vblk->disk->queue, REQ_FLUSH); + else + blk_queue_flush(vblk->disk->queue, 0); + + revalidate_disk(vblk->disk); +} + +static const char *const virtblk_cache_types[] = { + "write through", "write back" +}; + +static ssize_t +virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = disk->private_data; + struct virtio_device *vdev = vblk->vdev; + int i; + u8 writeback; + + BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); + for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; ) + if (sysfs_streq(buf, virtblk_cache_types[i])) + break; + + if (i < 0) + return -EINVAL; + + writeback = i; + vdev->config->set(vdev, + offsetof(struct virtio_blk_config, wce), + &writeback, sizeof(writeback)); + + virtblk_update_cache_mode(vdev); + return count; +} + +static ssize_t +virtblk_cache_type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = disk->private_data; + u8 writeback = virtblk_get_cache_mode(vblk->vdev); + + BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types)); + return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]); +} + +static const struct device_attribute dev_attr_cache_type_ro = + __ATTR(cache_type, S_IRUGO, + virtblk_cache_type_show, NULL); +static const struct device_attribute dev_attr_cache_type_rw = + __ATTR(cache_type, S_IRUGO|S_IWUSR, + virtblk_cache_type_show, virtblk_cache_type_store); + static int __devinit virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; struct request_queue *q; - int err; + int err, index; u64 cap; u32 v, blk_size, sg_elems, opt_io_size; u16 min_io_size; u8 physical_block_exp, alignment_offset; - if (index_to_minor(index) >= 1 << MINORBITS) - return -ENOSPC; + err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), + GFP_KERNEL); + if (err < 0) + goto out; + index = err; /* We need to know how many segments before we allocate. */ err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX, @@ -365,22 +503,19 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL); if (!vblk) { err = -ENOMEM; - goto out; + goto out_free_index; } - INIT_LIST_HEAD(&vblk->reqs); - spin_lock_init(&vblk->lock); vblk->vdev = vdev; vblk->sg_elems = sg_elems; sg_init_table(vblk->sg, vblk->sg_elems); + mutex_init(&vblk->config_lock); INIT_WORK(&vblk->config_work, virtblk_config_changed_work); + vblk->config_enable = true; - /* We expect one virtqueue, for output. */ - vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); - if (IS_ERR(vblk->vq)) { - err = PTR_ERR(vblk->vq); + err = init_vq(vblk); + if (err) goto out_free_vblk; - } vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); if (!vblk->pool) { @@ -395,7 +530,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_mempool; } - q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); + q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL); if (!q) { err = -ENOMEM; goto out_put_disk; @@ -403,29 +538,17 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) q->queuedata = vblk; - if (index < 26) { - sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); - } else if (index < (26 + 1) * 26) { - sprintf(vblk->disk->disk_name, "vd%c%c", - 'a' + index / 26 - 1, 'a' + index % 26); - } else { - const unsigned int m1 = (index / 26 - 1) / 26 - 1; - const unsigned int m2 = (index / 26 - 1) % 26; - const unsigned int m3 = index % 26; - sprintf(vblk->disk->disk_name, "vd%c%c%c", - 'a' + m1, 'a' + m2, 'a' + m3); - } + virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); vblk->disk->major = major; vblk->disk->first_minor = index_to_minor(index); vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; vblk->disk->driverfs_dev = &vdev->dev; - index++; + vblk->index = index; /* configure queue flush support */ - if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) - blk_queue_flush(q, REQ_FLUSH); + virtblk_update_cache_mode(vdev); /* If disk is read-only in the host, the guest should obey */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) @@ -503,6 +626,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) if (err) goto out_del_disk; + if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) + err = device_create_file(disk_to_dev(vblk->disk), + &dev_attr_cache_type_rw); + else + err = device_create_file(disk_to_dev(vblk->disk), + &dev_attr_cache_type_ro); + if (err) + goto out_del_disk; return 0; out_del_disk: @@ -516,6 +647,8 @@ out_free_vq: vdev->config->del_vqs(vdev); out_free_vblk: kfree(vblk); +out_free_index: + ida_simple_remove(&vd_index_ida, index); out: return err; } @@ -523,22 +656,67 @@ out: static void __devexit virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; + int index = vblk->index; - flush_work(&vblk->config_work); + /* Prevent config work handler from accessing the device. */ + mutex_lock(&vblk->config_lock); + vblk->config_enable = false; + mutex_unlock(&vblk->config_lock); - /* Nothing should be pending. */ - BUG_ON(!list_empty(&vblk->reqs)); + del_gendisk(vblk->disk); + blk_cleanup_queue(vblk->disk->queue); /* Stop all the virtqueues. */ vdev->config->reset(vdev); - del_gendisk(vblk->disk); - blk_cleanup_queue(vblk->disk->queue); + flush_work(&vblk->config_work); + put_disk(vblk->disk); mempool_destroy(vblk->pool); vdev->config->del_vqs(vdev); kfree(vblk); + ida_simple_remove(&vd_index_ida, index); +} + +#ifdef CONFIG_PM +static int virtblk_freeze(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + + /* Ensure we don't receive any more interrupts */ + vdev->config->reset(vdev); + + /* Prevent config work handler from accessing the device. */ + mutex_lock(&vblk->config_lock); + vblk->config_enable = false; + mutex_unlock(&vblk->config_lock); + + flush_work(&vblk->config_work); + + spin_lock_irq(vblk->disk->queue->queue_lock); + blk_stop_queue(vblk->disk->queue); + spin_unlock_irq(vblk->disk->queue->queue_lock); + blk_sync_queue(vblk->disk->queue); + + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtblk_restore(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + int ret; + + vblk->config_enable = true; + ret = init_vq(vdev->priv); + if (!ret) { + spin_lock_irq(vblk->disk->queue->queue_lock); + blk_start_queue(vblk->disk->queue); + spin_unlock_irq(vblk->disk->queue->queue_lock); + } + return ret; } +#endif static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, @@ -548,7 +726,7 @@ static const struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY + VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE }; /* @@ -565,6 +743,10 @@ static struct virtio_driver __refdata virtio_blk = { .probe = virtblk_probe, .remove = __devexit_p(virtblk_remove), .config_changed = virtblk_config_changed, +#ifdef CONFIG_PM + .freeze = virtblk_freeze, + .restore = virtblk_restore, +#endif }; static int __init init(void) diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 4abd2bcd20fb..ff540520bada 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -52,7 +52,6 @@ #include <linux/io.h> #include <linux/gfp.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/dma.h> @@ -148,7 +147,7 @@ static volatile int xdc_busy; static struct timer_list xd_watchdog_int; static volatile u_char xd_error; -static int nodma = XD_DONT_USE_DMA; +static bool nodma = XD_DONT_USE_DMA; static struct request_queue *xd_queue; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 2330a9ad5e95..c6decb901e5e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -258,13 +258,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif *blkif) { - pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", + pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d" + " | ds %4d\n", current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); + blkif->st_rd_req, blkif->st_wr_req, + blkif->st_f_req, blkif->st_ds_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; blkif->st_oo_req = 0; + blkif->st_ds_req = 0; } int xen_blkif_schedule(void *arg) @@ -318,6 +321,7 @@ struct seg_buf { static void xen_blkbk_unmap(struct pending_req *req) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int i, invcount = 0; grant_handle_t handle; int ret; @@ -329,25 +333,12 @@ static void xen_blkbk_unmap(struct pending_req *req) gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), GNTMAP_host_map, handle); pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + pages[invcount] = virt_to_page(vaddr(req, i)); invcount++; } - ret = HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount); + ret = gnttab_unmap_refs(unmap, NULL, pages, invcount); BUG_ON(ret); - /* - * Note, we use invcount, so nr->pages, so we can't index - * using vaddr(req, i). - */ - for (i = 0; i < invcount; i++) { - ret = m2p_remove_override( - virt_to_page(unmap[i].host_addr), false); - if (ret) { - pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n", - (unsigned long)unmap[i].host_addr); - continue; - } - } } static int xen_blkbk_map(struct blkif_request *req, @@ -356,7 +347,7 @@ static int xen_blkbk_map(struct blkif_request *req, { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int i; - int nseg = req->nr_segments; + int nseg = req->u.rw.nr_segments; int ret = 0; /* @@ -375,7 +366,7 @@ static int xen_blkbk_map(struct blkif_request *req, pending_req->blkif->domid); } - ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg); BUG_ON(ret); /* @@ -395,21 +386,59 @@ static int xen_blkbk_map(struct blkif_request *req, if (ret) continue; - ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), - blkbk->pending_page(pending_req, i), false); - if (ret) { - pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n", - (unsigned long)map[i].dev_bus_addr, ret); - /* We could switch over to GNTTABOP_copy */ - continue; - } - seg[i].buf = map[i].dev_bus_addr | (req->u.rw.seg[i].first_sect << 9); } return ret; } +static int dispatch_discard_io(struct xen_blkif *blkif, + struct blkif_request *req) +{ + int err = 0; + int status = BLKIF_RSP_OKAY; + struct block_device *bdev = blkif->vbd.bdev; + unsigned long secure; + + blkif->st_ds_req++; + + xen_blkif_get(blkif); + secure = (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? + BLKDEV_DISCARD_SECURE : 0; + + err = blkdev_issue_discard(bdev, req->u.discard.sector_number, + req->u.discard.nr_sectors, + GFP_KERNEL, secure); + + if (err == -EOPNOTSUPP) { + pr_debug(DRV_PFX "discard op failed, not supported\n"); + status = BLKIF_RSP_EOPNOTSUPP; + } else if (err) + status = BLKIF_RSP_ERROR; + + make_response(blkif, req->u.discard.id, req->operation, status); + xen_blkif_put(blkif); + return err; +} + +static void xen_blk_drain_io(struct xen_blkif *blkif) +{ + atomic_set(&blkif->drain, 1); + do { + /* The initial value is one, and one refcnt taken at the + * start of the xen_blkif_schedule thread. */ + if (atomic_read(&blkif->refcnt) <= 2) + break; + wait_for_completion_interruptible_timeout( + &blkif->drain_complete, HZ); + + if (!atomic_read(&blkif->drain)) + break; + } while (!kthread_should_stop()); + atomic_set(&blkif->drain, 0); +} + /* * Completion callback on the bio's. Called as bh->b_end_io() */ @@ -422,6 +451,11 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) pr_debug(DRV_PFX "flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + pr_debug(DRV_PFX "write barrier op failed, not supported\n"); + xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug(DRV_PFX "Buffer not up-to-date at end of operation," " error=%d\n", error); @@ -438,6 +472,10 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); xen_blkif_put(pending_req->blkif); + if (atomic_read(&pending_req->blkif->refcnt) <= 2) { + if (atomic_read(&pending_req->blkif->drain)) + complete(&pending_req->blkif->drain_complete); + } free_req(pending_req); } } @@ -505,8 +543,11 @@ __do_block_io_op(struct xen_blkif *blkif) /* Apply all sanity checks to /private copy/ of request. */ barrier(); - - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (unlikely(req.operation == BLKIF_OP_DISCARD)) { + free_req(pending_req); + if (dispatch_discard_io(blkif, &req)) + break; + } else if (dispatch_rw_block_io(blkif, &req, pending_req)) break; /* Yield point for this unbounded loop. */ @@ -532,7 +573,6 @@ do_block_io_op(struct xen_blkif *blkif) return more_to_do; } - /* * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. @@ -549,6 +589,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, int i, nbio = 0; int operation; struct blk_plug plug; + bool drain = false; switch (req->operation) { case BLKIF_OP_READ: @@ -559,11 +600,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blkif->st_wr_req++; operation = WRITE_ODIRECT; break; + case BLKIF_OP_WRITE_BARRIER: + drain = true; case BLKIF_OP_FLUSH_DISKCACHE: blkif->st_f_req++; operation = WRITE_FLUSH; break; - case BLKIF_OP_WRITE_BARRIER: default: operation = 0; /* make gcc happy */ goto fail_response; @@ -571,7 +613,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } /* Check that the number of segments is sane. */ - nseg = req->nr_segments; + nseg = req->u.rw.nr_segments; + if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", @@ -580,12 +623,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - preq.dev = req->handle; + preq.dev = req->u.rw.handle; preq.sector_number = req->u.rw.sector_number; preq.nr_sects = 0; pending_req->blkif = blkif; - pending_req->id = req->id; + pending_req->id = req->u.rw.id; pending_req->operation = req->operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; @@ -621,6 +664,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } } + /* Wait on all outstanding I/O's and once that has been completed + * issue the WRITE_FLUSH. + */ + if (drain) + xen_blk_drain_io(pending_req->blkif); + /* * If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform @@ -630,7 +679,10 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (xen_blkbk_map(req, pending_req, seg)) goto fail_flush; - /* This corresponding xen_blkif_put is done in __end_block_io_op */ + /* + * This corresponding xen_blkif_put is done in __end_block_io_op, or + * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. + */ xen_blkif_get(blkif); for (i = 0; i < nseg; i++) { @@ -654,7 +706,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.sector_number += seg[i].nsec; } - /* This will be hit if the operation was a flush. */ + /* This will be hit if the operation was a flush or discard. */ if (!bio) { BUG_ON(operation != WRITE_FLUSH); @@ -685,7 +737,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (operation == READ) blkif->st_rd_sect += preq.nr_sects; - else if (operation == WRITE || operation == WRITE_FLUSH) + else if (operation & WRITE) blkif->st_wr_sect += preq.nr_sects; return 0; @@ -694,7 +746,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, xen_blkbk_unmap(pending_req); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); free_req(pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -754,7 +806,7 @@ static int __init xen_blkif_init(void) int i, mmap_pages; int rc = 0; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); @@ -765,9 +817,9 @@ static int __init xen_blkif_init(void) mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * + blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * xen_blkif_reqs, GFP_KERNEL); - blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) * + blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * mmap_pages, GFP_KERNEL); blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages, GFP_KERNEL); @@ -790,8 +842,6 @@ static int __init xen_blkif_init(void) if (rc) goto failed_init; - memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs)); - INIT_LIST_HEAD(&blkbk->pending_free); spin_lock_init(&blkbk->pending_free_lock); init_waitqueue_head(&blkbk->pending_free_wq); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 00c57c90e2d6..9ad3b5ec1dc1 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -27,7 +27,6 @@ #ifndef __XEN_BLKIF__BACKEND__COMMON_H__ #define __XEN_BLKIF__BACKEND__COMMON_H__ -#include <linux/version.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/slab.h> @@ -61,32 +60,66 @@ struct blkif_common_response { char dummy; }; -/* i386 protocol version */ -#pragma pack(push, 4) -struct blkif_x86_32_request { - uint8_t operation; /* BLKIF_OP_??? */ +struct blkif_x86_32_request_rw { uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; +} __attribute__((__packed__)); + +struct blkif_x86_32_request_discard { + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ + blkif_vdev_t _pad1; /* was "handle" for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + uint64_t nr_sectors; +} __attribute__((__packed__)); + +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + union { + struct blkif_x86_32_request_rw rw; + struct blkif_x86_32_request_discard discard; + } u; +} __attribute__((__packed__)); + +/* i386 protocol version */ +#pragma pack(push, 4) struct blkif_x86_32_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; #pragma pack(pop) - /* x86_64 protocol version */ -struct blkif_x86_64_request { - uint8_t operation; /* BLKIF_OP_??? */ + +struct blkif_x86_64_request_rw { uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ - uint64_t __attribute__((__aligned__(8))) id; + uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */ + uint64_t id; blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -}; +} __attribute__((__packed__)); + +struct blkif_x86_64_request_discard { + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ + blkif_vdev_t _pad1; /* was "handle" for read/write requests */ + uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */ + uint64_t id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + uint64_t nr_sectors; +} __attribute__((__packed__)); + +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + union { + struct blkif_x86_64_request_rw rw; + struct blkif_x86_64_request_discard discard; + } u; +} __attribute__((__packed__)); + struct blkif_x86_64_response { uint64_t __attribute__((__aligned__(8))) id; uint8_t operation; /* copied from request */ @@ -126,6 +159,7 @@ struct xen_vbd { /* Cached size parameter. */ sector_t size; bool flush_support; + bool discard_secure; }; struct backend_info; @@ -139,7 +173,7 @@ struct xen_blkif { /* Comms information. */ enum blkif_protocol blk_protocol; union blkif_back_rings blk_rings; - struct vm_struct *blk_ring_area; + void *blk_ring; /* The VBD attached to this interface. */ struct xen_vbd vbd; /* Back pointer to the backend_info. */ @@ -149,6 +183,9 @@ struct xen_blkif { atomic_t refcnt; wait_queue_head_t wq; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; /* One thread per one blkif. */ struct task_struct *xenblkd; unsigned int waiting_reqs; @@ -159,13 +196,11 @@ struct xen_blkif { int st_wr_req; int st_oo_req; int st_f_req; + int st_ds_req; int st_rd_sect; int st_wr_sect; wait_queue_head_t waiting_to_free; - - grant_handle_t shmem_handle; - grant_ref_t shmem_ref; }; @@ -182,7 +217,7 @@ struct xen_blkif { struct phys_req { unsigned short dev; - unsigned short nr_sects; + blkif_sector_t nr_sects; struct block_device *bdev; blkif_sector_t sector_number; }; @@ -196,6 +231,8 @@ int xen_blkif_schedule(void *arg); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); static inline void blkif_get_x86_32_req(struct blkif_request *dst, @@ -203,15 +240,30 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; - dst->nr_segments = src->nr_segments; - dst->handle = src->handle; - dst->id = src->id; - dst->u.rw.sector_number = src->sector_number; - barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->seg[i]; + switch (src->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = src->u.rw.nr_segments; + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; + dst->u.rw.sector_number = src->u.rw.sector_number; + barrier(); + if (n > dst->u.rw.nr_segments) + n = dst->u.rw.nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + default: + break; + } } static inline void blkif_get_x86_64_req(struct blkif_request *dst, @@ -219,15 +271,30 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; - dst->nr_segments = src->nr_segments; - dst->handle = src->handle; - dst->id = src->id; - dst->u.rw.sector_number = src->sector_number; - barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->seg[i]; + switch (src->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = src->u.rw.nr_segments; + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; + dst->u.rw.sector_number = src->u.rw.sector_number; + barrier(); + if (n > dst->u.rw.nr_segments) + n = dst->u.rw.nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + default: + break; + } } #endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 5fd2010f7d2b..4f66171c6683 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -114,44 +114,14 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); init_waitqueue_head(&blkif->wq); + init_completion(&blkif->drain_complete); + atomic_set(&blkif->drain, 0); blkif->st_print = jiffies; init_waitqueue_head(&blkif->waiting_to_free); return blkif; } -static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page) -{ - struct gnttab_map_grant_ref op; - - gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, - GNTMAP_host_map, shared_page, blkif->domid); - - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); - - if (op.status) { - DPRINTK("Grant table operation failure !\n"); - return op.status; - } - - blkif->shmem_ref = shared_page; - blkif->shmem_handle = op.handle; - - return 0; -} - -static void unmap_frontend_page(struct xen_blkif *blkif) -{ - struct gnttab_unmap_grant_ref op; - - gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, - GNTMAP_host_map, blkif->shmem_handle); - - if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) - BUG(); -} - static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, unsigned int evtchn) { @@ -161,35 +131,29 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, if (blkif->irq) return 0; - blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE); - if (!blkif->blk_ring_area) - return -ENOMEM; - - err = map_frontend_page(blkif, shared_page); - if (err) { - free_vm_area(blkif->blk_ring_area); + err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring); + if (err < 0) return err; - } switch (blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: { struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring_area->addr; + sring = (struct blkif_sring *)blkif->blk_ring; BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; + sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; + sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); break; } @@ -201,8 +165,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, xen_blkif_be_int, 0, "blkif-backend", blkif); if (err < 0) { - unmap_frontend_page(blkif); - free_vm_area(blkif->blk_ring_area); + xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); blkif->blk_rings.common.sring = NULL; return err; } @@ -228,8 +191,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) } if (blkif->blk_rings.common.sring) { - unmap_frontend_page(blkif); - free_vm_area(blkif->blk_ring_area); + xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); blkif->blk_rings.common.sring = NULL; } } @@ -272,6 +234,7 @@ VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); +VBD_SHOW(ds_req, "%d\n", be->blkif->st_ds_req); VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); @@ -280,6 +243,7 @@ static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_rd_req.attr, &dev_attr_wr_req.attr, &dev_attr_f_req.attr, + &dev_attr_ds_req.attr, &dev_attr_rd_sect.attr, &dev_attr_wr_sect.attr, NULL @@ -374,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (q && q->flush_flags) vbd->flush_support = true; + if (q && blk_queue_secdiscard(q)) + vbd->discard_secure = true; + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", handle, blkif->domid); return 0; @@ -414,7 +381,60 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache", "%d", state); if (err) - xenbus_dev_fatal(dev, err, "writing feature-flush-cache"); + dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err); + + return err; +} + +static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + struct xen_blkif *blkif = be->blkif; + int err; + int state = 0; + struct block_device *bdev = be->blkif->vbd.bdev; + struct request_queue *q = bdev_get_queue(bdev); + + if (blk_queue_discard(q)) { + err = xenbus_printf(xbt, dev->nodename, + "discard-granularity", "%u", + q->limits.discard_granularity); + if (err) { + dev_warn(&dev->dev, "writing discard-granularity (%d)", err); + return; + } + err = xenbus_printf(xbt, dev->nodename, + "discard-alignment", "%u", + q->limits.discard_alignment); + if (err) { + dev_warn(&dev->dev, "writing discard-alignment (%d)", err); + return; + } + state = 1; + /* Optional. */ + err = xenbus_printf(xbt, dev->nodename, + "discard-secure", "%d", + blkif->vbd.discard_secure); + if (err) { + dev_warn(&dev->dev, "writing discard-secure (%d)", err); + return; + } + } + err = xenbus_printf(xbt, dev->nodename, "feature-discard", + "%d", state); + if (err) + dev_warn(&dev->dev, "writing feature-discard (%d)", err); +} +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + dev_warn(&dev->dev, "writing feature-barrier (%d)", err); return err; } @@ -582,7 +602,7 @@ static void frontend_changed(struct xenbus_device *dev, case XenbusStateConnected: /* * Ensure we connect even when two watches fire in - * close successsion and we miss the intermediate value + * close succession and we miss the intermediate value * of frontend_state. */ if (dev->state == XenbusStateConnected) @@ -646,9 +666,12 @@ again: return; } - err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); - if (err) - goto abort; + /* If we can't advertise it is OK. */ + xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); + + xen_blkbk_discard(xbt, be); + + xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(&be->blkif->vbd)); @@ -751,17 +774,14 @@ static const struct xenbus_device_id xen_blkbk_ids[] = { }; -static struct xenbus_driver xen_blkbk = { - .name = "vbd", - .owner = THIS_MODULE, - .ids = xen_blkbk_ids, +static DEFINE_XENBUS_DRIVER(xen_blkbk, , .probe = xen_blkbk_probe, .remove = xen_blkbk_remove, .otherend_changed = frontend_changed -}; +); int xen_blkif_xenbus_init(void) { - return xenbus_register_backend(&xen_blkbk); + return xenbus_register_backend(&xen_blkbk_driver); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9ea8c2576c70..2c2d2e5c1597 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -43,6 +43,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/scatterlist.h> +#include <linux/bitmap.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -81,6 +82,7 @@ static const struct block_device_operations xlvbd_block_fops; */ struct blkfront_info { + spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; @@ -98,11 +100,13 @@ struct blkfront_info unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; + unsigned int feature_discard:1; + unsigned int feature_secdiscard:1; + unsigned int discard_granularity; + unsigned int discard_alignment; int is_ready; }; -static DEFINE_SPINLOCK(blkif_io_lock); - static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); @@ -132,19 +136,41 @@ static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; BUG_ON(free >= BLK_RING_SIZE); - info->shadow_free = info->shadow[free].req.id; - info->shadow[free].req.id = 0x0fffffee; /* debug */ + info->shadow_free = info->shadow[free].req.u.rw.id; + info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static void add_id_to_freelist(struct blkfront_info *info, +static int add_id_to_freelist(struct blkfront_info *info, unsigned long id) { - info->shadow[id].req.id = info->shadow_free; + if (info->shadow[id].req.u.rw.id != id) + return -EINVAL; + if (info->shadow[id].request == NULL) + return -EINVAL; + info->shadow[id].req.u.rw.id = info->shadow_free; info->shadow[id].request = NULL; info->shadow_free = id; + return 0; } +static const char *op_name(int op) +{ + static const char *const names[] = { + [BLKIF_OP_READ] = "read", + [BLKIF_OP_WRITE] = "write", + [BLKIF_OP_WRITE_BARRIER] = "barrier", + [BLKIF_OP_FLUSH_DISKCACHE] = "flush", + [BLKIF_OP_DISCARD] = "discard" }; + + if (op < 0 || op >= ARRAY_SIZE(names)) + return "unknown"; + + if (!names[op]) + return "reserved"; + + return names[op]; +} static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) { unsigned int end = minor + nr; @@ -153,7 +179,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) if (end > nr_minors) { unsigned long *bitmap, *old; - bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), + bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap), GFP_KERNEL); if (bitmap == NULL) return -ENOMEM; @@ -173,8 +199,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) spin_lock(&minor_lock); if (find_next_bit(minors, end, minor) >= end) { - for (; minor < end; ++minor) - __set_bit(minor, minors); + bitmap_set(minors, minor, nr); rc = 0; } else rc = -EBUSY; @@ -189,8 +214,7 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) BUG_ON(end > nr_minors); spin_lock(&minor_lock); - for (; minor < end; ++minor) - __clear_bit(minor, minors); + bitmap_clear(minors, minor, nr); spin_unlock(&minor_lock); } @@ -284,9 +308,9 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - ring_req->id = id; + ring_req->u.rw.id = id; ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->handle = info->handle; + ring_req->u.rw.handle = info->handle; ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; @@ -302,29 +326,41 @@ static int blkif_queue_request(struct request *req) ring_req->operation = info->flush_op; } - ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); - BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); - - for_each_sg(info->sg, sg, ring_req->nr_segments, i) { - buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); - fsect = sg->offset >> 9; - lsect = fsect + (sg->length >> 9) - 1; - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnttab_grant_foreign_access_ref( - ref, - info->xbdev->otherend_id, - buffer_mfn, - rq_data_dir(req) ); - - info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { + /* id, sector_number and handle are set above. */ + ring_req->operation = BLKIF_OP_DISCARD; + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; + } else { + ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, + info->sg); + BUG_ON(ring_req->u.rw.nr_segments > + BLKIF_MAX_SEGMENTS_PER_REQUEST); + + for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { + buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req)); + + info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); + ring_req->u.rw.seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } } info->ring.req_prod_pvt++; @@ -370,7 +406,9 @@ static void do_blkif_request(struct request_queue *rq) blk_start_request(req); - if (req->cmd_type != REQ_TYPE_FS) { + if ((req->cmd_type != REQ_TYPE_FS) || + ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && + !info->flush_op)) { __blk_end_request_all(req, -EIO); continue; } @@ -399,13 +437,23 @@ wait: static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) { struct request_queue *rq; + struct blkfront_info *info = gd->private_data; - rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + rq = blk_init_queue(do_blkif_request, &info->io_lock); if (rq == NULL) return -1; queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); + if (info->feature_discard) { + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq); + blk_queue_max_discard_sectors(rq, get_capacity(gd)); + rq->limits.discard_granularity = info->discard_granularity; + rq->limits.discard_alignment = info->discard_alignment; + if (info->feature_secdiscard) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); + } + /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); blk_queue_max_hw_sectors(rq, 512); @@ -500,6 +548,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) return 0; } +static char *encode_disk_name(char *ptr, unsigned int n) +{ + if (n >= 26) + ptr = encode_disk_name(ptr, n / 26 - 1); + *ptr = 'a' + n % 26; + return ptr + 1; +} + static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, u16 vdisk_info, u16 sector_size) @@ -510,6 +566,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, unsigned int offset; int minor; int nr_parts; + char *ptr; BUG_ON(info->gd != NULL); BUG_ON(info->rq != NULL); @@ -534,7 +591,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, "emulated IDE disks,\n\t choose an xvd device name" "from xvde on\n", info->vdevice); } - err = -ENODEV; + if (minor >> MINORBITS) { + pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n", + info->vdevice, minor); + return -ENODEV; + } if ((minor % nr_parts) == 0) nr_minors = nr_parts; @@ -548,23 +609,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (gd == NULL) goto release; - if (nr_minors > 1) { - if (offset < 26) - sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); - else - sprintf(gd->disk_name, "%s%c%c", DEV_NAME, - 'a' + ((offset / 26)-1), 'a' + (offset % 26)); - } else { - if (offset < 26) - sprintf(gd->disk_name, "%s%c%d", DEV_NAME, - 'a' + offset, - minor & (nr_parts - 1)); - else - sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME, - 'a' + ((offset / 26) - 1), - 'a' + (offset % 26), - minor & (nr_parts - 1)); - } + strcpy(gd->disk_name, DEV_NAME); + ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); + BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN); + if (nr_minors > 1) + *ptr = 0; + else + snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr, + "%d", minor & (nr_parts - 1)); gd->major = XENVBD_MAJOR; gd->first_minor = minor; @@ -608,14 +660,14 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) if (info->rq == NULL) return; - spin_lock_irqsave(&blkif_io_lock, flags); + spin_lock_irqsave(&info->io_lock, flags); /* No more blkif_request(). */ blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irqrestore(&blkif_io_lock, flags); + spin_unlock_irqrestore(&info->io_lock, flags); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work_sync(&info->work); @@ -647,16 +699,16 @@ static void blkif_restart_queue(struct work_struct *work) { struct blkfront_info *info = container_of(work, struct blkfront_info, work); - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); if (info->connected == BLKIF_STATE_CONNECTED) kick_pending_request_queues(info); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); } static void blkif_free(struct blkfront_info *info, int suspend) { /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ @@ -664,7 +716,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work_sync(&info->work); @@ -685,7 +737,9 @@ static void blkif_free(struct blkfront_info *info, int suspend) static void blkif_completion(struct blk_shadow *s) { int i; - for (i = 0; i < s->req.nr_segments; i++) + /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place + * flag. */ + for (i = 0; i < s->req.u.rw.nr_segments; i++) gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } @@ -698,10 +752,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkfront_info *info = (struct blkfront_info *)dev_id; int error; - spin_lock_irqsave(&blkif_io_lock, flags); + spin_lock_irqsave(&info->io_lock, flags); if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&blkif_io_lock, flags); + spin_unlock_irqrestore(&info->io_lock, flags); return IRQ_HANDLED; } @@ -714,29 +768,55 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; + /* + * The backend has messed up and given us an id that we would + * never have given to it (we stamp it up to BLK_RING_SIZE - + * look in get_id_from_freelist. + */ + if (id >= BLK_RING_SIZE) { + WARN(1, "%s: response to %s has incorrect id (%ld)\n", + info->gd->disk_name, op_name(bret->operation), id); + /* We can't safely get the 'struct request' as + * the id is busted. */ + continue; + } req = info->shadow[id].request; - blkif_completion(&info->shadow[id]); + if (bret->operation != BLKIF_OP_DISCARD) + blkif_completion(&info->shadow[id]); - add_id_to_freelist(info, id); + if (add_id_to_freelist(info, id)) { + WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", + info->gd->disk_name, op_name(bret->operation), id); + continue; + } error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { + case BLKIF_OP_DISCARD: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + struct request_queue *rq = info->rq; + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); + error = -EOPNOTSUPP; + info->feature_discard = 0; + info->feature_secdiscard = 0; + queue_flag_clear(QUEUE_FLAG_DISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); + } + __blk_end_request_all(req, error); + break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: write %s op failed\n", - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : "flush disk cache", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n", - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : "flush disk cache", - info->gd->disk_name); + info->shadow[id].req.u.rw.nr_segments == 0)) { + printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; } if (unlikely(error)) { @@ -772,7 +852,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) kick_pending_request_queues(info); - spin_unlock_irqrestore(&blkif_io_lock, flags); + spin_unlock_irqrestore(&info->io_lock, flags); return IRQ_HANDLED; } @@ -808,9 +888,8 @@ static int setup_blkring(struct xenbus_device *dev, if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, - blkif_interrupt, - IRQF_SAMPLE_RANDOM, "blkif", info); + err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, + "blkif", info); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); @@ -947,14 +1026,15 @@ static int blkfront_probe(struct xenbus_device *dev, } mutex_init(&info->mutex); + spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + info->shadow[i].req.u.rw.id = i+1; + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -988,9 +1068,9 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.id = i+1; + info->shadow[i].req.u.rw.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; /* Stage 3: Find pending requests and requeue them. */ for (i = 0; i < BLK_RING_SIZE; i++) { @@ -1003,17 +1083,19 @@ static int blkif_recover(struct blkfront_info *info) *req = copy[i].req; /* We get a new request id, and must reset the shadow state. */ - req->id = get_id_from_freelist(info); - memcpy(&info->shadow[req->id], ©[i], sizeof(copy[i])); + req->u.rw.id = get_id_from_freelist(info); + memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); + if (req->operation != BLKIF_OP_DISCARD) { /* Rewrite any grant references invalidated by susp/resume. */ - for (j = 0; j < req->nr_segments; j++) - gnttab_grant_foreign_access_ref( - req->u.rw.seg[j].gref, - info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->id].frame[j]), - rq_data_dir(info->shadow[req->id].request)); - info->shadow[req->id].req = *req; + for (j = 0; j < req->u.rw.nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->u.rw.seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), + rq_data_dir(info->shadow[req->u.rw.id].request)); + } + info->shadow[req->u.rw.id].req = *req; info->ring.req_prod_pvt++; } @@ -1022,7 +1104,7 @@ static int blkif_recover(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; @@ -1033,7 +1115,7 @@ static int blkif_recover(struct blkfront_info *info) /* Kick any other new requests queued since we resumed */ kick_pending_request_queues(info); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); return 0; } @@ -1098,6 +1180,41 @@ blkfront_closing(struct blkfront_info *info) bdput(bdev); } +static void blkfront_setup_discard(struct blkfront_info *info) +{ + int err; + char *type; + unsigned int discard_granularity; + unsigned int discard_alignment; + unsigned int discard_secure; + + type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); + if (IS_ERR(type)) + return; + + info->feature_secdiscard = 0; + if (strncmp(type, "phy", 3) == 0) { + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-granularity", "%u", &discard_granularity, + "discard-alignment", "%u", &discard_alignment, + NULL); + if (!err) { + info->feature_discard = 1; + info->discard_granularity = discard_granularity; + info->discard_alignment = discard_alignment; + } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%d", &discard_secure, + NULL); + if (!err) + info->feature_secdiscard = discard_secure; + + } else if (strncmp(type, "file", 4) == 0) + info->feature_discard = 1; + + kfree(type); +} + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -1108,7 +1225,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int binfo; int err; - int barrier, flush; + int barrier, flush, discard; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1178,7 +1295,14 @@ static void blkfront_connect(struct blkfront_info *info) info->feature_flush = REQ_FLUSH; info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; } - + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard, + NULL); + + if (!err && discard) + blkfront_setup_discard(info); + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", @@ -1189,10 +1313,10 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; kick_pending_request_queues(info); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); add_disk(info->gd); @@ -1322,7 +1446,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) mutex_lock(&blkfront_mutex); bdev = bdget_disk(disk, 0); - bdput(bdev); if (bdev->bd_openers) goto out; @@ -1353,6 +1476,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) } out: + bdput(bdev); mutex_unlock(&blkfront_mutex); return 0; } @@ -1372,36 +1496,46 @@ static const struct xenbus_device_id blkfront_ids[] = { { "" } }; -static struct xenbus_driver blkfront = { - .name = "vbd", - .owner = THIS_MODULE, - .ids = blkfront_ids, +static DEFINE_XENBUS_DRIVER(blkfront, , .probe = blkfront_probe, .remove = blkfront_remove, .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, -}; +); static int __init xlblk_init(void) { + int ret; + if (!xen_domain()) return -ENODEV; + if (xen_hvm_domain() && !xen_platform_pci_unplug) + return -ENODEV; + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", XENVBD_MAJOR, DEV_NAME); return -ENODEV; } - return xenbus_register_frontend(&blkfront); + ret = xenbus_register_frontend(&blkfront_driver); + if (ret) { + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + return ret; + } + + return 0; } module_init(xlblk_init); static void __exit xlblk_exit(void) { - return xenbus_unregister_driver(&blkfront); + xenbus_unregister_driver(&blkfront_driver); + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + kfree(minors); } module_exit(xlblk_exit); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index fb1975d82a73..1a17e338735e 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -456,7 +456,7 @@ static inline void ace_fsm_yieldirq(struct ace_device *ace) { dev_dbg(ace->dev, "ace_fsm_yieldirq()\n"); - if (ace->irq == NO_IRQ) + if (!ace->irq) /* No IRQ assigned, so need to poll */ tasklet_schedule(&ace->fsm_tasklet); ace->fsm_continue_flag = 0; @@ -1034,12 +1034,12 @@ static int __devinit ace_setup(struct ace_device *ace) ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ); /* Now we can hook up the irq handler */ - if (ace->irq != NO_IRQ) { + if (ace->irq) { rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace); if (rc) { /* Failure - fall back to polled mode */ dev_err(ace->dev, "request_irq failed\n"); - ace->irq = NO_IRQ; + ace->irq = 0; } } @@ -1086,7 +1086,7 @@ static void __devexit ace_teardown(struct ace_device *ace) tasklet_kill(&ace->fsm_tasklet); - if (ace->irq != NO_IRQ) + if (ace->irq) free_irq(ace->irq, ace); iounmap(ace->baseaddr); @@ -1156,7 +1156,7 @@ static int __devinit ace_probe(struct platform_device *dev) resource_size_t physaddr = 0; int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */ u32 id = dev->id; - int irq = NO_IRQ; + int irq = 0; int i; dev_dbg(&dev->dev, "ace_probe(%p)\n", dev); |