57 files changed, 8284 insertions, 2986 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index e086fbbbe853..9a13e889837e 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -1177,7 +1177,8 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
   int TimeoutCounter;
   int i;
 
-  
+  memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
+
   if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
 	return DAC960_Failure(Controller, "DMA mask out of range");
   Controller->BounceBufferLimit = DMA_BIT_MASK(32);
@@ -4627,7 +4628,8 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
   DAC960_Controller_T *Controller = Command->Controller;
   DAC960_CommandType_T CommandType = Command->CommandType;
   DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
-  DAC960_V2_IOCTL_Opcode_T CommandOpcode = CommandMailbox->Common.IOCTL_Opcode;
+  DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode;
+  DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode;
   DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus;
 
   if (CommandType == DAC960_ReadCommand ||
@@ -4699,7 +4701,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
     {
       if (Controller->ShutdownMonitoringTimer)
 	      return;
-      if (CommandOpcode == DAC960_V2_GetControllerInfo)
+      if (IOCTLOpcode == DAC960_V2_GetControllerInfo)
 	{
 	  DAC960_V2_ControllerInfo_T *NewControllerInfo =
 	    Controller->V2.NewControllerInformation;
@@ -4719,14 +4721,14 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 	  memcpy(ControllerInfo, NewControllerInfo,
 		 sizeof(DAC960_V2_ControllerInfo_T));
 	}
-      else if (CommandOpcode == DAC960_V2_GetEvent)
+      else if (IOCTLOpcode == DAC960_V2_GetEvent)
 	{
 	  if (CommandStatus == DAC960_V2_NormalCompletion) {
 	    DAC960_V2_ReportEvent(Controller, Controller->V2.Event);
 	  }
 	  Controller->V2.NextEventSequenceNumber++;
 	}
-      else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
+      else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
 	       CommandStatus == DAC960_V2_NormalCompletion)
 	{
 	  DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo =
@@ -4915,7 +4917,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 	  NewPhysicalDeviceInfo->LogicalUnit++;
 	  Controller->V2.PhysicalDeviceIndex++;
 	}
-      else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
+      else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
 	{
 	  unsigned int DeviceIndex;
 	  for (DeviceIndex = Controller->V2.PhysicalDeviceIndex;
@@ -4938,7 +4940,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 	    }
 	  Controller->V2.NeedPhysicalDeviceInformation = false;
 	}
-      else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
+      else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
 	       CommandStatus == DAC960_V2_NormalCompletion)
 	{
 	  DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo =
@@ -5065,7 +5067,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 			 [LogicalDeviceNumber] = true;
 	  NewLogicalDeviceInfo->LogicalDeviceNumber++;
 	}
-      else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
+      else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
 	{
 	  int LogicalDriveNumber;
 	  for (LogicalDriveNumber = 0;
@@ -6578,24 +6580,21 @@ static const struct file_operations dac960_user_command_proc_fops = {
 
 static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
 {
-	struct proc_dir_entry *StatusProcEntry;
 	struct proc_dir_entry *ControllerProcEntry;
-	struct proc_dir_entry *UserCommandProcEntry;
 
 	if (DAC960_ProcDirectoryEntry == NULL) {
-  		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
-  		StatusProcEntry = proc_create("status", 0,
-					   DAC960_ProcDirectoryEntry,
-					   &dac960_proc_fops);
+		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
+		proc_create("status", 0, DAC960_ProcDirectoryEntry,
+			    &dac960_proc_fops);
 	}
 
-      sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
-      ControllerProcEntry = proc_mkdir(Controller->ControllerName,
-				       DAC960_ProcDirectoryEntry);
-      proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
-      proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
-      UserCommandProcEntry = proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
-      Controller->ControllerProcEntry = ControllerProcEntry;
+	sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
+	ControllerProcEntry = proc_mkdir(Controller->ControllerName,
+					 DAC960_ProcDirectoryEntry);
+	proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
+	proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
+	proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+	Controller->ControllerProcEntry = ControllerProcEntry;
 }
 
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 35e56e1c948f..a796407123c7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -116,6 +116,8 @@ config PARIDE
 
 source "drivers/block/paride/Kconfig"
 
+source "drivers/block/mtip32xx/Kconfig"
+
 config BLK_CPQ_DA
 	tristate "Compaq SMART2 support"
 	depends on PCI && VIRT_TO_BUS
@@ -352,7 +354,7 @@ config BLK_DEV_SX8
 	  Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
 
 config BLK_DEV_UB
-	tristate "Low Performance USB Block driver"
+	tristate "Low Performance USB Block driver (deprecated)"
 	depends on USB
 	help
 	  This driver supports certain USB attached storage devices
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 349539ad3ad9..5b795059f8fb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -40,5 +40,6 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= xen-blkback/
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
 obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8eba86bba599..386146d792d1 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -63,7 +63,7 @@
 #include <linux/mutex.h>
 #include <linux/amifdreg.h>
 #include <linux/amifd.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/interrupt.h>
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 528f6318ded1..321de7b6c442 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -15,6 +15,7 @@
 #include <linux/genhd.h>
 #include <linux/netdevice.h>
 #include <linux/mutex.h>
+#include <linux/export.h>
 #include "aoe.h"
 
 static DEFINE_MUTEX(aoeblk_mutex);
@@ -159,7 +160,7 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int
+static void
 aoeblk_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct sk_buff_head queue;
@@ -172,25 +173,25 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 	if (bio == NULL) {
 		printk(KERN_ERR "aoe: bio is NULL\n");
 		BUG();
-		return 0;
+		return;
 	}
 	d = bio->bi_bdev->bd_disk->private_data;
 	if (d == NULL) {
 		printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
 		BUG();
 		bio_endio(bio, -ENXIO);
-		return 0;
+		return;
 	} else if (bio->bi_io_vec == NULL) {
 		printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
 		BUG();
 		bio_endio(bio, -ENXIO);
-		return 0;
+		return;
 	}
 	buf = mempool_alloc(d->bufpool, GFP_NOIO);
 	if (buf == NULL) {
 		printk(KERN_INFO "aoe: buf allocation failure\n");
 		bio_endio(bio, -ENOMEM);
-		return 0;
+		return;
 	}
 	memset(buf, 0, sizeof(*buf));
 	INIT_LIST_HEAD(&buf->bufs);
@@ -211,7 +212,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 		spin_unlock_irqrestore(&d->lock, flags);
 		mempool_free(buf, d->bufpool);
 		bio_endio(bio, -ENXIO);
-		return 0;
+		return;
 	}
 
 	list_add_tail(&buf->bufs, &d->bufq);
@@ -222,8 +223,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 
 	spin_unlock_irqrestore(&d->lock, flags);
 	aoenet_xmit(&queue);
-
-	return 0;
 }
 
 static int
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 146296ca4965..e86d2062a164 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/skbuff.h>
+#include <linux/export.h>
 #include "aoe.h"
 
 enum {
@@ -269,7 +270,7 @@ static const struct file_operations aoe_fops = {
 	.llseek = noop_llseek,
 };
 
-static char *aoe_devnode(struct device *dev, mode_t *mode)
+static char *aoe_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
 }
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index de0435e63b02..887f68f6d79a 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -35,6 +35,7 @@ new_skb(ulong len)
 		skb_reset_mac_header(skb);
 		skb_reset_network_header(skb);
 		skb->protocol = __constant_htons(ETH_P_AOE);
+		skb_checksum_none_assert(skb);
 	}
 	return skb;
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index dba1c32e1ddf..531ceb31d0ff 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -17,7 +17,7 @@
 #include <linux/highmem.h>
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
-#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+#include <linux/fs.h>
 #include <linux/slab.h>
 
 #include <asm/uaccess.h>
@@ -242,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
 	page = brd_lookup_page(brd, sector);
 	BUG_ON(!page);
 
-	dst = kmap_atomic(page, KM_USER1);
+	dst = kmap_atomic(page);
 	memcpy(dst + offset, src, copy);
-	kunmap_atomic(dst, KM_USER1);
+	kunmap_atomic(dst);
 
 	if (copy < n) {
 		src += copy;
@@ -253,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
 		page = brd_lookup_page(brd, sector);
 		BUG_ON(!page);
 
-		dst = kmap_atomic(page, KM_USER1);
+		dst = kmap_atomic(page);
 		memcpy(dst, src, copy);
-		kunmap_atomic(dst, KM_USER1);
+		kunmap_atomic(dst);
 	}
 }
 
@@ -273,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
 	copy = min_t(size_t, n, PAGE_SIZE - offset);
 	page = brd_lookup_page(brd, sector);
 	if (page) {
-		src = kmap_atomic(page, KM_USER1);
+		src = kmap_atomic(page);
 		memcpy(dst, src + offset, copy);
-		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(src);
 	} else
 		memset(dst, 0, copy);
 
@@ -285,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
 		copy = n - copy;
 		page = brd_lookup_page(brd, sector);
 		if (page) {
-			src = kmap_atomic(page, KM_USER1);
+			src = kmap_atomic(page);
 			memcpy(dst, src, copy);
-			kunmap_atomic(src, KM_USER1);
+			kunmap_atomic(src);
 		} else
 			memset(dst, 0, copy);
 	}
@@ -309,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 			goto out;
 	}
 
-	mem = kmap_atomic(page, KM_USER0);
+	mem = kmap_atomic(page);
 	if (rw == READ) {
 		copy_from_brd(mem + off, brd, sector, len);
 		flush_dcache_page(page);
@@ -317,13 +317,13 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 		flush_dcache_page(page);
 		copy_to_brd(brd, mem + off, sector, len);
 	}
-	kunmap_atomic(mem, KM_USER0);
+	kunmap_atomic(mem);
 
 out:
 	return err;
 }
 
-static int brd_make_request(struct request_queue *q, struct bio *bio)
+static void brd_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
 	struct brd_device *brd = bdev->bd_disk->private_data;
@@ -359,8 +359,6 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
 
 out:
 	bio_endio(bio, err);
-
-	return 0;
 }
 
 #ifdef CONFIG_BLK_DEV_XIP
@@ -404,14 +402,13 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 	error = -EBUSY;
 	if (bdev->bd_openers <= 1) {
 		/*
-		 * Invalidate the cache first, so it isn't written
-		 * back to the device.
+		 * Kill the cache first, so it isn't written back to the
+		 * device.
 		 *
 		 * Another thread might instantiate more buffercache here,
 		 * but there is not much we can do to close that race.
 		 */
-		invalidate_bh_lrus();
-		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+		kill_bdev(bdev);
 		brd_free_pages(brd);
 		error = 0;
 	}
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8f4ef656a1af..b0f553b26d0f 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/pci.h>
+#include <linux/pci-aspm.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
@@ -68,6 +69,10 @@ static int cciss_tape_cmds = 6;
 module_param(cciss_tape_cmds, int, 0644);
 MODULE_PARM_DESC(cciss_tape_cmds,
 	"number of commands to allocate for tape devices (default: 6)");
+static int cciss_simple_mode;
+module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_simple_mode,
+	"Use 'simple mode' rather than 'performant mode'");
 
 static DEFINE_MUTEX(cciss_mutex);
 static struct proc_dir_entry *proc_cciss;
@@ -176,6 +181,7 @@ static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol,
 			unsigned int block_size, InquiryData_struct *inq_buff,
 				   drive_info_struct *drv);
 static void __devinit cciss_interrupt_mode(ctlr_info_t *);
+static int __devinit cciss_enter_simple_mode(struct ctlr_info *h);
 static void start_io(ctlr_info_t *h);
 static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size,
 			__u8 page_code, unsigned char scsi3addr[],
@@ -388,7 +394,7 @@ static void cciss_seq_show_header(struct seq_file *seq)
 		h->product_name,
 		(unsigned long)h->board_id,
 		h->firm_ver[0], h->firm_ver[1], h->firm_ver[2],
-		h->firm_ver[3], (unsigned int)h->intr[PERF_MODE_INT],
+		h->firm_ver[3], (unsigned int)h->intr[h->intr_mode],
 		h->num_luns,
 		h->Qdepth, h->commands_outstanding,
 		h->maxQsinceinit, h->max_outstanding, h->maxSG);
@@ -636,6 +642,18 @@ static ssize_t host_store_rescan(struct device *dev,
 }
 static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan);
 
+static ssize_t host_show_transport_mode(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct ctlr_info *h = to_hba(dev);
+
+	return snprintf(buf, 20, "%s\n",
+		h->transMethod & CFGTBL_Trans_Performant ?
+			"performant" : "simple");
+}
+static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL);
+
 static ssize_t dev_show_unique_id(struct device *dev,
 				 struct device_attribute *attr,
 				 char *buf)
@@ -808,6 +826,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
 static struct attribute *cciss_host_attrs[] = {
 	&dev_attr_rescan.attr,
 	&dev_attr_resettable.attr,
+	&dev_attr_transport_mode.attr,
 	NULL
 };
 
@@ -1716,7 +1735,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_BIG_PASSTHRU:
 		return cciss_bigpassthru(h, argp);
 
-	/* scsi_cmd_ioctl handles these, below, though some are not */
+	/* scsi_cmd_blk_ioctl handles these, below, though some are not */
 	/* very meaningful for cciss.  SG_IO is the main one people want. */
 
 	case SG_GET_VERSION_NUM:
@@ -1727,9 +1746,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case SG_EMULATED_HOST:
 	case SG_IO:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
+		return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
 
-	/* scsi_cmd_ioctl would normally handle these, below, but */
+	/* scsi_cmd_blk_ioctl would normally handle these, below, but */
 	/* they aren't a good fit for cciss, as CD-ROMs are */
 	/* not supported, and we don't have any bus/target/lun */
 	/* which we present to the kernel. */
@@ -2582,6 +2601,8 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
 			c->Request.Timeout = 0;
 			c->Request.CDB[0] = BMIC_WRITE;
 			c->Request.CDB[6] = BMIC_CACHE_FLUSH;
+			c->Request.CDB[7] = (size >> 8) & 0xFF;
+			c->Request.CDB[8] = size & 0xFF;
 			break;
 		case TEST_UNIT_READY:
 			c->Request.CDBLen = 6;
@@ -3984,6 +4005,9 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h)
 {
 	__u32 trans_support;
 
+	if (cciss_simple_mode)
+		return;
+
 	dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n");
 	/* Attempt to put controller into performant mode if supported */
 	/* Does board support performant mode? */
@@ -4081,7 +4105,7 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *h)
 default_int_mode:
 #endif				/* CONFIG_PCI_MSI */
 	/* if we get here we're going to use the default interrupt mode */
-	h->intr[PERF_MODE_INT] = h->pdev->irq;
+	h->intr[h->intr_mode] = h->pdev->irq;
 	return;
 }
 
@@ -4298,6 +4322,10 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
 		dev_warn(&h->pdev->dev, "controller appears to be disabled\n");
 		return -ENODEV;
 	}
+
+	pci_disable_link_state(h->pdev, PCIE_LINK_STATE_L0S |
+				PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM);
+
 	err = pci_enable_device(h->pdev);
 	if (err) {
 		dev_warn(&h->pdev->dev, "Unable to Enable PCI device\n");
@@ -4341,6 +4369,9 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
 	}
 	cciss_enable_scsi_prefetch(h);
 	cciss_p600_dma_prefetch_quirk(h);
+	err = cciss_enter_simple_mode(h);
+	if (err)
+		goto err_out_free_res;
 	cciss_put_controller_into_performant_mode(h);
 	return 0;
 
@@ -4533,6 +4564,13 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
 		pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
 		pmcsr |= PCI_D0;
 		pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
+
+		/*
+		 * The P600 requires a small delay when changing states.
+		 * Otherwise we may think the board did not reset and we bail.
+		 * This for kdump only and is particular to the P600.
+		 */
+		msleep(500);
 	}
 	return 0;
 }
@@ -4843,20 +4881,20 @@ static int cciss_request_irq(ctlr_info_t *h,
 	irqreturn_t (*intxhandler)(int, void *))
 {
 	if (h->msix_vector || h->msi_vector) {
-		if (!request_irq(h->intr[PERF_MODE_INT], msixhandler,
-				IRQF_DISABLED, h->devname, h))
+		if (!request_irq(h->intr[h->intr_mode], msixhandler,
+				0, h->devname, h))
 			return 0;
 		dev_err(&h->pdev->dev, "Unable to get msi irq %d"
-			" for %s\n", h->intr[PERF_MODE_INT],
+			" for %s\n", h->intr[h->intr_mode],
 			h->devname);
 		return -1;
 	}
 
-	if (!request_irq(h->intr[PERF_MODE_INT], intxhandler,
-			IRQF_DISABLED, h->devname, h))
+	if (!request_irq(h->intr[h->intr_mode], intxhandler,
+			IRQF_SHARED, h->devname, h))
 		return 0;
 	dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
-		h->intr[PERF_MODE_INT], h->devname);
+		h->intr[h->intr_mode], h->devname);
 	return -1;
 }
 
@@ -4887,7 +4925,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
 {
 	int ctlr = h->ctlr;
 
-	free_irq(h->intr[PERF_MODE_INT], h);
+	free_irq(h->intr[h->intr_mode], h);
 #ifdef CONFIG_PCI_MSI
 	if (h->msix_vector)
 		pci_disable_msix(h->pdev);
@@ -4953,6 +4991,7 @@ reinit_after_soft_reset:
 	h = hba[i];
 	h->pdev = pdev;
 	h->busy_initializing = 1;
+	h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT;
 	INIT_LIST_HEAD(&h->cmpQ);
 	INIT_LIST_HEAD(&h->reqQ);
 	mutex_init(&h->busy_shutting_down);
@@ -5009,7 +5048,7 @@ reinit_after_soft_reset:
 
 	dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
 	       h->devname, pdev->device, pci_name(pdev),
-	       h->intr[PERF_MODE_INT], dac ? "" : " not");
+	       h->intr[h->intr_mode], dac ? "" : " not");
 
 	if (cciss_allocate_cmd_pool(h))
 		goto clean4;
@@ -5056,7 +5095,7 @@ reinit_after_soft_reset:
 		spin_lock_irqsave(&h->lock, flags);
 		h->access.set_intr_mask(h, CCISS_INTR_OFF);
 		spin_unlock_irqrestore(&h->lock, flags);
-		free_irq(h->intr[PERF_MODE_INT], h);
+		free_irq(h->intr[h->intr_mode], h);
 		rc = cciss_request_irq(h, cciss_msix_discard_completions,
 					cciss_intx_discard_completions);
 		if (rc) {
@@ -5126,6 +5165,7 @@ reinit_after_soft_reset:
 	h->cciss_max_sectors = 8192;
 
 	rebuild_lun_table(h, 1, 0);
+	cciss_engage_scsi(h);
 	h->busy_initializing = 0;
 	return 1;
 
@@ -5133,7 +5173,7 @@ clean4:
 	cciss_free_cmd_pool(h);
 	cciss_free_scatterlists(h);
 	cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
-	free_irq(h->intr[PERF_MODE_INT], h);
+	free_irq(h->intr[h->intr_mode], h);
 clean2:
 	unregister_blkdev(h->major, h->devname);
 clean1:
@@ -5172,9 +5212,31 @@ static void cciss_shutdown(struct pci_dev *pdev)
 	if (return_code != IO_OK)
 		dev_warn(&h->pdev->dev, "Error flushing cache\n");
 	h->access.set_intr_mask(h, CCISS_INTR_OFF);
-	free_irq(h->intr[PERF_MODE_INT], h);
+	free_irq(h->intr[h->intr_mode], h);
 }
 
+static int __devinit cciss_enter_simple_mode(struct ctlr_info *h)
+{
+	u32 trans_support;
+
+	trans_support = readl(&(h->cfgtable->TransportSupport));
+	if (!(trans_support & SIMPLE_MODE))
+		return -ENOTSUPP;
+
+	h->max_commands = readl(&(h->cfgtable->CmdsOutMax));
+	writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest));
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+	cciss_wait_for_mode_change_ack(h);
+	print_cfg_table(h);
+	if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) {
+		dev_warn(&h->pdev->dev, "unable to get board into simple mode\n");
+		return -ENODEV;
+	}
+	h->transMethod = CFGTBL_Trans_Simple;
+	return 0;
+}
+
+
 static void __devexit cciss_remove_one(struct pci_dev *pdev)
 {
 	ctlr_info_t *h;
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index c049548e68b7..7fda30e4a241 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -92,6 +92,7 @@ struct ctlr_info
 	unsigned int intr[4];
 	unsigned int msix_vector;
 	unsigned int msi_vector;
+	int	intr_mode;
 	int 	cciss_max_sectors;
 	BYTE	cciss_read;
 	BYTE	cciss_write;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 951a4e33b92b..da3311129a0c 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -763,16 +763,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 		{
 			case CMD_TARGET_STATUS:
 				/* Pass it up to the upper layers... */
-				if( ei->ScsiStatus)
-                		{
-#if 0
-                    			printk(KERN_WARNING "cciss: cmd %p "
-						"has SCSI Status = %x\n",
-						c, ei->ScsiStatus);
-#endif
-					cmd->result |= (ei->ScsiStatus << 1);
-                		}
-				else {  /* scsi status is zero??? How??? */
+				if (!ei->ScsiStatus) {
 					
 	/* Ordinarily, this case should never happen, but there is a bug
 	   in some released firmware revisions that allows it to happen
@@ -804,6 +795,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
 				}
 			break;
 			case CMD_PROTOCOL_ERR:
+				cmd->result = DID_ERROR << 16;
 				dev_warn(&h->pdev->dev,
 					"%p has protocol error\n", c);
                         break;
@@ -866,6 +858,7 @@ cciss_scsi_detect(ctlr_info_t *h)
 	sh->can_queue = cciss_tape_cmds;
 	sh->sg_tablesize = h->maxsgentries;
 	sh->max_cmd_len = MAX_COMMAND_SIZE;
+	sh->max_sectors = h->cciss_max_sectors;
 
 	((struct cciss_scsi_adapter_data_t *) 
 		h->scsi_ctlr)->scsi_host = sh;
@@ -1410,7 +1403,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
 	/* track how many SG entries we are using */
 	if (request_nsgs > h->maxSG)
 		h->maxSG = request_nsgs;
-	c->Header.SGTotal = (__u8) request_nsgs + chained;
+	c->Header.SGTotal = (u16) request_nsgs + chained;
 	if (request_nsgs > h->max_cmd_sgentries)
 		c->Header.SGList = h->max_cmd_sgentries;
 	else
@@ -1720,5 +1713,6 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 /* If no tape support, then these become defined out of existence */
 
 #define cciss_scsi_setup(cntl_num)
+#define cciss_engage_scsi(h)
 
 #endif /* CONFIG_CISS_SCSI_TAPE */
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index b2fceb53e809..9125bbeacd4d 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -620,6 +620,7 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 	}
 	vendor_id = pdev->vendor;
 	device_id = pdev->device;
+	revision  = pdev->revision;
 	irq = pdev->irq;
 
 	for(i=0; i<6; i++)
@@ -632,7 +633,6 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 	}
 
 	pci_read_config_word(pdev, PCI_COMMAND, &command);
-	pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision);
 	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size);
 	pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer);
 
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index cf0e63dd97da..3fbef018ce55 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -65,39 +65,80 @@ struct drbd_atodb_wait {
 
 int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
 
+void *drbd_md_get_buffer(struct drbd_conf *mdev)
+{
+	int r;
+
+	wait_event(mdev->misc_wait,
+		   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
+		   mdev->state.disk <= D_FAILED);
+
+	return r ? NULL : page_address(mdev->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->md_io_in_use))
+		wake_up(&mdev->misc_wait);
+}
+
+static bool md_io_allowed(struct drbd_conf *mdev)
+{
+	enum drbd_disk_state ds = mdev->state.disk;
+	return ds >= D_NEGOTIATING || ds == D_ATTACHING;
+}
+
+void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+				     unsigned int *done)
+{
+	long dt = bdev->dc.disk_timeout * HZ / 10;
+	if (dt == 0)
+		dt = MAX_SCHEDULE_TIMEOUT;
+
+	dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
+	if (dt == 0)
+		dev_err(DEV, "meta-data IO operation timed out\n");
+}
+
 static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 				 struct drbd_backing_dev *bdev,
 				 struct page *page, sector_t sector,
 				 int rw, int size)
 {
 	struct bio *bio;
-	struct drbd_md_io md_io;
 	int ok;
 
-	md_io.mdev = mdev;
-	init_completion(&md_io.event);
-	md_io.error = 0;
+	mdev->md_io.done = 0;
+	mdev->md_io.error = -ENODEV;
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
 		rw |= REQ_FUA | REQ_FLUSH;
 	rw |= REQ_SYNC;
 
-	bio = bio_alloc(GFP_NOIO, 1);
+	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
 	bio->bi_sector = sector;
 	ok = (bio_add_page(bio, page, size, 0) == size);
 	if (!ok)
 		goto out;
-	bio->bi_private = &md_io;
+	bio->bi_private = &mdev->md_io;
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+		ok = 0;
+		goto out;
+	}
+
+	bio_get(bio); /* one bio_put() is in the completion handler */
+	atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
 	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
 		submit_bio(rw, bio);
-	wait_for_completion(&md_io.event);
-	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+	wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
+	ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
 
  out:
 	bio_put(bio);
@@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 	int offset = 0;
 	struct page *iop = mdev->md_io_page;
 
-	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+	D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
 
@@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 		return 1;
 	}
 
-	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
-	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
+	if (!buffer) {
+		dev_err(DEV, "disk failed while waiting for md_io buffer\n");
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
 
 	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
 	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
@@ -365,7 +411,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 		+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
 
 	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
-		drbd_chk_io_error(mdev, 1, true);
+		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
 
 	if (++mdev->al_tr_pos >
 	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
@@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
 	mdev->al_tr_number++;
 
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
 
 	complete(&((struct update_al_work *)w)->event);
 	put_ldev(mdev);
@@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	/* lock out all other meta data io for now,
 	 * and make sure the page is mapped.
 	 */
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		return 0;
 
 	/* Find the valid transaction in the log */
 	for (i = 0; i <= mx; i++) {
@@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		if (rv == 0)
 			continue;
 		if (rv == -1) {
-			mutex_unlock(&mdev->md_io_mutex);
+			drbd_md_put_buffer(mdev);
 			return 0;
 		}
 		cnr = be32_to_cpu(buffer->tr_number);
@@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 
 	if (!found_valid) {
 		dev_warn(DEV, "No usable activity log found.\n");
-		mutex_unlock(&mdev->md_io_mutex);
+		drbd_md_put_buffer(mdev);
 		return 1;
 	}
 
@@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
 		ERR_IF(rv == 0) goto cancel;
 		if (rv == -1) {
-			mutex_unlock(&mdev->md_io_mutex);
+			drbd_md_put_buffer(mdev);
 			return 0;
 		}
 
@@ -534,7 +581,7 @@ cancel:
 		mdev->al_tr_pos = 0;
 
 	/* ok, we are done with it */
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
 
 	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
 	     transactions, active_extents);
@@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 			else
 				ext->rs_failed += count;
 			if (ext->rs_left < ext->rs_failed) {
-				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
-				    "rs_failed=%d count=%d\n",
+				dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d cstate=%s\n",
 				     (unsigned long long)sector,
 				     ext->lce.lc_number, ext->rs_left,
-				     ext->rs_failed, count);
-				dump_stack();
-
-				lc_put(mdev->resync, &ext->lce);
-				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-				return;
+				     ext->rs_failed, count,
+				     drbd_conn_str(mdev->state.conn));
+
+				/* We don't expect to be able to clear more bits
+				 * than have been set when we originally counted
+				 * the set bits to cache that value in ext->rs_left.
+				 * Whatever the reason (disconnect during resync,
+				 * delayed local completion of an application write),
+				 * try to fix it up by recounting here. */
+				ext->rs_left = drbd_bm_e_weight(mdev, enr);
 			}
 		} else {
 			/* Normally this element should be in the cache,
@@ -825,7 +876,11 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
 	unsigned int enr, count = 0;
 	struct lc_element *e;
 
-	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+	/* this should be an empty REQ_FLUSH */
+	if (size == 0)
+		return 0;
+
+	if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
 		dev_err(DEV, "sector: %llus, size: %d\n",
 			(unsigned long long)sector, size);
 		return 0;
@@ -1192,6 +1247,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
 		put_ldev(mdev);
 	}
 	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
 
 	return 0;
 }
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 7b976296b564..d84566496746 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
 static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
 	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
-	page_private(page) |= idx;
+	set_page_private(page, idx);
 }
 
 static unsigned long bm_page_to_idx(struct page *page)
@@ -289,25 +289,25 @@ static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
 	return page_nr;
 }
 
-static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 {
 	struct page *page = b->bm_pages[idx];
-	return (unsigned long *) kmap_atomic(page, km);
+	return (unsigned long *) kmap_atomic(page);
 }
 
 static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 {
-	return __bm_map_pidx(b, idx, KM_IRQ1);
+	return __bm_map_pidx(b, idx);
 }
 
-static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+static void __bm_unmap(unsigned long *p_addr)
 {
-	kunmap_atomic(p_addr, km);
+	kunmap_atomic(p_addr);
 };
 
 static void bm_unmap(unsigned long *p_addr)
 {
-	return __bm_unmap(p_addr, KM_IRQ1);
+	return __bm_unmap(p_addr);
 }
 
 /* long word offset of _bitmap_ sector */
@@ -378,15 +378,14 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 	 * thread.  As we have no disk yet, we are not in the IO path,
 	 * not even the IO path of the peer. */
 	bytes = sizeof(struct page *)*want;
-	new_pages = kmalloc(bytes, GFP_KERNEL);
+	new_pages = kzalloc(bytes, GFP_KERNEL);
 	if (!new_pages) {
-		new_pages = vmalloc(bytes);
+		new_pages = vzalloc(bytes);
 		if (!new_pages)
 			return NULL;
 		vmalloced = 1;
 	}
 
-	memset(new_pages, 0, bytes);
 	if (want >= have) {
 		for (i = 0; i < have; i++)
 			new_pages[i] = old_pages[i];
@@ -544,15 +543,15 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 
 	/* all but last page */
 	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
-		p_addr = __bm_map_pidx(b, idx, KM_USER0);
+		p_addr = __bm_map_pidx(b, idx);
 		for (i = 0; i < LWPP; i++)
 			bits += hweight_long(p_addr[i]);
-		__bm_unmap(p_addr, KM_USER0);
+		__bm_unmap(p_addr);
 		cond_resched();
 	}
 	/* last (or only) page */
 	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
-	p_addr = __bm_map_pidx(b, idx, KM_USER0);
+	p_addr = __bm_map_pidx(b, idx);
 	for (i = 0; i < last_word; i++)
 		bits += hweight_long(p_addr[i]);
 	p_addr[last_word] &= cpu_to_lel(mask);
@@ -560,7 +559,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 	/* 32bit arch, may have an unused padding long */
 	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
 		p_addr[last_word+1] = 0;
-	__bm_unmap(p_addr, KM_USER0);
+	__bm_unmap(p_addr);
 	return bits;
 }
 
@@ -887,12 +886,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
 struct bm_aio_ctx {
 	struct drbd_conf *mdev;
 	atomic_t in_flight;
-	struct completion done;
+	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
+#define BM_WRITE_ALL_PAGES	2
 	int error;
+	struct kref kref;
 };
 
+static void bm_aio_ctx_destroy(struct kref *kref)
+{
+	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+	put_ldev(ctx->mdev);
+	kfree(ctx);
+}
+
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
@@ -931,20 +940,21 @@ static void bm_async_io_complete(struct bio *bio, int error)
 
 	bm_page_unlock_io(mdev, idx);
 
-	/* FIXME give back to page pool */
 	if (ctx->flags & BM_AIO_COPY_PAGES)
-		put_page(bio->bi_io_vec[0].bv_page);
+		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
 
 	bio_put(bio);
 
-	if (atomic_dec_and_test(&ctx->in_flight))
-		complete(&ctx->done);
+	if (atomic_dec_and_test(&ctx->in_flight)) {
+		ctx->done = 1;
+		wake_up(&mdev->misc_wait);
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	}
 }
 
 static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
-	/* we are process context. we always get a bio */
-	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
 	struct drbd_conf *mdev = ctx->mdev;
 	struct drbd_bitmap *b = mdev->bitmap;
 	struct page *page;
@@ -967,21 +977,21 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 	bm_set_page_unchanged(b->bm_pages[page_nr]);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES) {
-		/* FIXME alloc_page is good enough for now, but actually needs
-		 * to use pre-allocated page pool */
 		void *src, *dest;
-		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
-		dest = kmap_atomic(page, KM_USER0);
-		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page);
+		src = kmap_atomic(b->bm_pages[page_nr]);
 		memcpy(dest, src, PAGE_SIZE);
-		kunmap_atomic(src, KM_USER1);
-		kunmap_atomic(dest, KM_USER0);
+		kunmap_atomic(src);
+		kunmap_atomic(dest);
 		bm_store_page_idx(page, page_nr);
 	} else
 		page = b->bm_pages[page_nr];
 
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
+	/* bio_add_page of a single page to an empty bio will always succeed,
+	 * according to api.  Do we want to assert that? */
 	bio_add_page(bio, page, len, 0);
 	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
@@ -1000,14 +1010,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-	struct bm_aio_ctx ctx = {
-		.mdev = mdev,
-		.in_flight = ATOMIC_INIT(1),
-		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
-		.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
-	};
+	struct bm_aio_ctx *ctx;
 	struct drbd_bitmap *b = mdev->bitmap;
 	int num_pages, i, count = 0;
 	unsigned long now;
@@ -1022,7 +1027,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
 	 * as we submit copies of pages anyways.
 	 */
-	if (!ctx.flags)
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = flags,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
+	};
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	if (!ctx->flags)
 		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
 	num_pages = b->bm_number_of_pages;
@@ -1035,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
 		if (rw & WRITE) {
-			if (bm_test_page_unchanged(b->bm_pages[i])) {
+			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
 				continue;
 			}
@@ -1047,29 +1073,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 				continue;
 			}
 		}
-		atomic_inc(&ctx.in_flight);
-		bm_page_io_async(&ctx, i, rw);
+		atomic_inc(&ctx->in_flight);
+		bm_page_io_async(ctx, i, rw);
 		++count;
 		cond_resched();
 	}
 
 	/*
-	 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
-	 * will not complete() early, and decrement / test it here.  If there
+	 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+	 * will not set ctx->done early, and decrement / test it here.  If there
 	 * are still some bios in flight, we need to wait for them here.
+	 * If all IO is done already (or nothing had been submitted), there is
+	 * no need to wait.  Still, we need to put the kref associated with the
+	 * "in_flight reached zero, all done" event.
 	 */
-	if (!atomic_dec_and_test(&ctx.in_flight))
-		wait_for_completion(&ctx.done);
+	if (!atomic_dec_and_test(&ctx->in_flight))
+		wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
+	else
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
 	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
 			rw == WRITE ? "WRITE" : "READ",
 			count, jiffies - now);
 
-	if (ctx.error) {
+	if (ctx->error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
-		drbd_chk_io_error(mdev, 1, true);
-		err = -EIO; /* ctx.error ? */
+		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+		err = -EIO; /* ctx->error ? */
 	}
 
+	if (atomic_read(&ctx->in_flight))
+		err = -EIO; /* Disk failed during IO... */
+
 	now = jiffies;
 	if (rw == WRITE) {
 		drbd_md_flush(mdev);
@@ -1083,6 +1118,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
 	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
 	return err;
 }
 
@@ -1092,7 +1128,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ, 0);
+	return bm_rw(mdev, READ, 0, 0);
 }
 
 /**
@@ -1103,7 +1139,18 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE, 0);
+	return bm_rw(mdev, WRITE, 0, 0);
+}
+
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
 }
 
 /**
@@ -1113,7 +1160,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
  */
 int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE, upper_idx);
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
 }
 
 
@@ -1131,28 +1194,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l
  */
 int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
-	struct bm_aio_ctx ctx = {
+	struct bm_aio_ctx *ctx;
+	int err;
+
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
+	}
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
 		.mdev = mdev,
 		.in_flight = ATOMIC_INIT(1),
-		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
+		.done = 0,
 		.flags = BM_AIO_COPY_PAGES,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
 	};
 
-	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
-		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
-		return 0;
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+		kfree(ctx);
+		return -ENODEV;
 	}
 
-	bm_page_io_async(&ctx, idx, WRITE_SYNC);
-	wait_for_completion(&ctx.done);
+	bm_page_io_async(ctx, idx, WRITE_SYNC);
+	wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
 
-	if (ctx.error)
-		drbd_chk_io_error(mdev, 1, true);
+	if (ctx->error)
+		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
 		/* that should force detach, so the in memory bitmap will be
 		 * gone in a moment as well. */
 
 	mdev->bm_writ_cnt++;
-	return ctx.error;
+	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
 }
 
 /* NOTE
@@ -1164,7 +1244,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc
  * this returns a bit number, NOT a sector!
  */
 static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
-	const int find_zero_bit, const enum km_type km)
+	const int find_zero_bit)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr;
@@ -1179,7 +1259,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 		while (bm_fo < b->bm_bits) {
 			/* bit offset of the first bit in the page */
 			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
-			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
 
 			if (find_zero_bit)
 				i = find_next_zero_bit_le(p_addr,
@@ -1188,7 +1268,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 				i = find_next_bit_le(p_addr,
 						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
 
-			__bm_unmap(p_addr, km);
+			__bm_unmap(p_addr);
 			if (i < PAGE_SIZE*8) {
 				bm_fo = bit_offset + i;
 				if (bm_fo >= b->bm_bits)
@@ -1216,7 +1296,7 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
 	if (BM_DONT_TEST & b->bm_flags)
 		bm_print_lock_info(mdev);
 
-	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+	i = __bm_find_next(mdev, bm_fo, find_zero_bit);
 
 	spin_unlock_irq(&b->bm_lock);
 	return i;
@@ -1240,13 +1320,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo
 unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
 {
 	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
-	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+	return __bm_find_next(mdev, bm_fo, 0);
 }
 
 unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
 {
 	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
-	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+	return __bm_find_next(mdev, bm_fo, 1);
 }
 
 /* returns number of bits actually changed.
@@ -1274,14 +1354,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
 		if (page_nr != last_page_nr) {
 			if (p_addr)
-				__bm_unmap(p_addr, KM_IRQ1);
+				__bm_unmap(p_addr);
 			if (c < 0)
 				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
 			else if (c > 0)
 				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
 			changed_total += c;
 			c = 0;
-			p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
+			p_addr = __bm_map_pidx(b, page_nr);
 			last_page_nr = page_nr;
 		}
 		if (val)
@@ -1290,7 +1370,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
 	}
 	if (p_addr)
-		__bm_unmap(p_addr, KM_IRQ1);
+		__bm_unmap(p_addr);
 	if (c < 0)
 		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
 	else if (c > 0)
@@ -1343,13 +1423,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
 {
 	int i;
 	int bits;
-	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
 	for (i = first_word; i < last_word; i++) {
 		bits = hweight_long(paddr[i]);
 		paddr[i] = ~0UL;
 		b->bm_set += BITS_PER_LONG - bits;
 	}
-	kunmap_atomic(paddr, KM_IRQ1);
+	kunmap_atomic(paddr);
 }
 
 /* Same thing as drbd_bm_set_bits,
@@ -1408,10 +1488,17 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
 		first_word = 0;
 		spin_lock_irq(&b->bm_lock);
 	}
-
 	/* last page (respectively only page, for first page == last page) */
 	last_word = MLPP(el >> LN2_BPL);
-	bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+	/* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+	 * ==> e = 32767, el = 32768, last_page = 2,
+	 * and now last_word = 0.
+	 * We do not want to touch last_page in this case,
+	 * as we did not allocate it, it is not present in bitmap->bm_pages.
+	 */
+	if (last_word)
+		bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
 
 	/* possibly trailing bits.
 	 * example: (e & 63) == 63, el will be e+1.
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index ef2ceed3be4b..b953cc7c9c00 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -28,7 +28,6 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
-#include <linux/version.h>
 #include <linux/list.h>
 #include <linux/sched.h>
 #include <linux/bitops.h>
@@ -60,8 +59,8 @@
 
 /* module parameter, defined in drbd_main.c */
 extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
+extern bool disable_sendpage;
+extern bool allow_oos;
 extern unsigned int cn_idx;
 
 #ifdef CONFIG_DRBD_FAULT_INJECTION
@@ -713,7 +712,6 @@ struct drbd_request {
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
 	unsigned long rq_state; /* see comments above _req_mod() */
-	int seq_num;
 	unsigned long start_time;
 };
 
@@ -815,7 +813,6 @@ enum {
 	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
 	SEND_PING,		/* whether asender should send a ping asap */
 
-	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
 	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
 	MD_DIRTY,		/* current uuids and flags not yet on disk */
 	DISCARD_CONCURRENT,	/* Set on one node, cleared on the peer! */
@@ -826,7 +823,6 @@ enum {
 	CRASHED_PRIMARY,	/* This node was a crashed primary.
 				 * Gets cleared when the state.conn
 				 * goes into C_CONNECTED state. */
-	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
 	CONSIDER_RESYNC,
 
 	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
@@ -836,6 +832,7 @@ enum {
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
 	GO_DISKLESS,		/* Disk is being detached, on io-error or admin request. */
 	WAS_IO_ERROR,		/* Local disk failed returned IO error */
+	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
 	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
 	NET_CONGESTED,		/* The data socket is congested */
 
@@ -852,6 +849,14 @@ enum {
 	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
 	AL_SUSPENDED,		/* Activity logging is currently suspended. */
 	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
+	STATE_SENT,		/* Do not change state/UUIDs while this is set */
+
+	CALLBACK_PENDING,	/* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+				 * pending, from drbd worker context.
+				 * If set, bdi_write_congested() returns true,
+				 * so shrink_page_list() would not recurse into,
+				 * and potentially deadlock on, this drbd worker.
+				 */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
@@ -863,31 +868,30 @@ enum bm_flag {
 	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
 
 	/* currently locked for bulk operation */
-	BM_LOCKED_MASK = 0x7,
+	BM_LOCKED_MASK = 0xf,
 
 	/* in detail, that is: */
 	BM_DONT_CLEAR = 0x1,
 	BM_DONT_SET   = 0x2,
 	BM_DONT_TEST  = 0x4,
 
+	/* so we can mark it locked for bulk operation,
+	 * and still allow all non-bulk operations */
+	BM_IS_LOCKED  = 0x8,
+
 	/* (test bit, count bit) allowed (common case) */
-	BM_LOCKED_TEST_ALLOWED = 0x3,
+	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
 
 	/* testing bits, as well as setting new bits allowed, but clearing bits
 	 * would be unexpected.  Used during bitmap receive.  Setting new bits
 	 * requires sending of "out-of-sync" information, though. */
-	BM_LOCKED_SET_ALLOWED = 0x1,
+	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
 
-	/* clear is not expected while bitmap is locked for bulk operation */
+	/* for drbd_bm_write_copy_pages, everything is allowed,
+	 * only concurrent bulk operations are locked out. */
+	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
 };
 
-
-/* TODO sort members for performance
- * MAYBE group them further */
-
-/* THINK maybe we actually want to use the default "event/%s" worker threads
- * or similar in linux 2.6, which uses per cpu data and threads.
- */
 struct drbd_work_queue {
 	struct list_head q;
 	struct semaphore s; /* producers up it, worker down()s it */
@@ -928,7 +932,7 @@ struct drbd_md {
 #define NL_INT64(pn,pr,member) __u64 member;
 #define NL_BIT(pn,pr,member)   unsigned member:1;
 #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
-#include "linux/drbd_nl.h"
+#include <linux/drbd_nl.h>
 
 struct drbd_backing_dev {
 	struct block_device *backing_bdev;
@@ -939,8 +943,7 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
-	struct drbd_conf *mdev;
-	struct completion event;
+	unsigned int done;
 	int error;
 };
 
@@ -1023,6 +1026,7 @@ struct drbd_conf {
 	struct drbd_tl_epoch *newest_tle;
 	struct drbd_tl_epoch *oldest_tle;
 	struct list_head out_of_sequence_requests;
+	struct list_head barrier_acked_requests;
 	struct hlist_head *tl_hash;
 	unsigned int tl_hash_s;
 
@@ -1057,6 +1061,8 @@ struct drbd_conf {
 	struct crypto_hash *csums_tfm;
 	struct crypto_hash *verify_tfm;
 
+	unsigned long last_reattach_jif;
+	unsigned long last_reconnect_jif;
 	struct drbd_thread receiver;
 	struct drbd_thread worker;
 	struct drbd_thread asender;
@@ -1095,7 +1101,8 @@ struct drbd_conf {
 	wait_queue_head_t ee_wait;
 	struct page *md_io_page;	/* one page buffer for md_io */
 	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
-	struct mutex md_io_mutex;	/* protects the md_io_buffer */
+	struct drbd_md_io md_io;
+	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
 	spinlock_t al_lock;
 	wait_queue_head_t al_wait;
 	struct lru_cache *act_log;	/* activity log */
@@ -1129,8 +1136,8 @@ struct drbd_conf {
 	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
 	int rs_planed;    /* resync sectors already planned */
 	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
-	int peer_max_bio_size;
-	int local_max_bio_size;
+	unsigned int peer_max_bio_size;
+	unsigned int local_max_bio_size;
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1229,8 +1236,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
 extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
 extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
-extern int _drbd_send_state(struct drbd_conf *mdev);
-extern int drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_conf *mdev);
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 			enum drbd_packets cmd, struct p_header80 *h,
 			size_t size, unsigned msg_flags);
@@ -1434,9 +1441,9 @@ struct bm_extent {
  * hash table. */
 #define HT_SHIFT 8
 #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
-#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12)       /* Works always = 4k */
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12)       /* Works always = 4k */
 
-#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
 
 /* Number of elements in the app_reads_hash */
 #define APP_R_HSIZE 15
@@ -1462,6 +1469,8 @@ extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
 extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
@@ -1494,11 +1503,38 @@ extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 extern mempool_t *drbd_request_mempool;
 extern mempool_t *drbd_ee_mempool;
 
-extern struct page *drbd_pp_pool; /* drbd's page pool */
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
 extern spinlock_t   drbd_pp_lock;
 extern int	    drbd_pp_vacant;
 extern wait_queue_head_t drbd_pp_wait;
 
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES	128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
 extern rwlock_t global_state_lock;
 
 extern struct drbd_conf *drbd_new_device(unsigned int minor);
@@ -1507,7 +1543,7 @@ extern void drbd_free_mdev(struct drbd_conf *mdev);
 extern int proc_details;
 
 /* drbd_req */
-extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
 extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
 extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
@@ -1537,8 +1573,12 @@ extern void resume_next_sg(struct drbd_conf *mdev);
 extern void suspend_other_sg(struct drbd_conf *mdev);
 extern int drbd_resync_finished(struct drbd_conf *mdev);
 /* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
+extern void drbd_md_put_buffer(struct drbd_conf *mdev);
 extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
-		struct drbd_backing_dev *bdev, sector_t sector, int rw);
+				struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+					    unsigned int *done);
 extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
 extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
 
@@ -1755,19 +1795,6 @@ static inline struct page *page_chain_next(struct page *page)
 #define page_chain_for_each_safe(page, n) \
 	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
 
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		if (page_count(bvec->bv_page) > 1)
-			return 1;
-	}
-
-	return 0;
-}
-
 static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
 {
 	struct page *page = e->pages;
@@ -1778,7 +1805,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
 	return 0;
 }
 
-
 static inline void drbd_state_lock(struct drbd_conf *mdev)
 {
 	wait_event(mdev->misc_wait,
@@ -1821,12 +1847,20 @@ static inline int drbd_request_state(struct drbd_conf *mdev,
 	return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
 }
 
+enum drbd_force_detach_flags {
+	DRBD_IO_ERROR,
+	DRBD_META_IO_ERROR,
+	DRBD_FORCE_DETACH,
+};
+
 #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
-static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
+		enum drbd_force_detach_flags forcedetach,
+		const char *where)
 {
 	switch (mdev->ldev->dc.on_io_error) {
 	case EP_PASS_ON:
-		if (!forcedetach) {
+		if (forcedetach == DRBD_IO_ERROR) {
 			if (__ratelimit(&drbd_ratelimit_state))
 				dev_err(DEV, "Local IO failed in %s.\n", where);
 			if (mdev->state.disk > D_INCONSISTENT)
@@ -1837,6 +1871,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
 	case EP_DETACH:
 	case EP_CALL_HELPER:
 		set_bit(WAS_IO_ERROR, &mdev->flags);
+		if (forcedetach == DRBD_FORCE_DETACH)
+			set_bit(FORCE_DETACH, &mdev->flags);
 		if (mdev->state.disk > D_FAILED) {
 			_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
 			dev_err(DEV,
@@ -1856,7 +1892,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
  */
 #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
 static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
-	int error, int forcedetach, const char *where)
+	int error, enum drbd_force_detach_flags forcedetach, const char *where)
 {
 	if (error) {
 		unsigned long flags;
@@ -2231,7 +2267,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
 		 * Note: currently we don't support such large bitmaps on 32bit
 		 * arch anyways, but no harm done to be prepared for it here.
 		 */
-		unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
+		unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
 		unsigned long left = *bits_left >> shift;
 		unsigned long total = 1UL + (mdev->rs_total >> shift);
 		unsigned long tmp = 1000UL - left * 1000UL/total;
@@ -2307,12 +2343,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
 	case D_OUTDATED:
 	case D_CONSISTENT:
 	case D_UP_TO_DATE:
+	case D_FAILED:
 		/* disk state is stable as well. */
 		break;
 
 	/* no new io accepted during tansitional states */
 	case D_ATTACHING:
-	case D_FAILED:
 	case D_NEGOTIATING:
 	case D_UNKNOWN:
 	case D_MASK:
@@ -2386,15 +2422,17 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
 	int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
 
 	D_ASSERT(ap_bio >= 0);
+
+	if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+	}
+
 	/* this currently does wake_up for every dec_ap_bio!
 	 * maybe rather introduce some type of hysteresis?
 	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
 	if (ap_bio < mxb)
 		wake_up(&mdev->misc_wait);
-	if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
-		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
-			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
-	}
 }
 
 static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0358e55356c8..f93a0320e952 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -117,8 +118,8 @@ module_param(fault_devs, int, 0644);
 
 /* module parameter, defined */
 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
-int disable_sendpage;
-int allow_oos;
+bool disable_sendpage;
+bool allow_oos;
 unsigned int cn_idx = CN_IDX_DRBD;
 int proc_details;       /* Detail level in proc drbd*/
 
@@ -139,6 +140,8 @@ struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 mempool_t *drbd_request_mempool;
 mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
 
 /* I do not use a standard mempool, because:
    1) I want to hand out the pre-allocated objects first.
@@ -159,7 +162,24 @@ static const struct block_device_operations drbd_ops = {
 	.release = drbd_release,
 };
 
-#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+static void bio_destructor_drbd(struct bio *bio)
+{
+	bio_free(bio, drbd_md_io_bio_set);
+}
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	if (!drbd_md_io_bio_set)
+		return bio_alloc(gfp_mask, 1);
+
+	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+	if (!bio)
+		return NULL;
+	bio->bi_destructor = bio_destructor_drbd;
+	return bio;
+}
 
 #ifdef __CHECKER__
 /* When checking with sparse, and this is an inline function, sparse will
@@ -208,6 +228,7 @@ static int tl_init(struct drbd_conf *mdev)
 	mdev->oldest_tle = b;
 	mdev->newest_tle = b;
 	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+	INIT_LIST_HEAD(&mdev->barrier_acked_requests);
 
 	mdev->tl_hash = NULL;
 	mdev->tl_hash_s = 0;
@@ -246,9 +267,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
 	new->n_writes = 0;
 
 	newest_before = mdev->newest_tle;
-	/* never send a barrier number == 0, because that is special-cased
-	 * when using TCQ for our write ordering code */
-	new->br_number = (newest_before->br_number+1) ?: 1;
+	new->br_number = newest_before->br_number+1;
 	if (mdev->newest_tle != new) {
 		mdev->newest_tle->next = new;
 		mdev->newest_tle = new;
@@ -311,7 +330,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 	   These have been list_move'd to the out_of_sequence_requests list in
 	   _req_mod(, barrier_acked) above.
 	   */
-	list_del_init(&b->requests);
+	list_splice_init(&b->requests, &mdev->barrier_acked_requests);
 
 	nob = b->next;
 	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
@@ -411,6 +430,14 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 		b = tmp;
 		list_splice(&carry_reads, &b->requests);
 	}
+
+	/* Actions operating on the disk state, also want to work on
+	   requests that got barrier acked. */
+
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(req, what);
+	}
 }
 
 
@@ -424,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
  */
 void tl_clear(struct drbd_conf *mdev)
 {
+	spin_lock_irq(&mdev->req_lock);
+	_tl_clear(mdev);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
+{
 	struct list_head *le, *tle;
 	struct drbd_request *r;
 
-	spin_lock_irq(&mdev->req_lock);
-
 	_tl_restart(mdev, connection_lost_while_pending);
 
 	/* we expect this list to be empty. */
@@ -447,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
 
 	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 
-	spin_unlock_irq(&mdev->req_lock);
 }
 
 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -458,6 +489,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 }
 
 /**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev:	DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+	struct list_head *le, *tle;
+	struct drbd_request *req;
+
+	spin_lock_irq(&mdev->req_lock);
+	b = mdev->oldest_tle;
+	while (b) {
+		list_for_each_safe(le, tle, &b->requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			if (!(req->rq_state & RQ_LOCAL_PENDING))
+				continue;
+			_req_mod(req, abort_disk_io);
+		}
+		b = b->next;
+	}
+
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		if (!(req->rq_state & RQ_LOCAL_PENDING))
+			continue;
+		_req_mod(req, abort_disk_io);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
  * cl_wide_st_chg() - true if the state change is a cluster wide one
  * @mdev:	DRBD device.
  * @os:		old (current) state.
@@ -470,7 +533,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
 		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
 		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
 		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
-		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
 		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
 		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
 }
@@ -509,8 +572,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
 						    union drbd_state,
 						    union drbd_state);
+enum sanitize_state_warnings {
+	NO_WARNING,
+	ABORTED_ONLINE_VERIFY,
+	ABORTED_RESYNC,
+	CONNECTION_LOST_NEGOTIATING,
+	IMPLICITLY_UPGRADED_DISK,
+	IMPLICITLY_UPGRADED_PDSK,
+};
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, const char **warn_sync_abort);
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
 int drbd_send_state_req(struct drbd_conf *,
 			union drbd_state, union drbd_state);
 
@@ -785,6 +856,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
 		rv = SS_IN_TRANSIENT_STATE;
 
+	/* While establishing a connection only allow cstate to change.
+	   Delay/refuse role changes, detach attach etc... */
+	if (test_bit(STATE_SENT, &mdev->flags) &&
+	    !(os.conn == C_WF_REPORT_PARAMS ||
+	      (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+		rv = SS_IN_TRANSIENT_STATE;
+
 	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
 		rv = SS_NEED_CONNECTION;
 
@@ -803,6 +881,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 	return rv;
 }
 
+static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
+{
+	static const char *msg_table[] = {
+		[NO_WARNING] = "",
+		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+		[ABORTED_RESYNC] = "Resync aborted.",
+		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+	};
+
+	if (warn != NO_WARNING)
+		dev_warn(DEV, "%s\n", msg_table[warn]);
+}
+
 /**
  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
  * @mdev:	DRBD device.
@@ -814,11 +907,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
  * to D_UNKNOWN. This rule and many more along those lines are in this function.
  */
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, const char **warn_sync_abort)
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
 {
 	enum drbd_fencing_p fp;
 	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
 
+	if (warn)
+		*warn = NO_WARNING;
+
 	fp = FP_DONT_CARE;
 	if (get_ldev(mdev)) {
 		fp = mdev->ldev->dc.fencing;
@@ -833,18 +929,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
 	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
 	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
-	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
 		ns.conn = os.conn;
 
 	/* we cannot fail (again) if we already detached */
 	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
 		ns.disk = D_DISKLESS;
 
-	/* if we are only D_ATTACHING yet,
-	 * we can (and should) go directly to D_DISKLESS. */
-	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
-		ns.disk = D_DISKLESS;
-
 	/* After C_DISCONNECTING only C_STANDALONE may follow */
 	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
 		ns.conn = os.conn;
@@ -863,10 +954,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	/* Abort resync if a disk fails/detaches */
 	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
 	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-		if (warn_sync_abort)
-			*warn_sync_abort =
-				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
-				"Online-verify" : "Resync";
+		if (warn)
+			*warn =	os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
 		ns.conn = C_CONNECTED;
 	}
 
@@ -877,7 +967,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 			ns.disk = mdev->new_state_tmp.disk;
 			ns.pdsk = mdev->new_state_tmp.pdsk;
 		} else {
-			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+			if (warn)
+				*warn = CONNECTION_LOST_NEGOTIATING;
 			ns.disk = D_DISKLESS;
 			ns.pdsk = D_UNKNOWN;
 		}
@@ -959,16 +1050,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 		ns.disk = disk_max;
 
 	if (ns.disk < disk_min) {
-		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
-			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_DISK;
 		ns.disk = disk_min;
 	}
 	if (ns.pdsk > pdsk_max)
 		ns.pdsk = pdsk_max;
 
 	if (ns.pdsk < pdsk_min) {
-		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
-			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_PDSK;
 		ns.pdsk = pdsk_min;
 	}
 
@@ -1045,12 +1136,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 {
 	union drbd_state os;
 	enum drbd_state_rv rv = SS_SUCCESS;
-	const char *warn_sync_abort = NULL;
+	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
 
 	os = mdev->state;
 
-	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+	ns = sanitize_state(mdev, os, ns, &ssw);
 
 	if (ns.i == os.i)
 		return SS_NOTHING_TO_DO;
@@ -1076,8 +1167,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		return rv;
 	}
 
-	if (warn_sync_abort)
-		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
+	print_sanitize_warnings(mdev, ssw);
 
 	{
 	char *pbp, pb[300];
@@ -1243,7 +1333,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		drbd_thread_stop_nowait(&mdev->receiver);
 
 	/* Upon network failure, we need to restart the receiver. */
-	if (os.conn > C_TEAR_DOWN &&
+	if (os.conn > C_WF_CONNECTION &&
 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
 		drbd_thread_restart_nowait(&mdev->receiver);
 
@@ -1251,6 +1341,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
 		drbd_resume_al(mdev);
 
+	/* remember last connect and attach times so request_timer_fn() won't
+	 * kill newly established sessions while we are still trying to thaw
+	 * previously frozen IO */
+	if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
+		mdev->last_reconnect_jif = jiffies;
+	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+	    ns.disk > D_NEGOTIATING)
+		mdev->last_reattach_jif = jiffies;
+
 	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
 	if (ascw) {
 		ascw->os = os;
@@ -1354,12 +1453,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Here we have the actions that are performed after a
 	   state change. This function might sleep */
 
+	if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
+		mod_timer(&mdev->request_timer, jiffies + HZ);
+
 	nsm.i = -1;
 	if (ns.susp_nod) {
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
 			what = resend;
 
-		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+		    ns.disk > D_NEGOTIATING)
 			what = restart_frozen_disk_io;
 
 		if (what != nothing)
@@ -1369,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	if (ns.susp_fen) {
 		/* case1: The outdate peer handler is successful: */
 		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
-			tl_clear(mdev);
 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 				drbd_uuid_new_current(mdev);
 				clear_bit(NEW_CUR_UUID, &mdev->flags);
 			}
 			spin_lock_irq(&mdev->req_lock);
+			_tl_clear(mdev);
 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
@@ -1407,8 +1510,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
+		/* we probably will start a resync soon.
+		 * make sure those things are properly reset. */
+		mdev->rs_total = 0;
+		mdev->rs_failed = 0;
+		atomic_set(&mdev->rs_pending_cnt, 0);
+		drbd_rs_cancel_all(mdev);
+
 		drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 	}
 	/* No point in queuing send_bitmap if we don't have a connection
 	 * anymore, so check also the _current_ state, not only the new state
@@ -1441,11 +1551,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	}
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+		    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 			drbd_uuid_new_current(mdev);
 			drbd_send_uuids(mdev);
 		}
-
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
 			/* We may still be Primary ourselves.
@@ -1473,14 +1583,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
 		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
 		drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 	}
 
 	/* We want to pause/continue resync, tell peer. */
 	if (ns.conn >= C_CONNECTED &&
 	     ((os.aftr_isp != ns.aftr_isp) ||
 	      (os.user_isp != ns.user_isp)))
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	/* In case one of the isp bits got set, suspend other devices. */
 	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1490,10 +1600,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Make sure the peer gets informed about eventual state
 	   changes (ISP bits) while we were in WFReportParams. */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	/* We are in the progress to start a full sync... */
 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1513,37 +1623,54 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* first half of local IO error, failure to attach,
 	 * or administrative detach */
 	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-		enum drbd_io_error_p eh;
-		int was_io_error;
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
 		/* corresponding get_ldev was in __drbd_set_state, to serialize
-		 * our cleanup here with the transition to D_DISKLESS,
-		 * so it is safe to dreference ldev here. */
-		eh = mdev->ldev->dc.on_io_error;
-		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
-
-		/* current state still has to be D_FAILED,
-		 * there is only one way out: to D_DISKLESS,
-		 * and that may only happen after our put_ldev below. */
-		if (mdev->state.disk != D_FAILED)
-			dev_err(DEV,
-				"ASSERT FAILED: disk is %s during detach\n",
-				drbd_disk_str(mdev->state.disk));
-
-		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
-		else
-			dev_err(DEV, "Sending state for detaching disk failed\n");
-
-		drbd_rs_cancel_all(mdev);
-
-		/* In case we want to get something to stable storage still,
-		 * this may be the last chance.
-		 * Following put_ldev may transition to D_DISKLESS. */
-		drbd_md_sync(mdev);
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But is is still not save to dreference ldev here, since
+		 * we might come from an failed Attach before ldev was set. */
+		if (mdev->ldev) {
+			eh = mdev->ldev->dc.on_io_error;
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+			if (was_io_error && eh == EP_CALL_HELPER)
+				drbd_khelper(mdev, "local-io-error");
+
+			/* Immediately allow completion of all application IO,
+			 * that waits for completion from the local disk,
+			 * if this was a force-detach due to disk_timeout
+			 * or administrator request (drbdsetup detach --force).
+			 * Do NOT abort otherwise.
+			 * Aborting local requests may cause serious problems,
+			 * if requests are completed to upper layers already,
+			 * and then later the already submitted local bio completes.
+			 * This can cause DMA into former bio pages that meanwhile
+			 * have been re-used for other things.
+			 * So aborting local requests may cause crashes,
+			 * or even worse, silent data corruption.
+			 */
+			if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
+				tl_abort_disk_io(mdev);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (mdev->state.disk != D_FAILED)
+				dev_err(DEV,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(mdev->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(mdev, ns);
+
+			drbd_rs_cancel_all(mdev);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS. */
+			drbd_md_sync(mdev);
+		}
 		put_ldev(mdev);
-
-		if (was_io_error && eh == EP_CALL_HELPER)
-			drbd_khelper(mdev, "local-io-error");
 	}
 
         /* second half of local IO error, failure to attach,
@@ -1557,20 +1684,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                                 "ASSERT FAILED: disk is %s while going diskless\n",
                                 drbd_disk_str(mdev->state.disk));
 
-                mdev->rs_total = 0;
-                mdev->rs_failed = 0;
-                atomic_set(&mdev->rs_pending_cnt, 0);
+		if (ns.conn >= C_CONNECTED)
+			drbd_send_state(mdev, ns);
 
-		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
 		/* corresponding get_ldev in __drbd_set_state
 		 * this may finally trigger drbd_ldev_destroy. */
 		put_ldev(mdev);
 	}
 
 	/* Notify peer that I had a local IO error, and did not detached.. */
-	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
-		drbd_send_state(mdev);
+	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev, ns);
 
 	/* Disks got bigger while they were detached */
 	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1588,7 +1712,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* sync target done with resync.  Explicitly notify peer, even though
 	 * it should (at least for non-empty resyncs) already know itself. */
 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
+
+	/* Wake up role changes, that were delayed because of connection establishing */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
+		clear_bit(STATE_SENT, &mdev->flags);
+		wake_up(&mdev->state_wait);
+	}
 
 	/* This triggers bitmap writeout of potentially still unwritten pages
 	 * if the resync finished cleanly, or aborted because of peer disk
@@ -1598,8 +1728,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	 * No harm done if some bits change during this phase.
 	 */
 	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
-			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
 		put_ldev(mdev);
 	}
 
@@ -2057,7 +2187,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
 
 	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
 
-	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
+	uuid = mdev->ldev->md.uuid[UI_BITMAP];
+	if (uuid && uuid != UUID_JUST_CREATED)
+		uuid = uuid + UUID_NEW_BM_OFFSET;
+	else
+		get_random_bytes(&uuid, sizeof(u64));
 	drbd_uuid_set(mdev, UI_BITMAP, uuid);
 	drbd_print_uuids(mdev, "updated sync UUID");
 	drbd_md_sync(mdev);
@@ -2071,7 +2205,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 {
 	struct p_sizes p;
 	sector_t d_size, u_size;
-	int q_order_type, max_bio_size;
+	int q_order_type;
+	unsigned int max_bio_size;
 	int ok;
 
 	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -2080,7 +2215,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 		u_size = mdev->ldev->dc.disk_size;
 		q_order_type = drbd_queue_order_type(mdev);
 		max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
-		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
+		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
 		put_ldev(mdev);
 	} else {
 		d_size = 0;
@@ -2089,6 +2224,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
 	}
 
+	/* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
+	if (mdev->agreed_pro_version <= 94)
+		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+
 	p.d_size = cpu_to_be64(d_size);
 	p.u_size = cpu_to_be64(u_size);
 	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
@@ -2102,10 +2241,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 }
 
 /**
- * drbd_send_state() - Sends the drbd state to the peer
+ * drbd_send_current_state() - Sends the drbd state to the peer
  * @mdev:	DRBD device.
  */
-int drbd_send_state(struct drbd_conf *mdev)
+int drbd_send_current_state(struct drbd_conf *mdev)
 {
 	struct socket *sock;
 	struct p_state p;
@@ -2131,6 +2270,37 @@ int drbd_send_state(struct drbd_conf *mdev)
 	return ok;
 }
 
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @mdev:	DRBD device.
+ * @state:	the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(state.i);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header80 *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
 int drbd_send_state_req(struct drbd_conf *mdev,
 	union drbd_state mask, union drbd_state val)
 {
@@ -2615,7 +2785,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 	/* hint all but last page with MSG_MORE */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		if (!_drbd_no_send_page(mdev, bvec->bv_page,
 				     bvec->bv_offset, bvec->bv_len,
 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2629,7 +2799,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 	/* hint all but last page with MSG_MORE */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		if (!_drbd_send_page(mdev, bvec->bv_page,
 				     bvec->bv_offset, bvec->bv_len,
 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2695,8 +2865,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 
 	p.sector   = cpu_to_be64(req->sector);
 	p.block_id = (unsigned long)req;
-	p.seq_num  = cpu_to_be32(req->seq_num =
-				 atomic_add_return(1, &mdev->packet_seq));
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
 
 	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
 
@@ -2987,8 +3156,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->rs_sect_in, 0);
 	atomic_set(&mdev->rs_sect_ev, 0);
 	atomic_set(&mdev->ap_in_flight, 0);
+	atomic_set(&mdev->md_io_in_use, 0);
 
-	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
 	mutex_init(&mdev->meta.mutex);
 	sema_init(&mdev->data.work.s, 0);
@@ -3126,6 +3295,10 @@ static void drbd_destroy_mempools(void)
 
 	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
 
+	if (drbd_md_io_bio_set)
+		bioset_free(drbd_md_io_bio_set);
+	if (drbd_md_io_page_pool)
+		mempool_destroy(drbd_md_io_page_pool);
 	if (drbd_ee_mempool)
 		mempool_destroy(drbd_ee_mempool);
 	if (drbd_request_mempool)
@@ -3139,6 +3312,8 @@ static void drbd_destroy_mempools(void)
 	if (drbd_al_ext_cache)
 		kmem_cache_destroy(drbd_al_ext_cache);
 
+	drbd_md_io_bio_set   = NULL;
+	drbd_md_io_page_pool = NULL;
 	drbd_ee_mempool      = NULL;
 	drbd_request_mempool = NULL;
 	drbd_ee_cache        = NULL;
@@ -3162,6 +3337,8 @@ static int drbd_create_mempools(void)
 	drbd_bm_ext_cache    = NULL;
 	drbd_al_ext_cache    = NULL;
 	drbd_pp_pool         = NULL;
+	drbd_md_io_page_pool = NULL;
+	drbd_md_io_bio_set   = NULL;
 
 	/* caches */
 	drbd_request_cache = kmem_cache_create(
@@ -3185,6 +3362,16 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_bio_set == NULL)
+		goto Enomem;
+#endif
+
+	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_page_pool == NULL)
+		goto Enomem;
+
 	drbd_request_mempool = mempool_create(number,
 		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
 	if (drbd_request_mempool == NULL)
@@ -3262,6 +3449,8 @@ static void drbd_delete_device(unsigned int minor)
 	if (!mdev)
 		return;
 
+	del_timer_sync(&mdev->request_timer);
+
 	/* paranoia asserts */
 	if (mdev->open_cnt != 0)
 		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
@@ -3344,9 +3533,9 @@ static void drbd_cleanup(void)
 }
 
 /**
- * drbd_congested() - Callback for pdflush
+ * drbd_congested() - Callback for the flusher thread
  * @congested_data:	User data
- * @bdi_bits:		Bits pdflush is currently interested in
+ * @bdi_bits:		Bits the BDI flusher thread is currently interested in
  *
  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
  */
@@ -3364,6 +3553,22 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 		goto out;
 	}
 
+	if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
+		r |= (1 << BDI_async_congested);
+		/* Without good local data, we would need to read from remote,
+		 * and that would need the worker thread as well, which is
+		 * currently blocked waiting for that usermode helper to
+		 * finish.
+		 */
+		if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
+			r |= (1 << BDI_sync_congested);
+		else
+			put_ldev(mdev);
+		r &= bdi_bits;
+		reason = 'c';
+		goto out;
+	}
+
 	if (get_ldev(mdev)) {
 		q = bdev_get_queue(mdev->ldev->backing_bdev);
 		r = bdi_congested(&q->backing_dev_info, bdi_bits);
@@ -3427,6 +3632,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
 	q->backing_dev_info.congested_data = mdev;
 
 	blk_queue_make_request(q, drbd_make_request);
+	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
 	/* Setting the max_hw_sectors to an odd value of 8kibyte here
 	   This triggers a max_bio_size message upon first attach or connect */
 	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
@@ -3666,8 +3872,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	if (!get_ldev_if_state(mdev, D_FAILED))
 		return;
 
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
+
 	memset(buffer, 0, 512);
 
 	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -3691,14 +3899,15 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
 		/* this was a try anyways ... */
 		dev_err(DEV, "meta data update failed!\n");
-		drbd_chk_io_error(mdev, 1, true);
+		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
 	}
 
 	/* Update mdev->ldev->md.la_size_sect,
 	 * since we updated it on metadata. */
 	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
 
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
+out:
 	put_ldev(mdev);
 }
 
@@ -3718,8 +3927,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	if (!get_ldev_if_state(mdev, D_ATTACHING))
 		return ERR_IO_MD_DISK;
 
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
 
 	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 		/* NOTE: can't do normal error processing here as this is
@@ -3769,9 +3979,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 
 	spin_lock_irq(&mdev->req_lock);
 	if (mdev->state.conn < C_CONNECTED) {
-		int peer;
+		unsigned int peer;
 		peer = be32_to_cpu(buffer->la_peer_max_bio_size);
-		peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
+		peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
 		mdev->peer_max_bio_size = peer;
 	}
 	spin_unlock_irq(&mdev->req_lock);
@@ -3780,7 +3990,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		mdev->sync_conf.al_extents = 127;
 
  err:
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
+ out:
 	put_ldev(mdev);
 
 	return rv;
@@ -4183,12 +4394,11 @@ const char *drbd_buildtag(void)
 	static char buildtag[38] = "\0uilt-in";
 
 	if (buildtag[0] == 0) {
-#ifdef CONFIG_MODULES
-		if (THIS_MODULE != NULL)
-			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
-		else
+#ifdef MODULE
+		sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+		buildtag[0] = 'b';
 #endif
-			buildtag[0] = 'b';
 	}
 
 	return buildtag;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0feab261e295..edb490aad8b4 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -94,7 +94,7 @@ static int name ## _from_tags(struct drbd_conf *mdev, \
 		 arg->member ## _len = dlen; \
 		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
 		 break;
-#include "linux/drbd_nl.h"
+#include <linux/drbd_nl.h>
 
 /* Generate the struct to tag_list functions */
 #define NL_PACKET(name, number, fields) \
@@ -129,7 +129,7 @@ name ## _to_tags(struct drbd_conf *mdev, \
 	put_unaligned(arg->member ## _len, tags++);	\
 	memcpy(tags, arg->member, arg->member ## _len); \
 	tags = (unsigned short *)((char *)tags + arg->member ## _len);
-#include "linux/drbd_nl.h"
+#include <linux/drbd_nl.h>
 
 void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
 void drbd_nl_send_reply(struct cn_msg *, int);
@@ -147,6 +147,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 	char *argv[] = {usermode_helper, cmd, mb, NULL };
 	int ret;
 
+	if (current == mdev->worker.task)
+		set_bit(CALLBACK_PENDING, &mdev->flags);
+
 	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
 
 	if (get_net_conf(mdev)) {
@@ -179,7 +182,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 
 	drbd_bcast_ev_helper(mdev, cmd);
-	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 	if (ret)
 		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 				usermode_helper, cmd, mb,
@@ -189,6 +192,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 				usermode_helper, cmd, mb,
 				(ret >> 8) & 0xff, ret);
 
+	if (current == mdev->worker.task)
+		clear_bit(CALLBACK_PENDING, &mdev->flags);
+
 	if (ret < 0) /* Ignore any ERRNOs we got. */
 		ret = 0;
 
@@ -289,7 +295,7 @@ static int _try_outdate_peer_async(void *data)
 	*/
 	spin_lock_irq(&mdev->req_lock);
 	ns = mdev->state;
-	if (ns.conn < C_WF_REPORT_PARAMS) {
+	if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
 		ns.pdsk = nps;
 		_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
 	}
@@ -432,7 +438,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		/* if this was forced, we should consider sync */
 		if (forced)
 			drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_current_state(mdev);
 	}
 
 	drbd_md_sync(mdev);
@@ -668,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 			 la_size_changed && md_moved ? "size changed and md moved" :
 			 la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-		err = drbd_bitmap_io(mdev, &drbd_bm_write,
-				"size changed", BM_LOCKED_MASK);
+		err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+				     "size changed", BM_LOCKED_MASK);
 		if (err) {
 			rv = dev_size_error;
 			goto out;
@@ -795,8 +801,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
 static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
 {
 	struct request_queue * const q = mdev->rq_queue;
-	int max_hw_sectors = max_bio_size >> 9;
-	int max_segments = 0;
+	unsigned int max_hw_sectors = max_bio_size >> 9;
+	unsigned int max_segments = 0;
 
 	if (get_ldev_if_state(mdev, D_ATTACHING)) {
 		struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
@@ -829,7 +835,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_
 
 void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
 {
-	int now, new, local, peer;
+	unsigned int now, new, local, peer;
 
 	now = queue_max_hw_sectors(mdev->rq_queue) << 9;
 	local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */
@@ -840,23 +846,25 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
 		mdev->local_max_bio_size = local;
 		put_ldev(mdev);
 	}
+	local = min(local, DRBD_MAX_BIO_SIZE);
 
 	/* We may ignore peer limits if the peer is modern enough.
 	   Because new from 8.3.8 onwards the peer can use multiple
 	   BIOs for a single peer_request */
 	if (mdev->state.conn >= C_CONNECTED) {
-		if (mdev->agreed_pro_version < 94)
-			peer = mdev->peer_max_bio_size;
-		else if (mdev->agreed_pro_version == 94)
+		if (mdev->agreed_pro_version < 94) {
+			peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+		} else if (mdev->agreed_pro_version == 94)
 			peer = DRBD_MAX_SIZE_H80_PACKET;
 		else /* drbd 8.3.8 onwards */
 			peer = DRBD_MAX_BIO_SIZE;
 	}
 
-	new = min_t(int, local, peer);
+	new = min(local, peer);
 
 	if (mdev->state.role == R_PRIMARY && new < now)
-		dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
+		dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
 
 	if (new != now)
 		dev_info(DEV, "max BIO size = %u\n", new);
@@ -949,6 +957,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * to realize a "hot spare" feature (not that I'd recommend that) */
 	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
 
+	/* make sure there is no leftover from previous force-detach attempts */
+	clear_bit(FORCE_DETACH, &mdev->flags);
+
+	/* and no leftover from previously aborted resync or verify, either */
+	mdev->rs_total = 0;
+	mdev->rs_failed = 0;
+	atomic_set(&mdev->rs_pending_cnt, 0);
+
 	/* allocation not in the IO path, cqueue thread context */
 	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
 	if (!nbc) {
@@ -1032,7 +1048,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) nbc->dc.disk_size);
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto fail;
 	}
 
@@ -1046,7 +1062,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	}
 
 	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
-		retcode = ERR_MD_DISK_TO_SMALL;
+		retcode = ERR_MD_DISK_TOO_SMALL;
 		dev_warn(DEV, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
@@ -1057,7 +1073,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * (we may currently be R_PRIMARY with no local disk...) */
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(mdev->this_bdev)) {
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto fail;
 	}
 
@@ -1138,7 +1154,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
 	    drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
 		dev_warn(DEV, "refusing to truncate a consistent device\n");
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto force_diskless_dec;
 	}
 
@@ -1336,17 +1352,35 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 {
 	enum drbd_ret_code retcode;
 	int ret;
+	struct detach dt = {};
+
+	if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		goto out;
+	}
+
+	if (dt.detach_force) {
+		set_bit(FORCE_DETACH, &mdev->flags);
+		drbd_force_state(mdev, NS(disk, D_FAILED));
+		reply->ret_code = SS_SUCCESS;
+		goto out;
+	}
+
 	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
+	drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
 	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
+	drbd_md_put_buffer(mdev);
 	/* D_FAILED will transition to DISKLESS. */
 	ret = wait_event_interruptible(mdev->misc_wait,
 			mdev->state.disk != D_FAILED);
 	drbd_resume_io(mdev);
+
 	if ((int)retcode == (int)SS_IS_DISKLESS)
 		retcode = SS_NOTHING_TO_DO;
 	if (ret)
 		retcode = ERR_INTR;
 	reply->ret_code = retcode;
+out:
 	return 0;
 }
 
@@ -1711,7 +1745,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 	if (rs.no_resync && mdev->agreed_pro_version < 93) {
 		retcode = ERR_NEED_APV_93;
-		goto fail;
+		goto fail_ldev;
 	}
 
 	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
@@ -1738,6 +1772,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
  fail:
 	reply->ret_code = retcode;
 	return 0;
+
+ fail_ldev:
+	put_ldev(mdev);
+	goto fail;
 }
 
 static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1940,8 +1978,11 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 	int retcode;
 
 	/* If there is still bitmap IO pending, probably because of a previous
-	 * resync just being finished, wait for it before requesting a new resync. */
+	 * resync just being finished, wait for it before requesting a new resync.
+	 * Also wait for it's after_state_ch(). */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+	drbd_flush_workqueue(mdev);
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
 
@@ -1959,6 +2000,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 
 		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
 	}
+	drbd_resume_io(mdev);
 
 	reply->ret_code = retcode;
 	return 0;
@@ -1979,8 +2021,11 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
 	int retcode;
 
 	/* If there is still bitmap IO pending, probably because of a previous
-	 * resync just being finished, wait for it before requesting a new resync. */
+	 * resync just being finished, wait for it before requesting a new resync.
+	 * Also wait for it's after_state_ch(). */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+	drbd_flush_workqueue(mdev);
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
 
@@ -1998,6 +2043,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
 		} else
 			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
 	}
+	drbd_resume_io(mdev);
 
 	reply->ret_code = retcode;
 	return 0;
@@ -2170,11 +2216,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 	/* If there is still bitmap IO pending, e.g. previous resync or verify
 	 * just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	/* w_make_ov_request expects position to be aligned */
 	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
 	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+	drbd_resume_io(mdev);
 	return 0;
 }
 
@@ -2297,7 +2345,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
 		return;
 	}
 
-	if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) {
+	if (!capable(CAP_SYS_ADMIN)) {
 		retcode = ERR_PERM;
 		goto fail;
 	}
@@ -2526,10 +2574,10 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 
 	page = e->pages;
 	page_chain_for_each(page) {
-		void *d = kmap_atomic(page, KM_USER0);
+		void *d = kmap_atomic(page);
 		unsigned l = min_t(unsigned, len, PAGE_SIZE);
 		memcpy(tl, d, l);
-		kunmap_atomic(d, KM_USER0);
+		kunmap_atomic(d);
 		tl = (unsigned short*)((char*)tl + l);
 		len -= l;
 		if (len == 0)
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2959cdfb77f5..5496104f90b9 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
 	if (unlikely(v >= 1000000)) {
 		/* cool: > GiByte/s */
 		seq_printf(seq, "%ld,", v / 1000000);
-		v /= 1000000;
+		v %= 1000000;
 		seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
 	} else if (likely(v >= 1000))
 		seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
@@ -245,6 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 		    mdev->state.role == R_SECONDARY) {
 			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
 		} else {
+			/* reset mdev->congestion_reason */
+			bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
+
 			seq_printf(seq,
 			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
 			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 43beaca53179..c74ca2df7431 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -277,6 +277,9 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
 	atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
 	int i;
 
+	if (page == NULL)
+		return;
+
 	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
 		i = page_chain_free(page);
 	else {
@@ -316,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 				     gfp_t gfp_mask) __must_hold(local)
 {
 	struct drbd_epoch_entry *e;
-	struct page *page;
+	struct page *page = NULL;
 	unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 
 	if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
@@ -329,9 +332,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 		return NULL;
 	}
 
-	page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
-	if (!page)
-		goto fail;
+	if (data_size) {
+		page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+		if (!page)
+			goto fail;
+	}
 
 	INIT_HLIST_NODE(&e->collision);
 	e->epoch = NULL;
@@ -466,6 +471,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what,
 		goto out;
 	}
 	(*newsock)->ops  = sock->ops;
+	__module_get((*newsock)->ops->owner);
 
 out:
 	return err;
@@ -664,7 +670,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
 	timeo = mdev->net_conf->try_connect_int * HZ;
 	timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
 
-	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
+	s_listen->sk->sk_reuse    = SK_CAN_REUSE; /* SO_REUSEADDR */
 	s_listen->sk->sk_rcvtimeo = timeo;
 	s_listen->sk->sk_sndtimeo = timeo;
 	drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
@@ -750,6 +756,7 @@ static int drbd_connect(struct drbd_conf *mdev)
 {
 	struct socket *s, *sock, *msock;
 	int try, h, ok;
+	enum drbd_state_rv rv;
 
 	D_ASSERT(!mdev->data.socket);
 
@@ -841,8 +848,8 @@ retry:
 		}
 	} while (1);
 
-	msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
-	sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+	msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+	sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 
 	sock->sk->sk_allocation = GFP_NOIO;
 	msock->sk->sk_allocation = GFP_NOIO;
@@ -888,25 +895,32 @@ retry:
 		}
 	}
 
-	if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
-		return 0;
-
 	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 
 	atomic_set(&mdev->packet_seq, 0);
 	mdev->peer_seq = 0;
 
-	drbd_thread_start(&mdev->asender);
-
 	if (drbd_send_protocol(mdev) == -1)
 		return -1;
+	set_bit(STATE_SENT, &mdev->flags);
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
 	drbd_send_sizes(mdev, 0, 0);
 	drbd_send_uuids(mdev);
-	drbd_send_state(mdev);
+	drbd_send_current_state(mdev);
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 	clear_bit(RESIZE_PENDING, &mdev->flags);
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
+	if (mdev->state.conn != C_WF_REPORT_PARAMS)
+		clear_bit(STATE_SENT, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		return 0;
+
+	drbd_thread_start(&mdev->asender);
 	mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
 
 	return 1;
@@ -957,7 +971,7 @@ static void drbd_flush(struct drbd_conf *mdev)
 		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
 					NULL);
 		if (rv) {
-			dev_err(DEV, "local disk flush failed with status %d\n", rv);
+			dev_info(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
 			 * don't try again for ANY return value != 0
 			 * if (rv == -EOPNOTSUPP) */
@@ -1001,13 +1015,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 		if (epoch_size != 0 &&
 		    atomic_read(&epoch->active) == 0 &&
-		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
 			if (!(ev & EV_CLEANUP)) {
 				spin_unlock(&mdev->epoch_lock);
 				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
 				spin_lock(&mdev->epoch_lock);
 			}
-			dec_unacked(mdev);
+			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+				dec_unacked(mdev);
 
 			if (mdev->current_epoch != epoch) {
 				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
@@ -1096,7 +1111,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
-	 * request in more than one bio. */
+	 * request in more than one bio.
+	 *
+	 * Plain bio_alloc is good enough here, this is no DRBD internally
+	 * generated bio, but a bio allocated on behalf of the peer.
+	 */
 next_bio:
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	if (!bio) {
@@ -1256,7 +1275,6 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
 
 	data_size -= dgs;
 
-	ERR_IF(data_size == 0) return NULL;
 	ERR_IF(data_size &  0x1ff) return NULL;
 	ERR_IF(data_size >  DRBD_MAX_BIO_SIZE) return NULL;
 
@@ -1277,6 +1295,9 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
 	if (!e)
 		return NULL;
 
+	if (!data_size)
+		return e;
+
 	ds = data_size;
 	page = e->pages;
 	page_chain_for_each(page) {
@@ -1583,6 +1604,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
 	return ok;
 }
 
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+{
+
+	struct drbd_epoch_entry *rs_e;
+	bool rv = 0;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
+		if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+			rv = 1;
+			break;
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return rv;
+}
+
 /* Called from receive_Data.
  * Synchronize packets on sock with packets on msock.
  *
@@ -1683,6 +1722,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
 	dp_flags = be32_to_cpu(p->dp_flags);
 	rw |= wire_flags_to_bio(mdev, dp_flags);
+	if (e->pages == NULL) {
+		D_ASSERT(e->size == 0);
+		D_ASSERT(dp_flags & DP_FLUSH);
+	}
 
 	if (dp_flags & DP_MAY_SET_IN_SYNC)
 		e->flags |= EE_MAY_SET_IN_SYNC;
@@ -1826,6 +1869,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	list_add(&e->w.list, &mdev->active_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
+	if (mdev->state.conn == C_SYNC_TARGET)
+		wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
+
 	switch (mdev->net_conf->wire_protocol) {
 	case DRBD_PROT_C:
 		inc_unacked(mdev);
@@ -2420,7 +2466,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
 			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
 
-			dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
+			dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
 			drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
 
 			return -1;
@@ -2806,10 +2852,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
 
 	if (apv >= 88) {
 		if (apv == 88) {
-			if (data_size > SHARED_SECRET_MAX) {
-				dev_err(DEV, "verify-alg too long, "
-				    "peer wants %u, accepting only %u byte\n",
-						data_size, SHARED_SECRET_MAX);
+			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+				dev_err(DEV, "verify-alg of wrong size, "
+					"peer wants %u, accepting only up to %u byte\n",
+					data_size, SHARED_SECRET_MAX);
 				return false;
 			}
 
@@ -3168,9 +3214,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	os = ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
 
-	/* peer says his disk is uptodate, while we think it is inconsistent,
-	 * and this happens while we think we have a sync going on. */
-	if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+	/* If some other part of the code (asender thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (os.conn <= C_TEAR_DOWN)
+		return false;
+
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+	 * set) resync started in PausedSyncT, or if the timing of pause-/
+	 * unpause-sync events has been "just right", the peer disk may
+	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+	 */
+	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+	    real_peer_disk == D_UP_TO_DATE &&
 	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
 		/* If we are (becoming) SyncSource, but peer is still in sync
 		 * preparation, ignore its uptodate-ness to avoid flapping, it
@@ -3288,7 +3345,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 			/* Nowadays only used when forcing a node into primary role and
 			   setting its disk to UpToDate with that */
 			drbd_send_uuids(mdev);
-			drbd_send_state(mdev);
+			drbd_send_current_state(mdev);
 		}
 	}
 
@@ -3755,11 +3812,18 @@ void drbd_free_tl_hash(struct drbd_conf *mdev)
 	mdev->ee_hash = NULL;
 	mdev->ee_hash_s = 0;
 
-	/* paranoia code */
-	for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
-		if (h->first)
-			dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
-				(int)(h - mdev->tl_hash), h->first);
+	/* We may not have had the chance to wait for all locally pending
+	 * application requests. The hlist_add_fake() prevents access after
+	 * free on master bio completion. */
+	for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) {
+		struct drbd_request *req;
+		struct hlist_node *pos, *n;
+		hlist_for_each_entry_safe(req, pos, n, h, collision) {
+			hlist_del_init(&req->collision);
+			hlist_add_fake(&req->collision);
+		}
+	}
+
 	kfree(mdev->tl_hash);
 	mdev->tl_hash = NULL;
 	mdev->tl_hash_s = 0;
@@ -3776,6 +3840,13 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	if (mdev->state.conn == C_STANDALONE)
 		return;
 
+	/* We are about to start the cleanup after connection loss.
+	 * Make sure drbd_make_request knows about that.
+	 * Usually we should be in some network failure state already,
+	 * but just in case we are not, we fix it up here.
+	 */
+	drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+
 	/* asender does not clean up anything. it must not interfere, either */
 	drbd_thread_stop(&mdev->asender);
 	drbd_free_sock(mdev);
@@ -3803,8 +3874,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	atomic_set(&mdev->rs_pending_cnt, 0);
 	wake_up(&mdev->misc_wait);
 
-	del_timer(&mdev->request_timer);
-
 	/* make sure syncer is stopped and w_resume_next_sg queued */
 	del_timer_sync(&mdev->resync_timer);
 	resync_timer_fn((unsigned long)mdev);
@@ -4433,7 +4502,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
 
 	if (mdev->state.conn == C_AHEAD &&
 	    atomic_read(&mdev->ap_in_flight) == 0 &&
-	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
+	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
 		mdev->start_resync_timer.expires = jiffies + HZ;
 		add_timer(&mdev->start_resync_timer);
 	}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3424d675b769..01b2ac641c7b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
 	const int rw = bio_data_dir(bio);
 	int cpu;
 	cpu = part_stat_lock();
+	part_round_stats(cpu, &mdev->vdisk->part0);
 	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
 	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
 	part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 {
 	const unsigned long s = req->rq_state;
 	struct drbd_conf *mdev = req->mdev;
-	/* only WRITES may end up here without a master bio (on barrier ack) */
-	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
 
 	/* we must not complete the master bio, while it is
 	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		return;
 	if (s & RQ_NET_PENDING)
 		return;
-	if (s & RQ_LOCAL_PENDING)
+	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
 		return;
 
 	if (req->master_bio) {
@@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		req->master_bio = NULL;
 	}
 
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
 	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
 		/* this is disconnected (local only) operation,
 		 * or protocol C P_WRITE_ACK,
@@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case completed_ok:
-		if (bio_data_dir(req->master_bio) == WRITE)
+		if (req->rq_state & RQ_WRITE)
 			mdev->writ_cnt += req->size>>9;
 		else
 			mdev->read_cnt += req->size>>9;
@@ -438,16 +441,22 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
+		break;
+
+	case abort_disk_io:
+		req->rq_state |= RQ_LOCAL_ABORTED;
+		if (req->rq_state & RQ_WRITE)
+			_req_may_be_done_not_susp(req, m);
+		else
+			goto goto_queue_for_net_read;
 		break;
 
 	case write_completed_with_error:
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
-		__drbd_chk_io_error(mdev, false);
+		__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_ahead_completed_with_error:
@@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_completed_with_error:
@@ -464,10 +472,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
-		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+		if (req->rq_state & RQ_LOCAL_ABORTED) {
+			_req_may_be_done(req, m);
+			break;
+		}
 
-		__drbd_chk_io_error(mdev, false);
-		put_ldev(mdev);
+		__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
+
+	goto_queue_for_net_read:
+
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 
 		/* no point in retrying if there is no good remote data,
 		 * or we have no connection. */
@@ -556,10 +570,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		drbd_queue_work(&mdev->data.work, &req->w);
 		break;
 
-	case oos_handed_to_network:
-		/* actually the same */
+	case read_retry_remote_canceled:
 	case send_canceled:
-		/* treat it the same */
 	case send_failed:
 		/* real cleanup will be done from tl_clear.  just update flags
 		 * so it is no longer marked as on the worker queue */
@@ -589,17 +601,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		}
 		req->rq_state &= ~RQ_NET_QUEUED;
 		req->rq_state |= RQ_NET_SENT;
-		/* because _drbd_send_zc_bio could sleep, and may want to
-		 * dereference the bio even after the "write_acked_by_peer" and
-		 * "completed_ok" events came in, once we return from
-		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
-		 * whether it is done already, and end it.  */
 		_req_may_be_done_not_susp(req, m);
 		break;
 
-	case read_retry_remote_canceled:
+	case oos_handed_to_network:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
 		req->rq_state &= ~RQ_NET_QUEUED;
-		/* fall through, in case we raced with drbd_disconnect */
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
 	case connection_lost_while_pending:
 		/* transfer log cleanup after connection loss */
 		/* assert something? */
@@ -616,8 +628,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
-	case write_acked_by_peer_and_sis:
-		req->rq_state |= RQ_NET_SIS;
 	case conflict_discarded_by_peer:
 		/* for discarded conflicting writes of multiple primaries,
 		 * there is no need to keep anything in the tl, potential
@@ -628,18 +638,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			      (unsigned long long)req->sector, req->size);
 		req->rq_state |= RQ_NET_DONE;
 		/* fall through */
+	case write_acked_by_peer_and_sis:
 	case write_acked_by_peer:
+		if (what == write_acked_by_peer_and_sis)
+			req->rq_state |= RQ_NET_SIS;
 		/* protocol C; successfully written on peer.
-		 * Nothing to do here.
+		 * Nothing more to do here.
 		 * We want to keep the tl in place for all protocols, to cater
-		 * for volatile write-back caches on lower level devices.
-		 *
-		 * A barrier request is expected to have forced all prior
-		 * requests onto stable storage, so completion of a barrier
-		 * request could set NET_DONE right here, and not wait for the
-		 * P_BARRIER_ACK, but that is an unnecessary optimization. */
+		 * for volatile write-back caches on lower level devices. */
 
-		/* this makes it effectively the same as for: */
 	case recv_acked_by_peer:
 		/* protocol B; pretends to be successfully written on peer.
 		 * see also notes above in handed_over_to_network about
@@ -688,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case resend:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			_req_may_be_done(req, m);
+			break;
+		}
+
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
@@ -763,6 +776,40 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
 	return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
 }
 
+static void maybe_pull_ahead(struct drbd_conf *mdev)
+{
+	int congested = 0;
+
+	/* If I don't even have good local storage, we can not reasonably try
+	 * to pull ahead of the peer. We also need the local reference to make
+	 * sure mdev->act_log is there.
+	 * Note: caller has to make sure that net_conf is there.
+	 */
+	if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
+		return;
+
+	if (mdev->net_conf->cong_fill &&
+	    atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+		dev_info(DEV, "Congestion-fill threshold reached\n");
+		congested = 1;
+	}
+
+	if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+		dev_info(DEV, "Congestion-extents threshold reached\n");
+		congested = 1;
+	}
+
+	if (congested) {
+		queue_barrier(mdev); /* last barrier, after mirrored writes */
+
+		if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+			_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+		else  /*mdev->net_conf->on_congestion == OC_DISCONNECT */
+			_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+	}
+	put_ldev(mdev);
+}
+
 static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
 {
 	const int rw = bio_rw(bio);
@@ -773,6 +820,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	int local, remote, send_oos = 0;
 	int err = -EIO;
 	int ret = 0;
+	union drbd_state s;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -792,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		req->private_bio = NULL;
 	}
 	if (rw == WRITE) {
-		remote = 1;
+		/* Need to replicate writes.  Unless it is an empty flush,
+		 * which is better mapped to a DRBD P_BARRIER packet,
+		 * also for drbd wire protocol compatibility reasons. */
+		if (unlikely(size == 0)) {
+			/* The only size==0 bios we expect are empty flushes. */
+			D_ASSERT(bio->bi_rw & REQ_FLUSH);
+			remote = 0;
+		} else
+			remote = 1;
 	} else {
 		/* READ || READA */
 		if (local) {
@@ -828,14 +884,18 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	 * extent.  This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
-	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	 * of transactional on-disk meta data updates.
+	 * Empty flushes don't need to go into the activity log, they can only
+	 * flush data for pending writes which are already in there. */
+	if (rw == WRITE && local && size
+	&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, sector);
 	}
 
-	remote = remote && drbd_should_do_remote(mdev->state);
-	send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+	s = mdev->state;
+	remote = remote && drbd_should_do_remote(s);
+	send_oos = rw == WRITE && drbd_should_send_oos(s);
 	D_ASSERT(!(remote && send_oos));
 
 	if (!(local || remote) && !is_susp(mdev->state)) {
@@ -867,7 +927,7 @@ allocate_barrier:
 
 	if (is_susp(mdev->state)) {
 		/* If we got suspended, use the retry mechanism of
-		   generic_make_request() to restart processing of this
+		   drbd_make_request() to restart processing of this
 		   bio. In the next call to drbd_make_request
 		   we sleep in inc_ap_bio() */
 		ret = 1;
@@ -951,7 +1011,10 @@ allocate_barrier:
 	if (rw == WRITE && _req_conflicts(req))
 		goto fail_conflicting;
 
-	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(size!=0))
+		list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
 
 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO.  */
@@ -968,29 +1031,16 @@ allocate_barrier:
 		_req_mod(req, queue_for_send_oos);
 
 	if (remote &&
-	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
-		int congested = 0;
-
-		if (mdev->net_conf->cong_fill &&
-		    atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
-			dev_info(DEV, "Congestion-fill threshold reached\n");
-			congested = 1;
-		}
-
-		if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
-			dev_info(DEV, "Congestion-extents threshold reached\n");
-			congested = 1;
-		}
-
-		if (congested) {
-			queue_barrier(mdev); /* last barrier, after mirrored writes */
-
-			if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
-				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
-			else  /*mdev->net_conf->on_congestion == OC_DISCONNECT */
-				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
-		}
-	}
+	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
+		maybe_pull_ahead(mdev);
+
+	/* If this was a flush, queue a drbd barrier/start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(bio->bi_rw & REQ_FLUSH)
+		&& mdev->newest_tle->n_writes
+		&& drbd_should_do_remote(mdev->state))
+		queue_barrier(mdev);
 
 	spin_unlock_irq(&mdev->req_lock);
 	kfree(b); /* if someone else has beaten us to it... */
@@ -1073,7 +1123,7 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 	return 0;
 }
 
-int drbd_make_request(struct request_queue *q, struct bio *bio)
+void drbd_make_request(struct request_queue *q, struct bio *bio)
 {
 	unsigned int s_enr, e_enr;
 	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
@@ -1081,7 +1131,7 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
 
 	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
 		bio_endio(bio, -EPERM);
-		return 0;
+		return;
 	}
 
 	start_time = jiffies;
@@ -1089,18 +1139,18 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
 	/*
 	 * what we "blindly" assume:
 	 */
-	D_ASSERT(bio->bi_size > 0);
 	D_ASSERT((bio->bi_size & 0x1ff) == 0);
-	D_ASSERT(bio->bi_idx == 0);
 
 	/* to make some things easier, force alignment of requests within the
 	 * granularity of our hash tables */
 	s_enr = bio->bi_sector >> HT_SHIFT;
-	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+	e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
 
 	if (likely(s_enr == e_enr)) {
-		inc_ap_bio(mdev, 1);
-		return drbd_make_request_common(mdev, bio, start_time);
+		do {
+			inc_ap_bio(mdev, 1);
+		} while (drbd_make_request_common(mdev, bio, start_time));
+		return;
 	}
 
 	/* can this bio be split generically?
@@ -1148,7 +1198,6 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
 
 		bio_pair_release(bp);
 	}
-	return 0;
 }
 
 /* This is called by bio_add_page().  With this function we reduce
@@ -1196,36 +1245,66 @@ void request_timer_fn(unsigned long data)
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 	struct drbd_request *req; /* oldest request */
 	struct list_head *le;
-	unsigned long et = 0; /* effective timeout = ko_count * timeout */
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
 
 	if (get_net_conf(mdev)) {
-		et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+		if (mdev->state.conn >= C_WF_REPORT_PARAMS)
+			ent = mdev->net_conf->timeout*HZ/10
+				* mdev->net_conf->ko_count;
 		put_net_conf(mdev);
 	}
-	if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+	if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+		put_ldev(mdev);
+	}
+	et = min_not_zero(dt, ent);
+
+	if (!et)
 		return; /* Recurring timer stopped */
 
+	now = jiffies;
+
 	spin_lock_irq(&mdev->req_lock);
 	le = &mdev->oldest_tle->requests;
 	if (list_empty(le)) {
 		spin_unlock_irq(&mdev->req_lock);
-		mod_timer(&mdev->request_timer, jiffies + et);
+		mod_timer(&mdev->request_timer, now + et);
 		return;
 	}
 
 	le = le->prev;
 	req = list_entry(le, struct drbd_request, tl_requests);
-	if (time_is_before_eq_jiffies(req->start_time + et)) {
-		if (req->rq_state & RQ_NET_PENDING) {
-			dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
-			_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
-		} else {
-			dev_warn(DEV, "Local backing block device frozen?\n");
-			mod_timer(&mdev->request_timer, jiffies + et);
-		}
-	} else {
-		mod_timer(&mdev->request_timer, req->start_time + et);
-	}
 
+	/* The request is considered timed out, if
+	 * - we have some effective timeout from the configuration,
+	 *   with above state restrictions applied,
+	 * - the oldest request is waiting for a response from the network
+	 *   resp. the local disk,
+	 * - the oldest request is in fact older than the effective timeout,
+	 * - the connection was established (resp. disk was attached)
+	 *   for longer than the timeout already.
+	 * Note that for 32bit jiffies and very stable connections/disks,
+	 * we may have a wrap around, which is catched by
+	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+	 *
+	 * Side effect: once per 32bit wrap-around interval, which means every
+	 * ~198 days with 250 HZ, we have a window where the timeout would need
+	 * to expire twice (worst case) to become effective. Good enough.
+	 */
+	if (ent && req->rq_state & RQ_NET_PENDING &&
+		 time_after(now, req->start_time + ent) &&
+		!time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+		dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+		_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+	}
+	if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+		 time_after(now, req->start_time + dt) &&
+		!time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
+		dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(mdev, DRBD_FORCE_DETACH);
+	}
+	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
 	spin_unlock_irq(&mdev->req_lock);
+	mod_timer(&mdev->request_timer, nt);
 }
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 68a234a5fdc5..3d2111919486 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -105,6 +105,7 @@ enum drbd_req_event {
 	read_completed_with_error,
 	read_ahead_completed_with_error,
 	write_completed_with_error,
+	abort_disk_io,
 	completed_ok,
 	resend,
 	fail_frozen_disk_io,
@@ -118,18 +119,21 @@ enum drbd_req_event {
  * same time, so we should hold the request lock anyways.
  */
 enum drbd_req_state_bits {
-	/* 210
-	 * 000: no local possible
-	 * 001: to be submitted
+	/* 3210
+	 * 0000: no local possible
+	 * 0001: to be submitted
 	 *    UNUSED, we could map: 011: submitted, completion still pending
-	 * 110: completed ok
-	 * 010: completed with error
+	 * 0110: completed ok
+	 * 0010: completed with error
+	 * 1001: Aborted (before completion)
+	 * 1x10: Aborted and completed -> free
 	 */
 	__RQ_LOCAL_PENDING,
 	__RQ_LOCAL_COMPLETED,
 	__RQ_LOCAL_OK,
+	__RQ_LOCAL_ABORTED,
 
-	/* 76543
+	/* 87654
 	 * 00000: no network possible
 	 * 00001: to be send
 	 * 00011: to be send, on worker queue
@@ -199,8 +203,9 @@ enum drbd_req_state_bits {
 #define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
 #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
 #define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED   (1UL << __RQ_LOCAL_ABORTED)
 
-#define RQ_LOCAL_MASK      ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_ABORTED << 1)-1)
 
 #define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
 #define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d3e6f6213ba..6bce2cc179d4 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -70,11 +70,29 @@ rwlock_t global_state_lock;
 void drbd_md_io_complete(struct bio *bio, int error)
 {
 	struct drbd_md_io *md_io;
+	struct drbd_conf *mdev;
 
 	md_io = (struct drbd_md_io *)bio->bi_private;
+	mdev = container_of(md_io, struct drbd_conf, md_io);
+
 	md_io->error = error;
 
-	complete(&md_io->event);
+	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+	 * to timeout on the lower level device, and eventually detach from it.
+	 * If this io completion runs after that timeout expired, this
+	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
+	 * During normal operation, this only puts that extra reference
+	 * down to 1 again.
+	 * Make sure we first drop the reference, and only then signal
+	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+	 * next drbd_md_sync_page_io(), that we trigger the
+	 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
+	 */
+	drbd_md_put_buffer(mdev);
+	md_io->done = 1;
+	wake_up(&mdev->misc_wait);
+	bio_put(bio);
+	put_ldev(mdev);
 }
 
 /* reads on behalf of the partner,
@@ -93,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
 	if (list_empty(&mdev->read_ee))
 		wake_up(&mdev->ee_wait);
 	if (test_bit(__EE_WAS_ERROR, &e->flags))
-		__drbd_chk_io_error(mdev, false);
+		__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	drbd_queue_work(&mdev->data.work, &e->w);
@@ -136,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
 		: list_empty(&mdev->active_ee);
 
 	if (test_bit(__EE_WAS_ERROR, &e->flags))
-		__drbd_chk_io_error(mdev, false);
+		__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	if (is_syncer_req)
@@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error)
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	__req_mod(req, what, &m);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
+	put_ldev(mdev);
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
@@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
 	sg_init_table(&sg, 1);
 	crypto_hash_init(&desc);
 
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
 		crypto_hash_update(&desc, &sg, sg.length);
 	}
@@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 
 	drbd_start_resync(mdev, C_SYNC_SOURCE);
-	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
 	return 1;
 }
 
@@ -1482,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		return;
 	}
 
-	if (mdev->state.conn < C_AHEAD) {
-		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
-		drbd_rs_cancel_all(mdev);
-		/* This should be done when we abort the resync. We definitely do not
-		   want to have this for connections going back and forth between
-		   Ahead/Behind and SyncSource/SyncTarget */
-	}
-
 	if (side == C_SYNC_TARGET) {
 		/* Since application IO was locked out during C_WF_BITMAP_T and
 		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
@@ -1519,14 +1530,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 	}
 
 	drbd_state_lock(mdev);
-
+	write_lock_irq(&global_state_lock);
 	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		write_unlock_irq(&global_state_lock);
 		drbd_state_unlock(mdev);
 		return;
 	}
 
-	write_lock_irq(&global_state_lock);
-	ns = mdev->state;
+	ns.i = mdev->state.i;
 
 	ns.aftr_isp = !_drbd_may_sync_now(mdev);
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 9955a53733b2..a7d6347aaa79 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -188,10 +188,10 @@ static int print_unex = 1;
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/mod_devicetable.h>
-#include <linux/buffer_head.h>	/* for invalidate_buffers() */
 #include <linux/mutex.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
+#include <linux/async.h>
 
 /*
  * PS/2 floppies have much slower step rates than regular floppies.
@@ -203,7 +203,6 @@ static int slow_floppy;
 
 #include <asm/dma.h>
 #include <asm/irq.h>
-#include <asm/system.h>
 
 static int FLOPPY_IRQ = 6;
 static int FLOPPY_DMA = 2;
@@ -553,7 +552,7 @@ static void floppy_ready(void);
 static void floppy_start(void);
 static void process_fd_request(void);
 static void recalibrate_floppy(void);
-static void floppy_shutdown(unsigned long);
+static void floppy_shutdown(struct work_struct *);
 
 static int floppy_request_regions(int);
 static void floppy_release_regions(int);
@@ -590,6 +589,8 @@ static int buffer_max = -1;
 static struct floppy_fdc_state fdc_state[N_FDC];
 static int fdc;			/* current fdc */
 
+static struct workqueue_struct *floppy_wq;
+
 static struct floppy_struct *_floppy = floppy_type;
 static unsigned char current_drive;
 static long current_count_sectors;
@@ -631,16 +632,15 @@ static inline void set_debugt(void) { }
 static inline void debugt(const char *func, const char *msg) { }
 #endif /* DEBUGT */
 
-typedef void (*timeout_fn)(unsigned long);
-static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0);
 
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
 static const char *timeout_message;
 
 static void is_alive(const char *func, const char *message)
 {
 	/* this routine checks whether the floppy driver is "alive" */
 	if (test_bit(0, &fdc_busy) && command_status < 2 &&
-	    !timer_pending(&fd_timeout)) {
+	    !delayed_work_pending(&fd_timeout)) {
 		DPRINT("%s: timeout handler died.  %s\n", func, message);
 	}
 }
@@ -668,15 +668,19 @@ static int output_log_pos;
 
 static void __reschedule_timeout(int drive, const char *message)
 {
+	unsigned long delay;
+
 	if (drive == current_reqD)
 		drive = current_drive;
-	del_timer(&fd_timeout);
+	__cancel_delayed_work(&fd_timeout);
+
 	if (drive < 0 || drive >= N_DRIVE) {
-		fd_timeout.expires = jiffies + 20UL * HZ;
+		delay = 20UL * HZ;
 		drive = 0;
 	} else
-		fd_timeout.expires = jiffies + UDP->timeout;
-	add_timer(&fd_timeout);
+		delay = UDP->timeout;
+
+	queue_delayed_work(floppy_wq, &fd_timeout, delay);
 	if (UDP->flags & FD_DEBUG)
 		DPRINT("reschedule timeout %s\n", message);
 	timeout_message = message;
@@ -874,7 +878,7 @@ static int lock_fdc(int drive, bool interruptible)
 
 	command_status = FD_COMMAND_NONE;
 
-	__reschedule_timeout(drive, "lock fdc");
+	reschedule_timeout(drive, "lock fdc");
 	set_fdc(drive);
 	return 0;
 }
@@ -882,23 +886,15 @@ static int lock_fdc(int drive, bool interruptible)
 /* unlocks the driver */
 static void unlock_fdc(void)
 {
-	unsigned long flags;
-
-	raw_cmd = NULL;
 	if (!test_bit(0, &fdc_busy))
 		DPRINT("FDC access conflict!\n");
 
-	if (do_floppy)
-		DPRINT("device interrupt still active at FDC release: %pf!\n",
-		       do_floppy);
+	raw_cmd = NULL;
 	command_status = FD_COMMAND_NONE;
-	spin_lock_irqsave(&floppy_lock, flags);
-	del_timer(&fd_timeout);
+	__cancel_delayed_work(&fd_timeout);
+	do_floppy = NULL;
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || set_next_request())
-		do_fd_request(current_req->q);
-	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
 }
 
@@ -970,26 +966,24 @@ static DECLARE_WORK(floppy_work, NULL);
 
 static void schedule_bh(void (*handler)(void))
 {
+	WARN_ON(work_pending(&floppy_work));
+
 	PREPARE_WORK(&floppy_work, (work_func_t)handler);
-	schedule_work(&floppy_work);
+	queue_work(floppy_wq, &floppy_work);
 }
 
-static DEFINE_TIMER(fd_timer, NULL, 0, 0);
+static DECLARE_DELAYED_WORK(fd_timer, NULL);
 
 static void cancel_activity(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_lock, flags);
 	do_floppy = NULL;
-	PREPARE_WORK(&floppy_work, (work_func_t)empty);
-	del_timer(&fd_timer);
-	spin_unlock_irqrestore(&floppy_lock, flags);
+	cancel_delayed_work_sync(&fd_timer);
+	cancel_work_sync(&floppy_work);
 }
 
 /* this function makes sure that the disk stays in the drive during the
  * transfer */
-static void fd_watchdog(void)
+static void fd_watchdog(struct work_struct *arg)
 {
 	debug_dcl(DP->flags, "calling disk change from watchdog\n");
 
@@ -999,21 +993,20 @@ static void fd_watchdog(void)
 		cont->done(0);
 		reset_fdc();
 	} else {
-		del_timer(&fd_timer);
-		fd_timer.function = (timeout_fn)fd_watchdog;
-		fd_timer.expires = jiffies + HZ / 10;
-		add_timer(&fd_timer);
+		cancel_delayed_work(&fd_timer);
+		PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog);
+		queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
 	}
 }
 
 static void main_command_interrupt(void)
 {
-	del_timer(&fd_timer);
+	cancel_delayed_work(&fd_timer);
 	cont->interrupt();
 }
 
 /* waits for a delay (spinup or select) to pass */
-static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
+static int fd_wait_for_completion(unsigned long expires, work_func_t function)
 {
 	if (FDCS->reset) {
 		reset_fdc();	/* do the reset during sleep to win time
@@ -1022,47 +1015,15 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
 		return 1;
 	}
 
-	if (time_before(jiffies, delay)) {
-		del_timer(&fd_timer);
-		fd_timer.function = function;
-		fd_timer.expires = delay;
-		add_timer(&fd_timer);
+	if (time_before(jiffies, expires)) {
+		cancel_delayed_work(&fd_timer);
+		PREPARE_DELAYED_WORK(&fd_timer, function);
+		queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
 		return 1;
 	}
 	return 0;
 }
 
-static DEFINE_SPINLOCK(floppy_hlt_lock);
-static int hlt_disabled;
-static void floppy_disable_hlt(void)
-{
-	unsigned long flags;
-
-	WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (!hlt_disabled) {
-		hlt_disabled = 1;
-#ifdef HAVE_DISABLE_HLT
-		disable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
-static void floppy_enable_hlt(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (hlt_disabled) {
-		hlt_disabled = 0;
-#ifdef HAVE_DISABLE_HLT
-		enable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
 static void setup_DMA(void)
 {
 	unsigned long f;
@@ -1107,7 +1068,6 @@ static void setup_DMA(void)
 	fd_enable_dma();
 	release_dma_lock(f);
 #endif
-	floppy_disable_hlt();
 }
 
 static void show_floppy(void);
@@ -1376,7 +1336,7 @@ static int fdc_dtr(void)
 	 */
 	FDCS->dtr = raw_cmd->rate & 3;
 	return fd_wait_for_completion(jiffies + 2UL * HZ / 100,
-				      (timeout_fn)floppy_ready);
+				      (work_func_t)floppy_ready);
 }				/* fdc_dtr */
 
 static void tell_sector(void)
@@ -1481,7 +1441,7 @@ static void setup_rw_floppy(void)
 	int flags;
 	int dflags;
 	unsigned long ready_date;
-	timeout_fn function;
+	work_func_t function;
 
 	flags = raw_cmd->flags;
 	if (flags & (FD_RAW_READ | FD_RAW_WRITE))
@@ -1495,9 +1455,9 @@ static void setup_rw_floppy(void)
 		 */
 		if (time_after(ready_date, jiffies + DP->select_delay)) {
 			ready_date -= DP->select_delay;
-			function = (timeout_fn)floppy_start;
+			function = (work_func_t)floppy_start;
 		} else
-			function = (timeout_fn)setup_rw_floppy;
+			function = (work_func_t)setup_rw_floppy;
 
 		/* wait until the floppy is spinning fast enough */
 		if (fd_wait_for_completion(ready_date, function))
@@ -1527,7 +1487,7 @@ static void setup_rw_floppy(void)
 		inr = result();
 		cont->interrupt();
 	} else if (flags & FD_RAW_NEED_DISK)
-		fd_watchdog();
+		fd_watchdog(NULL);
 }
 
 static int blind_seek;
@@ -1709,7 +1669,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 	fd_disable_dma();
 	release_dma_lock(f);
 
-	floppy_enable_hlt();
 	do_floppy = NULL;
 	if (fdc >= N_FDC || FDCS->address == -1) {
 		/* we don't even know which FDC is the culprit */
@@ -1837,20 +1796,22 @@ static void show_floppy(void)
 		pr_info("do_floppy=%pf\n", do_floppy);
 	if (work_pending(&floppy_work))
 		pr_info("floppy_work.func=%pf\n", floppy_work.func);
-	if (timer_pending(&fd_timer))
-		pr_info("fd_timer.function=%pf\n", fd_timer.function);
-	if (timer_pending(&fd_timeout)) {
-		pr_info("timer_function=%pf\n", fd_timeout.function);
-		pr_info("expires=%lu\n", fd_timeout.expires - jiffies);
-		pr_info("now=%lu\n", jiffies);
-	}
+	if (delayed_work_pending(&fd_timer))
+		pr_info("delayed work.function=%p expires=%ld\n",
+		       fd_timer.work.func,
+		       fd_timer.timer.expires - jiffies);
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("timer_function=%p expires=%ld\n",
+		       fd_timeout.work.func,
+		       fd_timeout.timer.expires - jiffies);
+
 	pr_info("cont=%p\n", cont);
 	pr_info("current_req=%p\n", current_req);
 	pr_info("command_status=%d\n", command_status);
 	pr_info("\n");
 }
 
-static void floppy_shutdown(unsigned long data)
+static void floppy_shutdown(struct work_struct *arg)
 {
 	unsigned long flags;
 
@@ -1858,8 +1819,6 @@ static void floppy_shutdown(unsigned long data)
 		show_floppy();
 	cancel_activity();
 
-	floppy_enable_hlt();
-
 	flags = claim_dma_lock();
 	fd_disable_dma();
 	release_dma_lock(flags);
@@ -1905,7 +1864,7 @@ static int start_motor(void (*function)(void))
 
 	/* wait_for_completion also schedules reset if needed. */
 	return fd_wait_for_completion(DRS->select_date + DP->select_delay,
-				      (timeout_fn)function);
+				      (work_func_t)function);
 }
 
 static void floppy_ready(void)
@@ -2558,8 +2517,7 @@ static int make_raw_rw_request(void)
 	set_fdc((long)current_req->rq_disk->private_data);
 
 	raw_cmd = &default_raw_cmd;
-	raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_DISK |
-	    FD_RAW_NEED_SEEK;
+	raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK;
 	raw_cmd->cmd_count = NR_RW;
 	if (rq_data_dir(current_req) == READ) {
 		raw_cmd->flags |= FD_RAW_READ;
@@ -2858,7 +2816,6 @@ do_request:
 		spin_lock_irq(&floppy_lock);
 		pending = set_next_request();
 		spin_unlock_irq(&floppy_lock);
-
 		if (!pending) {
 			do_floppy = NULL;
 			unlock_fdc();
@@ -2935,13 +2892,15 @@ static void do_fd_request(struct request_queue *q)
 		 current_req->cmd_flags))
 		return;
 
-	if (test_bit(0, &fdc_busy)) {
+	if (test_and_set_bit(0, &fdc_busy)) {
 		/* fdc busy, this new request will be treated when the
 		   current one is done */
 		is_alive(__func__, "old request running");
 		return;
 	}
-	lock_fdc(MAXTIMEOUT, false);
+	command_status = FD_COMMAND_NONE;
+	__reschedule_timeout(MAXTIMEOUT, "fd_request");
+	set_fdc(0);
 	process_fd_request();
 	is_alive(__func__, "");
 }
@@ -3649,9 +3608,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 
 	mutex_lock(&floppy_mutex);
 	mutex_lock(&open_lock);
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else if (!UDRS->fd_ref--) {
+	if (!UDRS->fd_ref--) {
 		DPRINT("floppy_release with fd_ref == 0");
 		UDRS->fd_ref = 0;
 	}
@@ -3687,13 +3644,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 		set_bit(FD_VERIFY_BIT, &UDRS->flags);
 	}
 
-	if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL)))
-		goto out2;
-
-	if (mode & FMODE_EXCL)
-		UDRS->fd_ref = -1;
-	else
-		UDRS->fd_ref++;
+	UDRS->fd_ref++;
 
 	opened_bdev[drive] = bdev;
 
@@ -3756,10 +3707,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	mutex_unlock(&floppy_mutex);
 	return 0;
 out:
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else
-		UDRS->fd_ref--;
+	UDRS->fd_ref--;
+
 	if (!UDRS->fd_ref)
 		opened_bdev[drive] = NULL;
 out2:
@@ -3833,7 +3782,7 @@ static int __floppy_read_block_0(struct block_device *bdev)
 	bio.bi_size = size;
 	bio.bi_bdev = bdev;
 	bio.bi_sector = 0;
-	bio.bi_flags = BIO_QUIET;
+	bio.bi_flags = (1 << BIO_QUIET);
 	init_completion(&complete);
 	bio.bi_private = &complete;
 	bio.bi_end_io = floppy_rb0_complete;
@@ -4174,7 +4123,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	return get_disk(disks[drive]);
 }
 
-static int __init floppy_init(void)
+static int __init do_floppy_init(void)
 {
 	int i, unit, drive;
 	int err, dr;
@@ -4196,10 +4145,16 @@ static int __init floppy_init(void)
 			goto out_put_disk;
 		}
 
+		floppy_wq = alloc_ordered_workqueue("floppy", 0);
+		if (!floppy_wq) {
+			err = -ENOMEM;
+			goto out_put_disk;
+		}
+
 		disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
 		if (!disks[dr]->queue) {
 			err = -ENOMEM;
-			goto out_put_disk;
+			goto out_destroy_workq;
 		}
 
 		blk_queue_max_hw_sectors(disks[dr]->queue, 64);
@@ -4250,7 +4205,7 @@ static int __init floppy_init(void)
 	use_virtual_dma = can_use_virtual_dma & 1;
 	fdc_state[0].address = FDC1;
 	if (fdc_state[0].address == -1) {
-		del_timer_sync(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -ENODEV;
 		goto out_unreg_region;
 	}
@@ -4261,7 +4216,7 @@ static int __init floppy_init(void)
 	fdc = 0;		/* reset fdc in case of unexpected interrupt */
 	err = floppy_grab_irq_and_dma();
 	if (err) {
-		del_timer_sync(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -EBUSY;
 		goto out_unreg_region;
 	}
@@ -4318,13 +4273,13 @@ static int __init floppy_init(void)
 		user_reset_fdc(-1, FD_RESET_ALWAYS, false);
 	}
 	fdc = 0;
-	del_timer_sync(&fd_timeout);
+	cancel_delayed_work(&fd_timeout);
 	current_drive = 0;
 	initialized = true;
 	if (have_no_fdc) {
 		DPRINT("no floppy controllers found\n");
 		err = have_no_fdc;
-		goto out_flush_work;
+		goto out_release_dma;
 	}
 
 	for (drive = 0; drive < N_DRIVE; drive++) {
@@ -4339,7 +4294,7 @@ static int __init floppy_init(void)
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
-			goto out_flush_work;
+			goto out_release_dma;
 
 		err = device_create_file(&floppy_device[drive].dev,
 					 &dev_attr_cmos);
@@ -4357,25 +4312,50 @@ static int __init floppy_init(void)
 
 out_unreg_platform_dev:
 	platform_device_unregister(&floppy_device[drive]);
-out_flush_work:
-	flush_work_sync(&floppy_work);
+out_release_dma:
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	platform_driver_unregister(&floppy_driver);
+out_destroy_workq:
+	destroy_workqueue(floppy_wq);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
 	while (dr--) {
 		del_timer_sync(&motor_off_timer[dr]);
-		if (disks[dr]->queue)
+		if (disks[dr]->queue) {
 			blk_cleanup_queue(disks[dr]->queue);
+			/*
+			 * put_disk() is not paired with add_disk() and
+			 * will put queue reference one extra time. fix it.
+			 */
+			disks[dr]->queue = NULL;
+		}
 		put_disk(disks[dr]);
 	}
 	return err;
 }
 
+#ifndef MODULE
+static __init void floppy_async_init(void *data, async_cookie_t cookie)
+{
+	do_floppy_init();
+}
+#endif
+
+static int __init floppy_init(void)
+{
+#ifdef MODULE
+	return do_floppy_init();
+#else
+	/* Don't hold up the bootup by the floppy initialization */
+	async_schedule(floppy_async_init, NULL);
+	return 0;
+#endif
+}
+
 static const struct io_region {
 	int offset;
 	int size;
@@ -4428,7 +4408,7 @@ static int floppy_grab_irq_and_dma(void)
 	 * We might have scheduled a free_irq(), wait it to
 	 * drain first:
 	 */
-	flush_work_sync(&floppy_work);
+	flush_workqueue(floppy_wq);
 
 	if (fd_request_irq()) {
 		DPRINT("Unable to grab IRQ%d for the floppy driver\n",
@@ -4504,7 +4484,6 @@ static void floppy_release_irq_and_dma(void)
 #if N_FDC > 1
 	set_dor(1, ~8, 0);
 #endif
-	floppy_enable_hlt();
 
 	if (floppy_track_buffer && max_buffer_sectors) {
 		tmpsize = max_buffer_sectors * 1024;
@@ -4520,9 +4499,9 @@ static void floppy_release_irq_and_dma(void)
 			pr_info("motor off timer %d still active\n", drive);
 #endif
 
-	if (timer_pending(&fd_timeout))
+	if (delayed_work_pending(&fd_timeout))
 		pr_info("floppy timer still active:%s\n", timeout_message);
-	if (timer_pending(&fd_timer))
+	if (delayed_work_pending(&fd_timer))
 		pr_info("auxiliary floppy timer still active\n");
 	if (work_pending(&floppy_work))
 		pr_info("work still pending\n");
@@ -4580,11 +4559,21 @@ static void __exit floppy_module_exit(void)
 			platform_device_unregister(&floppy_device[drive]);
 		}
 		blk_cleanup_queue(disks[drive]->queue);
+
+		/*
+		 * These disks have not called add_disk().  Don't put down
+		 * queue reference in put_disk().
+		 */
+		if (!(allowed_drive_mask & (1 << drive)) ||
+		    fdc_state[FDC(drive)].version == FDC_NONE)
+			disks[drive]->queue = NULL;
+
 		put_disk(disks[drive]);
 	}
 
-	del_timer_sync(&fd_timeout);
-	del_timer_sync(&fd_timer);
+	cancel_delayed_work_sync(&fd_timeout);
+	cancel_delayed_work_sync(&fd_timer);
+	destroy_workqueue(floppy_wq);
 
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index b52c9ca146fc..bf397bf108b7 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -44,7 +44,6 @@
 #define HD_IRQ 14
 
 #define REALLY_SLOW_IO
-#include <asm/system.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4720c7ade0ae..3bba65510d23 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -69,13 +69,14 @@
 #include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>		/* for invalidate_bdev() */
 #include <linux/completion.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
+#include <linux/falloc.h>
+
 #include <asm/uaccess.h>
 
 static DEFINE_IDR(loop_index_idr);
@@ -92,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd,
 			 struct page *loop_page, unsigned loop_off,
 			 int size, sector_t real_block)
 {
-	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
-	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+	char *raw_buf = kmap_atomic(raw_page) + raw_off;
+	char *loop_buf = kmap_atomic(loop_page) + loop_off;
 
 	if (cmd == READ)
 		memcpy(loop_buf, raw_buf, size);
 	else
 		memcpy(raw_buf, loop_buf, size);
 
-	kunmap_atomic(loop_buf, KM_USER1);
-	kunmap_atomic(raw_buf, KM_USER0);
+	kunmap_atomic(loop_buf);
+	kunmap_atomic(raw_buf);
 	cond_resched();
 	return 0;
 }
@@ -111,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 			struct page *loop_page, unsigned loop_off,
 			int size, sector_t real_block)
 {
-	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
-	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+	char *raw_buf = kmap_atomic(raw_page) + raw_off;
+	char *loop_buf = kmap_atomic(loop_page) + loop_off;
 	char *in, *out, *key;
 	int i, keysize;
 
@@ -129,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 	for (i = 0; i < size; i++)
 		*out++ = *in++ ^ key[(i & 511) % keysize];
 
-	kunmap_atomic(loop_buf, KM_USER1);
-	kunmap_atomic(raw_buf, KM_USER0);
+	kunmap_atomic(loop_buf);
+	kunmap_atomic(raw_buf);
 	cond_resched();
 	return 0;
 }
@@ -159,17 +160,19 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
 	&xor_funcs
 };
 
-static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
 {
-	loff_t size, offset, loopsize;
+	loff_t size, loopsize;
 
 	/* Compute loopsize in bytes */
 	size = i_size_read(file->f_mapping->host);
-	offset = lo->lo_offset;
 	loopsize = size - offset;
-	if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
-		loopsize = lo->lo_sizelimit;
+	/* offset is beyond i_size, wierd but possible */
+	if (loopsize < 0)
+		return 0;
 
+	if (sizelimit > 0 && sizelimit < loopsize)
+		loopsize = sizelimit;
 	/*
 	 * Unfortunately, if we want to do I/O on the device,
 	 * the number of 512-byte sectors has to fit into a sector_t.
@@ -177,17 +180,25 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
 	return loopsize >> 9;
 }
 
+static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+{
+	return get_size(lo->lo_offset, lo->lo_sizelimit, file);
+}
+
 static int
-figure_loop_size(struct loop_device *lo)
+figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
 {
-	loff_t size = get_loop_size(lo, lo->lo_backing_file);
+	loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
 	sector_t x = (sector_t)size;
 
 	if (unlikely((loff_t)x != size))
 		return -EFBIG;
-
+	if (lo->lo_offset != offset)
+		lo->lo_offset = offset;
+	if (lo->lo_sizelimit != sizelimit)
+		lo->lo_sizelimit = sizelimit;
 	set_capacity(lo->lo_disk, x);
-	return 0;					
+	return 0;
 }
 
 static inline int
@@ -203,74 +214,6 @@ lo_do_transfer(struct loop_device *lo, int cmd,
 }
 
 /**
- * do_lo_send_aops - helper for writing data to a loop device
- *
- * This is the fast version for backing filesystems which implement the address
- * space operations write_begin and write_end.
- */
-static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		loff_t pos, struct page *unused)
-{
-	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
-	struct address_space *mapping = file->f_mapping;
-	pgoff_t index;
-	unsigned offset, bv_offs;
-	int len, ret;
-
-	mutex_lock(&mapping->host->i_mutex);
-	index = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
-	bv_offs = bvec->bv_offset;
-	len = bvec->bv_len;
-	while (len > 0) {
-		sector_t IV;
-		unsigned size, copied;
-		int transfer_result;
-		struct page *page;
-		void *fsdata;
-
-		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
-		size = PAGE_CACHE_SIZE - offset;
-		if (size > len)
-			size = len;
-
-		ret = pagecache_write_begin(file, mapping, pos, size, 0,
-							&page, &fsdata);
-		if (ret)
-			goto fail;
-
-		file_update_time(file);
-
-		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
-				bvec->bv_page, bv_offs, size, IV);
-		copied = size;
-		if (unlikely(transfer_result))
-			copied = 0;
-
-		ret = pagecache_write_end(file, mapping, pos, size, copied,
-							page, fsdata);
-		if (ret < 0 || ret != copied)
-			goto fail;
-
-		if (unlikely(transfer_result))
-			goto fail;
-
-		bv_offs += copied;
-		len -= copied;
-		offset = 0;
-		index++;
-		pos += copied;
-	}
-	ret = 0;
-out:
-	mutex_unlock(&mapping->host->i_mutex);
-	return ret;
-fail:
-	ret = -1;
-	goto out;
-}
-
-/**
  * __do_lo_send_write - helper for writing data to a loop device
  *
  * This helper just factors out common code between do_lo_send_direct_write()
@@ -297,10 +240,8 @@ static int __do_lo_send_write(struct file *file,
 /**
  * do_lo_send_direct_write - helper for writing data to a loop device
  *
- * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations write_begin and write_end.
- * It uses the write file operation which should be present on all writeable
- * filesystems.
+ * This is the fast, non-transforming version that does not need double
+ * buffering.
  */
 static int do_lo_send_direct_write(struct loop_device *lo,
 		struct bio_vec *bvec, loff_t pos, struct page *page)
@@ -316,15 +257,9 @@ static int do_lo_send_direct_write(struct loop_device *lo,
 /**
  * do_lo_send_write - helper for writing data to a loop device
  *
- * This is the slow, transforming version for filesystems which do not
- * implement the address space operations write_begin and write_end.  It
- * uses the write file operation which should be present on all writeable
- * filesystems.
- *
- * Using fops->write is slower than using aops->{prepare,commit}_write in the
- * transforming case because we need to double buffer the data as we cannot do
- * the transformations in place as we do not have direct access to the
- * destination pages of the backing file.
+ * This is the slow, transforming version that needs to double buffer the
+ * data as it cannot do the transformations in place without having direct
+ * access to the destination pages of the backing file.
  */
 static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
 		loff_t pos, struct page *page)
@@ -350,17 +285,16 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
 	struct page *page = NULL;
 	int i, ret = 0;
 
-	do_lo_send = do_lo_send_aops;
-	if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+	if (lo->transfer != transfer_none) {
+		page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+		if (unlikely(!page))
+			goto fail;
+		kmap(page);
+		do_lo_send = do_lo_send_write;
+	} else {
 		do_lo_send = do_lo_send_direct_write;
-		if (lo->transfer != transfer_none) {
-			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
-			if (unlikely(!page))
-				goto fail;
-			kmap(page);
-			do_lo_send = do_lo_send_write;
-		}
 	}
+
 	bio_for_each_segment(bvec, bio, i) {
 		ret = do_lo_send(lo, bvec, pos, page);
 		if (ret < 0)
@@ -422,14 +356,14 @@ lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
 	return __splice_from_pipe(pipe, sd, lo_splice_actor);
 }
 
-static int
+static ssize_t
 do_lo_receive(struct loop_device *lo,
 	      struct bio_vec *bvec, int bsize, loff_t pos)
 {
 	struct lo_read_data cookie;
 	struct splice_desc sd;
 	struct file *file;
-	long retval;
+	ssize_t retval;
 
 	cookie.lo = lo;
 	cookie.page = bvec->bv_page;
@@ -445,25 +379,28 @@ do_lo_receive(struct loop_device *lo,
 	file = lo->lo_backing_file;
 	retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 static int
 lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
 {
 	struct bio_vec *bvec;
-	int i, ret = 0;
+	ssize_t s;
+	int i;
 
 	bio_for_each_segment(bvec, bio, i) {
-		ret = do_lo_receive(lo, bvec, bsize, pos);
-		if (ret < 0)
+		s = do_lo_receive(lo, bvec, bsize, pos);
+		if (s < 0)
+			return s;
+
+		if (s != bvec->bv_len) {
+			zero_fill_bio(bio);
 			break;
+		}
 		pos += bvec->bv_len;
 	}
-	return ret;
+	return 0;
 }
 
 static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
@@ -484,6 +421,29 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 			}
 		}
 
+		/*
+		 * We use punch hole to reclaim the free space used by the
+		 * image a.k.a. discard. However we do not support discard if
+		 * encryption is enabled, because it may give an attacker
+		 * useful information.
+		 */
+		if (bio->bi_rw & REQ_DISCARD) {
+			struct file *file = lo->lo_backing_file;
+			int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+			if ((!file->f_op->fallocate) ||
+			    lo->lo_encrypt_key_size) {
+				ret = -EOPNOTSUPP;
+				goto out;
+			}
+			ret = file->f_op->fallocate(file, mode, pos,
+						    bio->bi_size);
+			if (unlikely(ret && ret != -EINVAL &&
+				     ret != -EOPNOTSUPP))
+				ret = -EIO;
+			goto out;
+		}
+
 		ret = lo_send(lo, bio, pos);
 
 		if ((bio->bi_rw & REQ_FUA) && !ret) {
@@ -514,7 +474,7 @@ static struct bio *loop_get_bio(struct loop_device *lo)
 	return bio_list_pop(&lo->lo_bio_list);
 }
 
-static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+static void loop_make_request(struct request_queue *q, struct bio *old_bio)
 {
 	struct loop_device *lo = q->queuedata;
 	int rw = bio_rw(old_bio);
@@ -532,12 +492,11 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio)
 	loop_add_bio(lo, old_bio);
 	wake_up(&lo->lo_event);
 	spin_unlock_irq(&lo->lo_lock);
-	return 0;
+	return;
 
 out:
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio);
-	return 0;
 }
 
 struct switch_request {
@@ -700,7 +659,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 		goto out_putf;
 
 	fput(old_file);
-	if (max_part > 0)
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 		ioctl_by_bdev(bdev, BLKRRPART, 0);
 	return 0;
 
@@ -777,16 +736,25 @@ static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
 	return sprintf(buf, "%s\n", autoclear ? "1" : "0");
 }
 
+static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
+{
+	int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);
+
+	return sprintf(buf, "%s\n", partscan ? "1" : "0");
+}
+
 LOOP_ATTR_RO(backing_file);
 LOOP_ATTR_RO(offset);
 LOOP_ATTR_RO(sizelimit);
 LOOP_ATTR_RO(autoclear);
+LOOP_ATTR_RO(partscan);
 
 static struct attribute *loop_attrs[] = {
 	&loop_attr_backing_file.attr,
 	&loop_attr_offset.attr,
 	&loop_attr_sizelimit.attr,
 	&loop_attr_autoclear.attr,
+	&loop_attr_partscan.attr,
 	NULL,
 };
 
@@ -807,6 +775,35 @@ static void loop_sysfs_exit(struct loop_device *lo)
 			   &loop_attribute_group);
 }
 
+static void loop_config_discard(struct loop_device *lo)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *inode = file->f_mapping->host;
+	struct request_queue *q = lo->lo_queue;
+
+	/*
+	 * We use punch hole to reclaim the free space used by the
+	 * image a.k.a. discard. However we do support discard if
+	 * encryption is enabled, because it may give an attacker
+	 * useful information.
+	 */
+	if ((!file->f_op->fallocate) ||
+	    lo->lo_encrypt_key_size) {
+		q->limits.discard_granularity = 0;
+		q->limits.discard_alignment = 0;
+		q->limits.max_discard_sectors = 0;
+		q->limits.discard_zeroes_data = 0;
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+		return;
+	}
+
+	q->limits.discard_granularity = inode->i_sb->s_blocksize;
+	q->limits.discard_alignment = 0;
+	q->limits.max_discard_sectors = UINT_MAX >> 9;
+	q->limits.discard_zeroes_data = 1;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+}
+
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		       struct block_device *bdev, unsigned int arg)
 {
@@ -849,35 +846,23 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	mapping = file->f_mapping;
 	inode = mapping->host;
 
-	if (!(file->f_mode & FMODE_WRITE))
-		lo_flags |= LO_FLAGS_READ_ONLY;
-
 	error = -EINVAL;
-	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		const struct address_space_operations *aops = mapping->a_ops;
-
-		if (aops->write_begin)
-			lo_flags |= LO_FLAGS_USE_AOPS;
-		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
-			lo_flags |= LO_FLAGS_READ_ONLY;
+	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		goto out_putf;
 
-		lo_blocksize = S_ISBLK(inode->i_mode) ?
-			inode->i_bdev->bd_block_size : PAGE_SIZE;
+	if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
+	    !file->f_op->write)
+		lo_flags |= LO_FLAGS_READ_ONLY;
 
-		error = 0;
-	} else {
-		goto out_putf;
-	}
+	lo_blocksize = S_ISBLK(inode->i_mode) ?
+		inode->i_bdev->bd_block_size : PAGE_SIZE;
 
+	error = -EFBIG;
 	size = get_loop_size(lo, file);
-
-	if ((loff_t)(sector_t)size != size) {
-		error = -EFBIG;
+	if ((loff_t)(sector_t)size != size)
 		goto out_putf;
-	}
 
-	if (!(mode & FMODE_WRITE))
-		lo_flags |= LO_FLAGS_READ_ONLY;
+	error = 0;
 
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
@@ -919,7 +904,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	}
 	lo->lo_state = Lo_bound;
 	wake_up_process(lo->lo_thread);
-	if (max_part > 0)
+	if (part_shift)
+		lo->lo_flags |= LO_FLAGS_PARTSCAN;
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 		ioctl_by_bdev(bdev, BLKRRPART, 0);
 	return 0;
 
@@ -980,10 +967,11 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
 	return err;
 }
 
-static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
+static int loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp = lo->lo_backing_file;
 	gfp_t gfp = lo->old_gfp_mask;
+	struct block_device *bdev = lo->lo_device;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1012,7 +1000,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_offset = 0;
 	lo->lo_sizelimit = 0;
 	lo->lo_encrypt_key_size = 0;
-	lo->lo_flags = 0;
 	lo->lo_thread = NULL;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
@@ -1030,8 +1017,11 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_state = Lo_unbound;
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
-	if (max_part > 0 && bdev)
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
 		ioctl_by_bdev(bdev, BLKRRPART, 0);
+	lo->lo_flags = 0;
+	if (!part_shift)
+		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 	mutex_unlock(&lo->lo_ctl_mutex);
 	/*
 	 * Need not hold lo_ctl_mutex to fput backing file.
@@ -1080,11 +1070,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
-		lo->lo_offset = info->lo_offset;
-		lo->lo_sizelimit = info->lo_sizelimit;
-		if (figure_loop_size(lo))
+		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit))
 			return -EFBIG;
 	}
+	loop_config_discard(lo);
 
 	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
 	memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
@@ -1100,6 +1089,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	     (info->lo_flags & LO_FLAGS_AUTOCLEAR))
 		lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
 
+	if ((info->lo_flags & LO_FLAGS_PARTSCAN) &&
+	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
+		lo->lo_flags |= LO_FLAGS_PARTSCAN;
+		lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
+		ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+	}
+
 	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
 	lo->lo_init[0] = info->lo_init[0];
 	lo->lo_init[1] = info->lo_init[1];
@@ -1260,7 +1256,7 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
 	err = -ENXIO;
 	if (unlikely(lo->lo_state != Lo_bound))
 		goto out;
-	err = figure_loop_size(lo);
+	err = figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
 	if (unlikely(err))
 		goto out;
 	sec = get_capacity(lo->lo_disk);
@@ -1293,18 +1289,24 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 	case LOOP_CLR_FD:
 		/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
-		err = loop_clr_fd(lo, bdev);
+		err = loop_clr_fd(lo);
 		if (!err)
 			goto out_unlocked;
 		break;
 	case LOOP_SET_STATUS:
-		err = loop_set_status_old(lo, (struct loop_info __user *) arg);
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_status_old(lo,
+					(struct loop_info __user *)arg);
 		break;
 	case LOOP_GET_STATUS:
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
 		break;
 	case LOOP_SET_STATUS64:
-		err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_status64(lo,
+					(struct loop_info64 __user *) arg);
 		break;
 	case LOOP_GET_STATUS64:
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
@@ -1513,7 +1515,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		err = loop_clr_fd(lo, NULL);
+		err = loop_clr_fd(lo);
 		if (!err)
 			goto out_unlocked;
 	} else {
@@ -1595,14 +1597,12 @@ static int loop_add(struct loop_device **l, int i)
 	struct gendisk *disk;
 	int err;
 
+	err = -ENOMEM;
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
-	if (!lo) {
-		err = -ENOMEM;
+	if (!lo)
 		goto out;
-	}
 
-	err = idr_pre_get(&loop_index_idr, GFP_KERNEL);
-	if (err < 0)
+	if (!idr_pre_get(&loop_index_idr, GFP_KERNEL))
 		goto out_free_dev;
 
 	if (i >= 0) {
@@ -1635,6 +1635,27 @@ static int loop_add(struct loop_device **l, int i)
 	if (!disk)
 		goto out_free_queue;
 
+	/*
+	 * Disable partition scanning by default. The in-kernel partition
+	 * scanning can be requested individually per-device during its
+	 * setup. Userspace can always add and remove partitions from all
+	 * devices. The needed partition minors are allocated from the
+	 * extended minor space, the main loop device numbers will continue
+	 * to match the loop minors, regardless of the number of partitions
+	 * used.
+	 *
+	 * If max_part is given, partition scanning is globally enabled for
+	 * all loop devices. The minors for the main loop devices will be
+	 * multiples of max_part.
+	 *
+	 * Note: Global-for-all-devices, set-only-at-init, read-only module
+	 * parameteters like 'max_loop' and 'max_part' make things needlessly
+	 * complicated, are too static, inflexible and may surprise
+	 * userspace tools. Parameters like this in general should be avoided.
+	 */
+	if (!part_shift)
+		disk->flags |= GENHD_FL_NO_PART_SCAN;
+	disk->flags |= GENHD_FL_EXT_DEVT;
 	mutex_init(&lo->lo_ctl_mutex);
 	lo->lo_number		= i;
 	lo->lo_thread		= NULL;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 76fa3deaee84..1788f491e0fb 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -780,9 +780,9 @@ static const struct block_device_operations mg_disk_ops = {
 	.getgeo = mg_getgeo
 };
 
-static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
+static int mg_suspend(struct device *dev)
 {
-	struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+	struct mg_drv_data *prv_data = dev->platform_data;
 	struct mg_host *host = prv_data->host;
 
 	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
@@ -804,9 +804,9 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
 	return 0;
 }
 
-static int mg_resume(struct platform_device *plat_dev)
+static int mg_resume(struct device *dev)
 {
-	struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+	struct mg_drv_data *prv_data = dev->platform_data;
 	struct mg_host *host = prv_data->host;
 
 	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
@@ -825,6 +825,8 @@ static int mg_resume(struct platform_device *plat_dev)
 	return 0;
 }
 
+static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
+
 static int mg_probe(struct platform_device *plat_dev)
 {
 	struct mg_host *host;
@@ -1074,11 +1076,10 @@ static int mg_remove(struct platform_device *plat_dev)
 static struct platform_driver mg_disk_driver = {
 	.probe = mg_probe,
 	.remove = mg_remove,
-	.suspend = mg_suspend,
-	.resume = mg_resume,
 	.driver = {
 		.name = MG_DEV_NAME,
 		.owner = THIS_MODULE,
+		.pm = &mg_pm,
 	}
 };
 
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 000000000000..0ba837fc62a8
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,9 @@
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+	tristate "Block Device Driver for Micron PCIe SSDs"
+	depends on PCI
+	help
+          This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 000000000000..4fbef8c8329b
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for  Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 000000000000..f946d31d6917
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,4266 @@
+/*
+ * Driver for the Micron P320 SSD
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32)
+#define HW_CMD_TBL_SZ		(AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
+#define HW_CMD_TBL_AR_SZ	(HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
+#define HW_PORT_PRIV_DMA_SZ \
+		(HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
+
+#define HOST_CAP_NZDMA		(1 << 19)
+#define HOST_HSORG		0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV		0xFF00
+#define HSORG_STYLE		0x8
+#define HSORG_SLOTGROUPS	0x7
+
+#define PORT_COMMAND_ISSUE	0x38
+#define PORT_SDBV		0x7C
+
+#define PORT_OFFSET		0x100
+#define PORT_MEM_SIZE		0x80
+
+#define PORT_IRQ_ERR \
+	(PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+	 PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+	 PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+	(PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+	(PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+	 PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+	(PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN	0x00
+#define MTIP_PRODUCT_ASICFPGA	0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+static struct dentry *dfs_parent;
+
+static DEFINE_SPINLOCK(rssd_index_lock);
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+	__u8		io_ports[8];
+	__u8		hob_ports[8];
+	ide_reg_valid_t	out_flags;
+	ide_reg_valid_t	in_flags;
+	int		data_phase;
+	int		req_cmd;
+	compat_ulong_t	out_size;
+	compat_ulong_t	in_size;
+};
+#endif
+
+/*
+ * This function check_for_surprise_removal is called
+ * while card is removed from the system and it will
+ * read the vendor id from the configration space
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ *	 true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct pci_dev *pdev)
+{
+	u16 vendor_id = 0;
+
+       /* Read the vendorID from the configuration space */
+	pci_read_config_word(pdev, 0x00, &vendor_id);
+	if (vendor_id == 0xFFFF)
+		return true; /* device removed */
+
+	return false; /* device present */
+}
+
+/*
+ * This function is called for clean the pending command in the
+ * command slot during the surprise removal of device and return
+ * error to the upper layer.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_command_cleanup(struct driver_data *dd)
+{
+	int group = 0, commandslot = 0, commandindex = 0;
+	struct mtip_cmd *command;
+	struct mtip_port *port = dd->port;
+	static int in_progress;
+
+	if (in_progress)
+		return;
+
+	in_progress = 1;
+
+	for (group = 0; group < 4; group++) {
+		for (commandslot = 0; commandslot < 32; commandslot++) {
+			if (!(port->allocated[group] & (1 << commandslot)))
+				continue;
+
+			commandindex = group << 5 | commandslot;
+			command = &port->commands[commandindex];
+
+			if (atomic_read(&command->active)
+			    && (command->async_callback)) {
+				command->async_callback(command->async_data,
+					-ENODEV);
+				command->async_callback = NULL;
+				command->async_data = NULL;
+			}
+
+			dma_unmap_sg(&port->dd->pdev->dev,
+				command->sg,
+				command->scatter_ents,
+				command->direction);
+		}
+	}
+
+	up(&port->cmd_slot);
+
+	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+	in_progress = 0;
+}
+
+/*
+ * Obtain an empty command slot.
+ *
+ * This function needs to be reentrant since it could be called
+ * at the same time on multiple CPUs. The allocation of the
+ * command slot must be atomic.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	>= 0	Index of command slot obtained.
+ *	-1	No command slots available.
+ */
+static int get_slot(struct mtip_port *port)
+{
+	int slot, i;
+	unsigned int num_command_slots = port->dd->slot_groups * 32;
+
+	/*
+	 * Try 10 times, because there is a small race here.
+	 *  that's ok, because it's still cheaper than a lock.
+	 *
+	 * Race: Since this section is not protected by lock, same bit
+	 * could be chosen by different process contexts running in
+	 * different processor. So instead of costly lock, we are going
+	 * with loop.
+	 */
+	for (i = 0; i < 10; i++) {
+		slot = find_next_zero_bit(port->allocated,
+					 num_command_slots, 1);
+		if ((slot < num_command_slots) &&
+		    (!test_and_set_bit(slot, port->allocated)))
+			return slot;
+	}
+	dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
+
+	if (mtip_check_surprise_removal(port->dd->pdev)) {
+		/* Device not present, clean outstanding commands */
+		mtip_command_cleanup(port->dd);
+	}
+	return -1;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @port Pointer to the port data structure.
+ * @tag  Tag of command to release
+ *
+ * return value
+ *	None
+ */
+static inline void release_slot(struct mtip_port *port, int tag)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(tag, port->allocated);
+	smp_mb__after_clear_bit();
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * Just like hba_reset, except does not call sleep, so can be
+ * run from interrupt/tasklet context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	The reset was successful.
+ *	-1	The HBA Reset bit did not clear.
+ */
+static int hba_reset_nosleep(struct driver_data *dd)
+{
+	unsigned long timeout;
+
+	/* Chip quirk: quiesce any chip function */
+	mdelay(10);
+
+	/* Set the reset bit */
+	writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+	/* Flush */
+	readl(dd->mmio + HOST_CTL);
+
+	/*
+	 * Wait 10ms then spin for up to 1 second
+	 * waiting for reset acknowledgement
+	 */
+	timeout = jiffies + msecs_to_jiffies(1000);
+	mdelay(10);
+	while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		 && time_before(jiffies, timeout))
+		mdelay(1);
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+		return -1;
+
+	if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag  The tag of the command to be issued.
+ *
+ * return value
+ *      None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+	atomic_set(&port->commands[tag].active, 1);
+
+	spin_lock(&port->cmd_issue_lock);
+
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->s_active[MTIP_TAG_INDEX(tag)]);
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+
+	spin_unlock(&port->cmd_issue_lock);
+
+	/* Set the command's timeout value.*/
+	port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
+					MTIP_NCQ_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Enable/disable the reception of FIS
+ *
+ * @port   Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ *	Previous state: 1 enabled, 0 disabled
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+	u32 tmp;
+
+	/* enable FIS reception */
+	tmp = readl(port->mmio + PORT_CMD);
+	if (enable)
+		writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+	else
+		writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+
+	/* Flush */
+	readl(port->mmio + PORT_CMD);
+
+	return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX));
+}
+
+/*
+ * Enable/disable the DMA engine
+ *
+ * @port   Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ *	Previous state: 1 enabled, 0 disabled.
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+	u32 tmp;
+
+	/* enable FIS reception */
+	tmp = readl(port->mmio + PORT_CMD);
+	if (enable)
+		writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD);
+	else
+		writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD);
+
+	readl(port->mmio + PORT_CMD);
+	return (((tmp & PORT_CMD_START) == PORT_CMD_START));
+}
+
+/*
+ * Enables the port DMA engine and FIS reception.
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+	/* Enable FIS reception */
+	mtip_enable_fis(port, 1);
+
+	/* Enable the DMA engine */
+	mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+	/* Disable interrupts on this port */
+	writel(0, port->mmio + PORT_IRQ_MASK);
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Disable FIS reception */
+	mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+	int i;
+	mtip_deinit_port(port);
+
+	/* Program the command list base and FIS base addresses */
+	if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+		writel((port->command_list_dma >> 16) >> 16,
+			 port->mmio + PORT_LST_ADDR_HI);
+		writel((port->rxfis_dma >> 16) >> 16,
+			 port->mmio + PORT_FIS_ADDR_HI);
+	}
+
+	writel(port->command_list_dma & 0xFFFFFFFF,
+			port->mmio + PORT_LST_ADDR);
+	writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+	/* Clear SError */
+	writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+	/* reset the completed registers.*/
+	for (i = 0; i < port->dd->slot_groups; i++)
+		writel(0xFFFFFFFF, port->completed[i]);
+
+	/* Clear any pending interrupts for this port */
+	writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
+
+	/* Clear any pending interrupts on the HBA. */
+	writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+					port->dd->mmio + HOST_IRQ_STAT);
+
+	/* Enable port interrupts */
+	writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+	unsigned long timeout;
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+		 && time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	/*
+	 * Chip quirk: escalate to hba reset if
+	 * PxCMD.CR not clear after 500 ms
+	 */
+	if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+		dev_warn(&port->dd->pdev->dev,
+			"PxCMD.CR not clear, escalating reset\n");
+
+		if (hba_reset_nosleep(port->dd))
+			dev_err(&port->dd->pdev->dev,
+				"HBA reset escalation failed.\n");
+
+		/* 30 ms delay before com reset to quiesce chip */
+		mdelay(30);
+	}
+
+	dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+	/* Set PxSCTL.DET */
+	writel(readl(port->mmio + PORT_SCR_CTL) |
+			 1, port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Wait 1 ms to quiesce chip function */
+	timeout = jiffies + msecs_to_jiffies(1);
+	while (time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	/* Clear PxSCTL.DET */
+	writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+			 port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+			 && time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+		dev_warn(&port->dd->pdev->dev,
+			"COM reset failed\n");
+
+	mtip_init_port(port);
+	mtip_start_port(port);
+
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+			char *msg,
+			unsigned long *tagbits,
+			int cnt)
+{
+	unsigned char tagmap[128];
+	int group, tagmap_len = 0;
+
+	memset(tagmap, 0, sizeof(tagmap));
+	for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+		tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ",
+						tagbits[group-1]);
+	dev_warn(&dd->pdev->dev,
+			"%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
+}
+
+/*
+ * Called periodically to see if any read/write commands are
+ * taking too long to complete.
+ *
+ * @data Pointer to the PORT data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_timeout_function(unsigned long int data)
+{
+	struct mtip_port *port = (struct mtip_port *) data;
+	struct host_to_dev_fis *fis;
+	struct mtip_cmd *command;
+	int tag, cmdto_cnt = 0;
+	unsigned int bit, group;
+	unsigned int num_command_slots = port->dd->slot_groups * 32;
+	unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
+
+	if (unlikely(!port))
+		return;
+
+	if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
+		mod_timer(&port->cmd_timer,
+			jiffies + msecs_to_jiffies(30000));
+		return;
+	}
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	for (tag = 0; tag < num_command_slots; tag++) {
+		/*
+		 * Skip internal command slot as it has
+		 * its own timeout mechanism
+		 */
+		if (tag == MTIP_TAG_INTERNAL)
+			continue;
+
+		if (atomic_read(&port->commands[tag].active) &&
+		   (time_after(jiffies, port->commands[tag].comp_time))) {
+			group = tag >> 5;
+			bit = tag & 0x1F;
+
+			command = &port->commands[tag];
+			fis = (struct host_to_dev_fis *) command->command;
+
+			set_bit(tag, tagaccum);
+			cmdto_cnt++;
+			if (cmdto_cnt == 1)
+				set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+			/*
+			 * Clear the completed bit. This should prevent
+			 *  any interrupt handlers from trying to retire
+			 *  the command.
+			 */
+			writel(1 << bit, port->completed[group]);
+
+			/* Call the async completion callback. */
+			if (likely(command->async_callback))
+				command->async_callback(command->async_data,
+							 -EIO);
+			command->async_callback = NULL;
+			command->comp_func = NULL;
+
+			/* Unmap the DMA scatter list entries */
+			dma_unmap_sg(&port->dd->pdev->dev,
+					command->sg,
+					command->scatter_ents,
+					command->direction);
+
+			/*
+			 * Clear the allocated bit and active tag for the
+			 * command.
+			 */
+			atomic_set(&port->commands[tag].active, 0);
+			release_slot(port, tag);
+
+			up(&port->cmd_slot);
+		}
+	}
+
+	if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+		print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
+
+		mtip_restart_port(port);
+		clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+		wake_up_interruptible(&port->svc_wait);
+	}
+
+	if (port->ic_pause_timer) {
+		to  = port->ic_pause_timer + msecs_to_jiffies(1000);
+		if (time_after(jiffies, to)) {
+			if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+				port->ic_pause_timer = 0;
+				clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+				clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+				wake_up_interruptible(&port->svc_wait);
+			}
+
+
+		}
+	}
+
+	/* Restart the timer */
+	mod_timer(&port->cmd_timer,
+		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command.
+ * @data   Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+				int tag,
+				void *data,
+				int status)
+{
+	struct mtip_cmd *command;
+	struct driver_data *dd = data;
+	int cb_status = status ? -EIO : 0;
+
+	if (unlikely(!dd) || unlikely(!port))
+		return;
+
+	command = &port->commands[tag];
+
+	if (unlikely(status == PORT_IRQ_TF_ERR)) {
+		dev_warn(&port->dd->pdev->dev,
+			"Command tag %d failed due to TFE\n", tag);
+	}
+
+	/* Upper layer callback */
+	if (likely(command->async_callback))
+		command->async_callback(command->async_data, cb_status);
+
+	command->async_callback = NULL;
+	command->comp_func = NULL;
+
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev,
+		command->sg,
+		command->scatter_ents,
+		command->direction);
+
+	/* Clear the allocated and active bits for the command */
+	atomic_set(&port->commands[tag].active, 0);
+	release_slot(port, tag);
+
+	up(&port->cmd_slot);
+}
+
+/*
+ * Internal command completion callback function.
+ *
+ * This function is normally called by the driver ISR when an internal
+ * command completed. This function signals the command completion by
+ * calling complete().
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command that has completed.
+ * @data   Pointer to a completion structure.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_completion(struct mtip_port *port,
+			    int tag,
+			    void *data,
+			    int status)
+{
+	struct mtip_cmd *command = &port->commands[tag];
+	struct completion *waiting = data;
+	if (unlikely(status == PORT_IRQ_TF_ERR))
+		dev_warn(&port->dd->pdev->dev,
+			"Internal command %d completed with TFE\n", tag);
+
+	command->async_callback = NULL;
+	command->comp_func = NULL;
+
+	complete(waiting);
+}
+
+static void mtip_null_completion(struct mtip_port *port,
+			    int tag,
+			    void *data,
+			    int status)
+{
+	return;
+}
+
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib);
+/*
+ * Handle an error.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+	int group, tag, bit, reissue, rv;
+	struct mtip_port *port;
+	struct mtip_cmd  *cmd;
+	u32 completed;
+	struct host_to_dev_fis *fis;
+	unsigned long tagaccum[SLOTBITS_IN_LONGS];
+	unsigned int cmd_cnt = 0;
+	unsigned char *buf;
+	char *fail_reason = NULL;
+	int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+
+	dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+	port = dd->port;
+
+	/* Stop the timer to prevent command timeouts. */
+	del_timer(&port->cmd_timer);
+	set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+		cmd = &port->commands[MTIP_TAG_INTERNAL];
+		dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
+
+		atomic_inc(&cmd->active); /* active > 1 indicates error */
+		if (cmd->comp_data && cmd->comp_func) {
+			cmd->comp_func(port, MTIP_TAG_INTERNAL,
+					cmd->comp_data, PORT_IRQ_TF_ERR);
+		}
+		goto handle_tfe_exit;
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		completed = readl(port->completed[group]);
+
+		/* clear completed status register in the hardware.*/
+		writel(completed, port->completed[group]);
+
+		/* Process successfully completed commands */
+		for (bit = 0; bit < 32 && completed; bit++) {
+			if (!(completed & (1<<bit)))
+				continue;
+			tag = (group << 5) + bit;
+
+			/* Skip the internal command slot */
+			if (tag == MTIP_TAG_INTERNAL)
+				continue;
+
+			cmd = &port->commands[tag];
+			if (likely(cmd->comp_func)) {
+				set_bit(tag, tagaccum);
+				cmd_cnt++;
+				atomic_set(&cmd->active, 0);
+				cmd->comp_func(port,
+					 tag,
+					 cmd->comp_data,
+					 0);
+			} else {
+				dev_err(&port->dd->pdev->dev,
+					"Missing completion func for tag %d",
+					tag);
+				if (mtip_check_surprise_removal(dd->pdev)) {
+					mtip_command_cleanup(dd);
+					/* don't proceed further */
+					return;
+				}
+			}
+		}
+	}
+
+	print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
+
+	/* Restart the port */
+	mdelay(20);
+	mtip_restart_port(port);
+
+	/* Trying to determine the cause of the error */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+			fail_all_ncq_write = 1;
+			fail_reason = "write protect";
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+			fail_all_ncq_cmds = 1;
+			fail_reason = "thermal shutdown";
+		}
+		if (buf[288] == 0xBF) {
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+			fail_all_ncq_cmds = 1;
+			fail_reason = "rebuild failed";
+		}
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		for (bit = 0; bit < 32; bit++) {
+			reissue = 1;
+			tag = (group << 5) + bit;
+			cmd = &port->commands[tag];
+
+			/* If the active bit is set re-issue the command */
+			if (atomic_read(&cmd->active) == 0)
+				continue;
+
+			fis = (struct host_to_dev_fis *)cmd->command;
+
+			/* Should re-issue? */
+			if (tag == MTIP_TAG_INTERNAL ||
+			    fis->command == ATA_CMD_SET_FEATURES)
+				reissue = 0;
+			else {
+				if (fail_all_ncq_cmds ||
+					(fail_all_ncq_write &&
+					fis->command == ATA_CMD_FPDMA_WRITE)) {
+					dev_warn(&dd->pdev->dev,
+					"  Fail: %s w/tag %d [%s].\n",
+					fis->command == ATA_CMD_FPDMA_WRITE ?
+						"write" : "read",
+					tag,
+					fail_reason != NULL ?
+						fail_reason : "unknown");
+					atomic_set(&cmd->active, 0);
+					if (cmd->comp_func) {
+						cmd->comp_func(port, tag,
+							cmd->comp_data,
+							-ENODATA);
+					}
+					continue;
+				}
+			}
+
+			/*
+			 * First check if this command has
+			 *  exceeded its retries.
+			 */
+			if (reissue && (cmd->retries-- > 0)) {
+
+				set_bit(tag, tagaccum);
+
+				/* Re-issue the command. */
+				mtip_issue_ncq_command(port, tag);
+
+				continue;
+			}
+
+			/* Retire a command that will not be reissued */
+			dev_warn(&port->dd->pdev->dev,
+				"retiring tag %d\n", tag);
+			atomic_set(&cmd->active, 0);
+
+			if (cmd->comp_func)
+				cmd->comp_func(
+					port,
+					tag,
+					cmd->comp_data,
+					PORT_IRQ_TF_ERR);
+			else
+				dev_warn(&port->dd->pdev->dev,
+					"Bad completion for tag %d\n",
+					tag);
+		}
+	}
+	print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
+
+handle_tfe_exit:
+	/* clear eh_active */
+	clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+	wake_up_interruptible(&port->svc_wait);
+
+	mod_timer(&port->cmd_timer,
+		 jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * Handle a set device bits interrupt
+ */
+static inline void mtip_process_sdbf(struct driver_data *dd)
+{
+	struct mtip_port  *port = dd->port;
+	int group, tag, bit;
+	u32 completed;
+	struct mtip_cmd *command;
+
+	/* walk all bits in all slot groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		completed = readl(port->completed[group]);
+		if (!completed)
+			continue;
+
+		/* clear completed status register in the hardware.*/
+		writel(completed, port->completed[group]);
+
+		/* Process completed commands. */
+		for (bit = 0;
+		     (bit < 32) && completed;
+		     bit++, completed >>= 1) {
+			if (completed & 0x01) {
+				tag = (group << 5) | bit;
+
+				/* skip internal command slot. */
+				if (unlikely(tag == MTIP_TAG_INTERNAL))
+					continue;
+
+				command = &port->commands[tag];
+				/* make internal callback */
+				if (likely(command->comp_func)) {
+					command->comp_func(
+						port,
+						tag,
+						command->comp_data,
+						0);
+				} else {
+					dev_warn(&dd->pdev->dev,
+						"Null completion "
+						"for tag %d",
+						tag);
+
+					if (mtip_check_surprise_removal(
+						dd->pdev)) {
+						mtip_command_cleanup(dd);
+						return;
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Process legacy pio and d2h interrupts
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+	struct mtip_port *port = dd->port;
+	struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL];
+
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+	    (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+		& (1 << MTIP_TAG_INTERNAL))) {
+		if (cmd->comp_func) {
+			cmd->comp_func(port,
+				MTIP_TAG_INTERNAL,
+				cmd->comp_data,
+				0);
+			return;
+		}
+	}
+
+	return;
+}
+
+/*
+ * Demux and handle errors
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+	if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)))
+		mtip_handle_tfe(dd);
+
+	if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.x\n");
+		writel((1 << 26), dd->port->mmio + PORT_SCR_ERR);
+	}
+
+	if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.n\n");
+		writel((1 << 16), dd->port->mmio + PORT_SCR_ERR);
+	}
+
+	if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) {
+		dev_warn(&dd->pdev->dev,
+			"Port stat errors %x unhandled\n",
+			(port_stat & ~PORT_IRQ_HANDLED));
+	}
+}
+
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+	struct driver_data *dd = (struct driver_data *) data;
+	struct mtip_port *port = dd->port;
+	u32 hba_stat, port_stat;
+	int rv = IRQ_NONE;
+
+	hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+	if (hba_stat) {
+		rv = IRQ_HANDLED;
+
+		/* Acknowledge the interrupt status on the port.*/
+		port_stat = readl(port->mmio + PORT_IRQ_STAT);
+		writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+		/* Demux port status */
+		if (likely(port_stat & PORT_IRQ_SDB_FIS))
+			mtip_process_sdbf(dd);
+
+		if (unlikely(port_stat & PORT_IRQ_ERR)) {
+			if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+				mtip_command_cleanup(dd);
+				/* don't proceed further */
+				return IRQ_HANDLED;
+			}
+			if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag))
+				return rv;
+
+			mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+		}
+
+		if (unlikely(port_stat & PORT_IRQ_LEGACY))
+			mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+	}
+
+	/* acknowledge interrupt */
+	writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+	return rv;
+}
+
+/*
+ * Wrapper for mtip_handle_irq
+ * (ignores return code)
+ */
+static void mtip_tasklet(unsigned long data)
+{
+	mtip_handle_irq((struct driver_data *) data);
+}
+
+/*
+ * HBA interrupt subroutine.
+ *
+ * @irq		IRQ number.
+ * @instance	Pointer to the driver data structure.
+ *
+ * return value
+ *	IRQ_HANDLED	A HBA interrupt was pending and handled.
+ *	IRQ_NONE	This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+	struct driver_data *dd = instance;
+	tasklet_schedule(&dd->tasklet);
+	return IRQ_HANDLED;
+}
+
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+	atomic_set(&port->commands[tag].active, 1);
+	writel(1 << MTIP_TAG_BIT(tag),
+		port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+static bool mtip_pause_ncq(struct mtip_port *port,
+				struct host_to_dev_fis *fis)
+{
+	struct host_to_dev_fis *reply;
+	unsigned long task_file_data;
+
+	reply = port->rxfis + RX_FIS_D2H_REG;
+	task_file_data = readl(port->mmio+PORT_TFDATA);
+
+	if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+	if ((task_file_data & 1))
+		return false;
+
+	if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+		set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+		set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+					(fis->features == 0x03)) {
+		set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+		((fis->command == 0xFC) &&
+			(fis->features == 0x27 || fis->features == 0x72 ||
+			 fis->features == 0x62 || fis->features == 0x26))) {
+		/* Com reset after secure erase or lowlevel format */
+		mtip_restart_port(port);
+		return false;
+	}
+
+	return false;
+}
+
+/*
+ * Wait for port to quiesce
+ *
+ * @port    Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ *
+ * return value
+ *	0	Success
+ *	-EBUSY  Commands still active
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+	unsigned long to;
+	unsigned int n;
+	unsigned int active = 1;
+
+	to = jiffies + msecs_to_jiffies(timeout);
+	do {
+		if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+			msleep(20);
+			continue; /* svc thd is actively issuing commands */
+		}
+		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+			return -EFAULT;
+		/*
+		 * Ignore s_active bit 0 of array element 0.
+		 * This bit will always be set
+		 */
+		active = readl(port->s_active[0]) & 0xFFFFFFFE;
+		for (n = 1; n < port->dd->slot_groups; n++)
+			active |= readl(port->s_active[n]);
+
+		if (!active)
+			break;
+
+		msleep(20);
+	} while (time_before(jiffies, to));
+
+	return active ? -EBUSY : 0;
+}
+
+/*
+ * Execute an internal command and wait for the completion.
+ *
+ * @port    Pointer to the port data structure.
+ * @fis     Pointer to the FIS that describes the command.
+ * @fis_len  Length in WORDS of the FIS.
+ * @buffer  DMA accessible for command data.
+ * @buf_len  Length, in bytes, of the data buffer.
+ * @opts    Command header options, excluding the FIS length
+ *             and the number of PRD entries.
+ * @timeout Time in ms to wait for the command to complete.
+ *
+ * return value
+ *	0	 Command completed successfully.
+ *	-EFAULT  The buffer address is not correctly aligned.
+ *	-EBUSY   Internal command or other IO in progress.
+ *	-EAGAIN  Time out waiting for command to complete.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+					struct host_to_dev_fis *fis,
+					int fis_len,
+					dma_addr_t buffer,
+					int buf_len,
+					u32 opts,
+					gfp_t atomic,
+					unsigned long timeout)
+{
+	struct mtip_cmd_sg *command_sg;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	int rv = 0, ready2go = 1;
+	struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
+	unsigned long to;
+
+	/* Make sure the buffer is 8 byte aligned. This is asic specific. */
+	if (buffer & 0x00000007) {
+		dev_err(&port->dd->pdev->dev,
+			"SG buffer is not 8 byte aligned\n");
+		return -EFAULT;
+	}
+
+	to = jiffies + msecs_to_jiffies(timeout);
+	do {
+		ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL,
+						port->allocated);
+		if (ready2go)
+			break;
+		mdelay(100);
+	} while (time_before(jiffies, to));
+	if (!ready2go) {
+		dev_warn(&port->dd->pdev->dev,
+			"Internal cmd active. new cmd [%02X]\n", fis->command);
+		return -EBUSY;
+	}
+	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+	port->ic_pause_timer = 0;
+
+	if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+		clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+	else if (fis->command == ATA_CMD_DOWNLOAD_MICRO)
+		clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+
+	if (atomic == GFP_KERNEL) {
+		if (fis->command != ATA_CMD_STANDBYNOW1) {
+			/* wait for io to complete if non atomic */
+			if (mtip_quiesce_io(port, 5000) < 0) {
+				dev_warn(&port->dd->pdev->dev,
+					"Failed to quiesce IO\n");
+				release_slot(port, MTIP_TAG_INTERNAL);
+				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+				wake_up_interruptible(&port->svc_wait);
+				return -EBUSY;
+			}
+		}
+
+		/* Set the completion function and data for the command. */
+		int_cmd->comp_data = &wait;
+		int_cmd->comp_func = mtip_completion;
+
+	} else {
+		/* Clear completion - we're going to poll */
+		int_cmd->comp_data = NULL;
+		int_cmd->comp_func = mtip_null_completion;
+	}
+
+	/* Copy the command to the command table */
+	memcpy(int_cmd->command, fis, fis_len*4);
+
+	/* Populate the SG list */
+	int_cmd->command_header->opts =
+		 __force_bit2int cpu_to_le32(opts | fis_len);
+	if (buf_len) {
+		command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+		command_sg->info =
+			__force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF);
+		command_sg->dba	=
+			__force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF);
+		command_sg->dba_upper =
+			__force_bit2int cpu_to_le32((buffer >> 16) >> 16);
+
+		int_cmd->command_header->opts |=
+			__force_bit2int cpu_to_le32((1 << 16));
+	}
+
+	/* Populate the command header */
+	int_cmd->command_header->byte_count = 0;
+
+	/* Issue the command to the hardware */
+	mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL);
+
+	/* Poll if atomic, wait_for_completion otherwise */
+	if (atomic == GFP_KERNEL) {
+		/* Wait for the command to complete or timeout. */
+		if (wait_for_completion_timeout(
+				&wait,
+				msecs_to_jiffies(timeout)) == 0) {
+			dev_err(&port->dd->pdev->dev,
+				"Internal command did not complete [%d] "
+				"within timeout of  %lu ms\n",
+				atomic, timeout);
+			if (mtip_check_surprise_removal(port->dd->pdev) ||
+				test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+						&port->dd->dd_flag)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			rv = -EAGAIN;
+		}
+	} else {
+		/* Spin for <timeout> checking if command still outstanding */
+		timeout = jiffies + msecs_to_jiffies(timeout);
+		while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+				& (1 << MTIP_TAG_INTERNAL))
+				&& time_before(jiffies, timeout)) {
+			if (mtip_check_surprise_removal(port->dd->pdev)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			if ((fis->command != ATA_CMD_STANDBYNOW1) &&
+				test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+						&port->dd->dd_flag)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			if (readl(port->mmio + PORT_IRQ_STAT) & PORT_IRQ_ERR) {
+				atomic_inc(&int_cmd->active); /* error */
+				break;
+			}
+		}
+	}
+
+	if (atomic_read(&int_cmd->active) > 1) {
+		dev_err(&port->dd->pdev->dev,
+			"Internal command [%02X] failed\n", fis->command);
+		rv = -EIO;
+	}
+	if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+			& (1 << MTIP_TAG_INTERNAL)) {
+		rv = -ENXIO;
+		if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+					&port->dd->dd_flag)) {
+			mtip_restart_port(port);
+			rv = -EAGAIN;
+		}
+	}
+exec_ic_exit:
+	/* Clear the allocated and active bits for the internal command. */
+	atomic_set(&int_cmd->active, 0);
+	release_slot(port, MTIP_TAG_INTERNAL);
+	if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+		/* NCQ paused */
+		return rv;
+	}
+	clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+	wake_up_interruptible(&port->svc_wait);
+
+	return rv;
+}
+
+/*
+ * Byte-swap ATA ID strings.
+ *
+ * ATA identify data contains strings in byte-swapped 16-bit words.
+ * They must be swapped (on all architectures) to be usable as C strings.
+ * This function swaps bytes in-place.
+ *
+ * @buf The buffer location of the string
+ * @len The number of bytes to swap
+ *
+ * return value
+ *	None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+	int i;
+	for (i = 0; i < (len/2); i++)
+		be16_to_cpus(&buf[i]);
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port	 Pointer to the port structure.
+ * @user_buffer  A user space buffer where the identify data should be
+ *                    copied.
+ *
+ * return value
+ *	0	Command completed successfully.
+ *	-EFAULT An error occurred while coping data to the user buffer.
+ *	-1	Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+	int rv = 0;
+	struct host_to_dev_fis fis;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return -EFAULT;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_ID_ATA;
+
+	/* Set the identify information as invalid. */
+	port->identify_valid = 0;
+
+	/* Clear the identify information. */
+	memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				5,
+				port->identify_dma,
+				sizeof(u16) * ATA_ID_WORDS,
+				0,
+				GFP_KERNEL,
+				MTIP_INTERNAL_COMMAND_TIMEOUT_MS)
+				< 0) {
+		rv = -1;
+		goto out;
+	}
+
+	/*
+	 * Perform any necessary byte-swapping.  Yes, the kernel does in fact
+	 * perform field-sensitive swapping on the string fields.
+	 * See the kernel use of ata_id_string() for proof of this.
+	 */
+#ifdef __LITTLE_ENDIAN
+	ata_swap_string(port->identify + 27, 40);  /* model string*/
+	ata_swap_string(port->identify + 23, 8);   /* firmware string*/
+	ata_swap_string(port->identify + 10, 20);  /* serial# string*/
+#else
+	{
+		int i;
+		for (i = 0; i < ATA_ID_WORDS; i++)
+			port->identify[i] = le16_to_cpu(port->identify[i]);
+	}
+#endif
+
+	/* Set the identify buffer as valid. */
+	port->identify_valid = 1;
+
+	if (user_buffer) {
+		if (copy_to_user(
+			user_buffer,
+			port->identify,
+			ATA_ID_WORDS * sizeof(u16))) {
+			rv = -EFAULT;
+			goto out;
+		}
+	}
+
+out:
+	return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	0	Command was executed successfully.
+ *	-1	An error occurred while executing the command.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+	int rv;
+	struct host_to_dev_fis	fis;
+	unsigned long start;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_STANDBYNOW1;
+
+	start = jiffies;
+	rv = mtip_exec_internal_command(port,
+					&fis,
+					5,
+					0,
+					0,
+					0,
+					GFP_ATOMIC,
+					15000);
+	dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+			jiffies_to_msecs(jiffies - start));
+	if (rv)
+		dev_warn(&port->dd->pdev->dev,
+			"STANDBY IMMEDIATE command failed.\n");
+
+	return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ *
+ * @port	pointer to the port structure.
+ * @page	page number to fetch
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ * @sectors	page length to fetch, in sectors
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_READ_LOG_EXT;
+	fis.sect_count	= sectors & 0xFF;
+	fis.sect_cnt_ex	= (sectors >> 8) & 0xFF;
+	fis.lba_low	= page;
+	fis.lba_mid	= 0;
+	fis.device	= ATA_DEVICE_OBS;
+
+	memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					sectors * ATA_SECT_SIZE,
+					0,
+					GFP_ATOMIC,
+					MTIP_INTERNAL_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ *
+ * @port	pointer to the port structure.
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+					dma_addr_t buffer_dma)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_SMART;
+	fis.features	= 0xD0;
+	fis.sect_count	= 1;
+	fis.lba_mid	= 0x4F;
+	fis.lba_hi	= 0xC2;
+	fis.device	= ATA_DEVICE_OBS;
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					ATA_SECT_SIZE,
+					0,
+					GFP_ATOMIC,
+					15000);
+}
+
+/*
+ * Get the value of a smart attribute
+ *
+ * @port	pointer to the port structure
+ * @id		attribute number
+ * @attrib	pointer to return attrib information corresponding to @id
+ *
+ * return value
+ *	-EINVAL	NULL buffer passed or unsupported attribute @id.
+ *	-EPERM	Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib)
+{
+	int rv, i;
+	struct smart_attr *pattr;
+
+	if (!attrib)
+		return -EINVAL;
+
+	if (!port->identify_valid) {
+		dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+		return -EPERM;
+	}
+	if (!(port->identify[82] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+		return -EPERM;
+	}
+	if (!(port->identify[85] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+		return -EPERM;
+	}
+
+	memset(port->smart_buf, 0, ATA_SECT_SIZE);
+	rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+	if (rv) {
+		dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n");
+		return rv;
+	}
+
+	pattr = (struct smart_attr *)(port->smart_buf + 2);
+	for (i = 0; i < 29; i++, pattr++)
+		if (pattr->attr_id == id) {
+			memcpy(attrib, pattr, sizeof(struct smart_attr));
+			break;
+		}
+
+	if (i == 29) {
+		dev_warn(&port->dd->pdev->dev,
+			"Query for invalid SMART attribute ID\n");
+		rv = -EINVAL;
+	}
+
+	return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd      Pointer to the device data structure.
+ * @sectors Pointer to the variable that will receive the sector count.
+ *
+ * return value
+ *	1 Capacity was returned successfully.
+ *	0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+	struct mtip_port *port = dd->port;
+	u64 total, raw0, raw1, raw2, raw3;
+	raw0 = port->identify[100];
+	raw1 = port->identify[101];
+	raw2 = port->identify[102];
+	raw3 = port->identify[103];
+	total = raw0 | raw1<<16 | raw2<<32 | raw3<<48;
+	*sectors = total;
+	return (bool) !!port->identify_valid;
+}
+
+/*
+ * Reset the HBA.
+ *
+ * Resets the HBA by setting the HBA Reset bit in the Global
+ * HBA Control register. After setting the HBA Reset bit the
+ * function waits for 1 second before reading the HBA Reset
+ * bit to make sure it has cleared. If HBA Reset is not clear
+ * an error is returned. Cannot be used in non-blockable
+ * context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0  The reset was successful.
+ *	-1 The HBA Reset bit did not clear.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+	mtip_deinit_port(dd->port);
+
+	/* Set the reset bit */
+	writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+	/* Flush */
+	readl(dd->mmio + HOST_CTL);
+
+	/* Wait for reset to clear */
+	ssleep(1);
+
+	/* Check the bit has cleared */
+	if (readl(dd->mmio + HOST_CTL) & HOST_RESET) {
+		dev_err(&dd->pdev->dev,
+			"Reset bit did not clear.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Display the identify command data.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+	sector_t sectors;
+	unsigned short revid;
+	char cbuf[42];
+
+	if (!port->identify_valid)
+		return;
+
+	strlcpy(cbuf, (char *)(port->identify+10), 21);
+	dev_info(&port->dd->pdev->dev,
+		"Serial No.: %s\n", cbuf);
+
+	strlcpy(cbuf, (char *)(port->identify+23), 9);
+	dev_info(&port->dd->pdev->dev,
+		"Firmware Ver.: %s\n", cbuf);
+
+	strlcpy(cbuf, (char *)(port->identify+27), 41);
+	dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+	if (mtip_hw_get_capacity(port->dd, &sectors))
+		dev_info(&port->dd->pdev->dev,
+			"Capacity: %llu sectors (%llu MB)\n",
+			 (u64)sectors,
+			 ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+	pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+	switch (revid & 0xFF) {
+	case 0x1:
+		strlcpy(cbuf, "A0", 3);
+		break;
+	case 0x3:
+		strlcpy(cbuf, "A2", 3);
+		break;
+	default:
+		strlcpy(cbuf, "?", 2);
+		break;
+	}
+	dev_info(&port->dd->pdev->dev,
+		"Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ *	None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+				struct mtip_cmd *command,
+				int nents)
+{
+	int n;
+	unsigned int dma_len;
+	struct mtip_cmd_sg *command_sg;
+	struct scatterlist *sg = command->sg;
+
+	command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+	for (n = 0; n < nents; n++) {
+		dma_len = sg_dma_len(sg);
+		if (dma_len > 0x400000)
+			dev_err(&dd->pdev->dev,
+				"DMA segment length truncated\n");
+		command_sg->info = __force_bit2int
+			cpu_to_le32((dma_len-1) & 0x3FFFFF);
+		command_sg->dba	= __force_bit2int
+			cpu_to_le32(sg_dma_address(sg));
+		command_sg->dba_upper = __force_bit2int
+			cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+		command_sg++;
+		sg++;
+	}
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[1];
+	fis.sect_count	= command[2];
+	fis.sector	= command[3];
+	fis.cyl_low	= command[4];
+	fis.cyl_hi	= command[5];
+	fis.device	= command[6] & ~0x10; /* Clear the dev bit*/
+
+	dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3],
+		command[4],
+		command[5],
+		command[6]);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				 &fis,
+				 5,
+				 0,
+				 0,
+				 0,
+				 GFP_KERNEL,
+				 MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) {
+		return -1;
+	}
+
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[4] = reply->cyl_low;
+	command[5] = reply->cyl_hi;
+
+	dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[4],
+		command[5]);
+
+	return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ *                   data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT An error occurred while copying the completion
+ *                 data to the user space buffer.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+				void __user *user_buffer)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *buf = NULL;
+	dma_addr_t dma_addr = 0;
+	int rv = 0, xfer_sz = command[3];
+
+	if (xfer_sz) {
+		if (!user_buffer)
+			return -EFAULT;
+
+		buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz,
+				&dma_addr,
+				GFP_KERNEL);
+		if (!buf) {
+			dev_err(&port->dd->pdev->dev,
+				"Memory allocation failed (%d bytes)\n",
+				ATA_SECT_SIZE * xfer_sz);
+			return -ENOMEM;
+		}
+		memset(buf, 0, ATA_SECT_SIZE * xfer_sz);
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[2];
+	fis.sect_count	= command[3];
+	if (fis.command == ATA_CMD_SMART) {
+		fis.sector	= command[1];
+		fis.cyl_low	= 0x4F;
+		fis.cyl_hi	= 0xC2;
+	}
+
+	if (xfer_sz)
+		reply = (port->rxfis + RX_FIS_PIO_SETUP);
+	else
+		reply = (port->rxfis + RX_FIS_D2H_REG);
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: User Command: cmd %x, sect %x, "
+		"feat %x, sectcnt %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3]);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				 5,
+				 (xfer_sz ? dma_addr : 0),
+				 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
+				 0,
+				 GFP_KERNEL,
+				 MTIP_IOCTL_COMMAND_TIMEOUT_MS)
+				 < 0) {
+		rv = -EFAULT;
+		goto exit_drive_command;
+	}
+
+	/* Collect the completion status. */
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[2] = reply->sect_count;
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: Completion Status: stat %x, "
+		"err %x, nsect %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2]);
+
+	if (xfer_sz) {
+		if (copy_to_user(user_buffer,
+				 buf,
+				 ATA_SECT_SIZE * command[3])) {
+			rv = -EFAULT;
+			goto exit_drive_command;
+		}
+	}
+exit_drive_command:
+	if (buf)
+		dmam_free_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
+	return rv;
+}
+
+/*
+ *  Indicates whether a command has a single sector payload.
+ *
+ *  @command passed to the device to perform the certain event.
+ *  @features passed to the device to perform the certain event.
+ *
+ *  return value
+ *	1	command is one that always has a single sector payload,
+ *		regardless of the value in the Sector Count field.
+ *      0       otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+				    unsigned char features)
+{
+	unsigned int rv = 0;
+
+	/* list of commands that have an implicit sector count of 1 */
+	switch (command) {
+	case ATA_CMD_SEC_SET_PASS:
+	case ATA_CMD_SEC_UNLOCK:
+	case ATA_CMD_SEC_ERASE_PREP:
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case ATA_CMD_SEC_FREEZE_LOCK:
+	case ATA_CMD_SEC_DISABLE_PASS:
+	case ATA_CMD_PMP_READ:
+	case ATA_CMD_PMP_WRITE:
+		rv = 1;
+		break;
+	case ATA_CMD_SET_MAX:
+		if (features == ATA_SET_MAX_UNLOCK)
+			rv = 1;
+		break;
+	case ATA_CMD_SMART:
+		if ((features == ATA_SMART_READ_VALUES) ||
+				(features == ATA_SMART_READ_THRESHOLDS))
+			rv = 1;
+		break;
+	case ATA_CMD_CONF_OVERLAY:
+		if ((features == ATA_DCO_IDENTIFY) ||
+				(features == ATA_DCO_SET))
+			rv = 1;
+		break;
+	}
+	return rv;
+}
+
+static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout)
+{
+	switch (fis->command) {
+	case ATA_CMD_DOWNLOAD_MICRO:
+		*timeout = 120000; /* 2 minutes */
+		break;
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case 0xFC:
+		*timeout = 240000; /* 4 minutes */
+		break;
+	case ATA_CMD_STANDBYNOW1:
+		*timeout = 120000;  /* 2 minutes */
+		break;
+	case 0xF7:
+	case 0xFA:
+		*timeout = 60000;  /* 60 seconds */
+		break;
+	case ATA_CMD_SMART:
+		*timeout = 15000;  /* 15 seconds */
+		break;
+	default:
+		*timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+		break;
+	}
+}
+
+/*
+ * Executes a taskfile
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+			       void __user *buf,
+			       ide_task_request_t *req_task,
+			       int outtotal)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *outbuf = NULL;
+	u8 *inbuf = NULL;
+	dma_addr_t outbuf_dma = 0;
+	dma_addr_t inbuf_dma = 0;
+	dma_addr_t dma_buffer = 0;
+	int err = 0;
+	unsigned int taskin = 0;
+	unsigned int taskout = 0;
+	u8 nsect = 0;
+	unsigned int timeout;
+	unsigned int force_single_sector;
+	unsigned int transfer_size;
+	unsigned long task_file_data;
+	int intotal = outtotal + req_task->out_size;
+
+	taskout = req_task->out_size;
+	taskin = req_task->in_size;
+	/* 130560 = 512 * 0xFF*/
+	if (taskin > 130560 || taskout > 130560) {
+		err = -EINVAL;
+		goto abort;
+	}
+
+	if (taskout) {
+		outbuf = kzalloc(taskout, GFP_KERNEL);
+		if (outbuf == NULL) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		if (copy_from_user(outbuf, buf + outtotal, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+		outbuf_dma = pci_map_single(dd->pdev,
+					 outbuf,
+					 taskout,
+					 DMA_TO_DEVICE);
+		if (outbuf_dma == 0) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = outbuf_dma;
+	}
+
+	if (taskin) {
+		inbuf = kzalloc(taskin, GFP_KERNEL);
+		if (inbuf == NULL) {
+			err = -ENOMEM;
+			goto abort;
+		}
+
+		if (copy_from_user(inbuf, buf + intotal, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+		inbuf_dma = pci_map_single(dd->pdev,
+					 inbuf,
+					 taskin, DMA_FROM_DEVICE);
+		if (inbuf_dma == 0) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = inbuf_dma;
+	}
+
+	/* only supports PIO and non-data commands from this ioctl. */
+	switch (req_task->data_phase) {
+	case TASKFILE_OUT:
+		nsect = taskout / ATA_SECT_SIZE;
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_IN:
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_NO_DATA:
+		reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+		break;
+	default:
+		err = -EINVAL;
+		goto abort;
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= req_task->io_ports[7];
+	fis.features	= req_task->io_ports[1];
+	fis.sect_count	= req_task->io_ports[2];
+	fis.lba_low	= req_task->io_ports[3];
+	fis.lba_mid	= req_task->io_ports[4];
+	fis.lba_hi	= req_task->io_ports[5];
+	 /* Clear the dev bit*/
+	fis.device	= req_task->io_ports[6] & ~0x10;
+
+	if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+		req_task->in_flags.all	=
+			IDE_TASKFILE_STD_IN_FLAGS |
+			(IDE_HOB_STD_IN_FLAGS << 8);
+		fis.lba_low_ex		= req_task->hob_ports[3];
+		fis.lba_mid_ex		= req_task->hob_ports[4];
+		fis.lba_hi_ex		= req_task->hob_ports[5];
+		fis.features_ex		= req_task->hob_ports[1];
+		fis.sect_cnt_ex		= req_task->hob_ports[2];
+
+	} else {
+		req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+	}
+
+	force_single_sector = implicit_sector(fis.command, fis.features);
+
+	if ((taskin || taskout) && (!fis.sect_count)) {
+		if (nsect)
+			fis.sect_count = nsect;
+		else {
+			if (!force_single_sector) {
+				dev_warn(&dd->pdev->dev,
+					"data movement but "
+					"sect_count is 0\n");
+					err = -EINVAL;
+					goto abort;
+			}
+		}
+	}
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: cmd %x, feat %x, nsect %x,"
+		" sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+		" head/dev %x\n",
+		__func__,
+		fis.command,
+		fis.features,
+		fis.sect_count,
+		fis.lba_low,
+		fis.lba_mid,
+		fis.lba_hi,
+		fis.device);
+
+	mtip_set_timeout(&fis, &timeout);
+
+	/* Determine the correct transfer size.*/
+	if (force_single_sector)
+		transfer_size = ATA_SECT_SIZE;
+	else
+		transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+	/* Execute the command.*/
+	if (mtip_exec_internal_command(dd->port,
+				 &fis,
+				 5,
+				 dma_buffer,
+				 transfer_size,
+				 0,
+				 GFP_KERNEL,
+				 timeout) < 0) {
+		err = -EIO;
+		goto abort;
+	}
+
+	task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+	if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+		reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+		req_task->io_ports[7] = reply->control;
+	} else {
+		reply = dd->port->rxfis + RX_FIS_D2H_REG;
+		req_task->io_ports[7] = reply->command;
+	}
+
+	/* reclaim the DMA buffers.*/
+	if (inbuf_dma)
+		pci_unmap_single(dd->pdev, inbuf_dma,
+			taskin, DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		pci_unmap_single(dd->pdev, outbuf_dma,
+			taskout, DMA_TO_DEVICE);
+	inbuf_dma  = 0;
+	outbuf_dma = 0;
+
+	/* return the ATA registers to the caller.*/
+	req_task->io_ports[1] = reply->features;
+	req_task->io_ports[2] = reply->sect_count;
+	req_task->io_ports[3] = reply->lba_low;
+	req_task->io_ports[4] = reply->lba_mid;
+	req_task->io_ports[5] = reply->lba_hi;
+	req_task->io_ports[6] = reply->device;
+
+	if (req_task->out_flags.all & 1)  {
+
+		req_task->hob_ports[3] = reply->lba_low_ex;
+		req_task->hob_ports[4] = reply->lba_mid_ex;
+		req_task->hob_ports[5] = reply->lba_hi_ex;
+		req_task->hob_ports[1] = reply->features_ex;
+		req_task->hob_ports[2] = reply->sect_cnt_ex;
+	}
+	dbg_printk(MTIP_DRV_NAME
+		" %s: Completion: stat %x,"
+		"err %x, sect_cnt %x, lbalo %x,"
+		"lbamid %x, lbahi %x, dev %x\n",
+		__func__,
+		req_task->io_ports[7],
+		req_task->io_ports[1],
+		req_task->io_ports[2],
+		req_task->io_ports[3],
+		req_task->io_ports[4],
+		req_task->io_ports[5],
+		req_task->io_ports[6]);
+
+	if (taskout) {
+		if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+	if (taskin) {
+		if (copy_to_user(buf + intotal, inbuf, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+abort:
+	if (inbuf_dma)
+		pci_unmap_single(dd->pdev, inbuf_dma,
+					taskin, DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		pci_unmap_single(dd->pdev, outbuf_dma,
+					taskout, DMA_TO_DEVICE);
+	kfree(outbuf);
+	kfree(inbuf);
+
+	return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -ENOTTY.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ *	0	The IOCTL completed successfully.
+ *	-ENOTTY The specified command is not supported.
+ *	-EFAULT An error occurred copying data to a user space buffer.
+ *	-EIO	An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+			 unsigned long arg)
+{
+	switch (cmd) {
+	case HDIO_GET_IDENTITY:
+	{
+		if (copy_to_user((void __user *)arg, dd->port->identify,
+						sizeof(u16) * ATA_ID_WORDS))
+			return -EFAULT;
+		break;
+	}
+	case HDIO_DRIVE_CMD:
+	{
+		u8 drive_command[4];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_command(dd->port,
+					 drive_command,
+					 (void __user *) (arg+4)))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASK:
+	{
+		u8 drive_command[7];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_task(dd->port, drive_command))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASKFILE: {
+		ide_task_request_t req_task;
+		int ret, outtotal;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+					sizeof(req_task)))
+			return -EFAULT;
+
+		outtotal = sizeof(req_task);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+							sizeof(req_task)))
+			return -EFAULT;
+
+		return ret;
+	}
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * This function is called by the block layer to issue an io
+ * to the device. Upon completion, the callback function will
+ * be called with the data parameter passed as the callback data.
+ *
+ * @dd       Pointer to the driver data structure.
+ * @start    First sector to read.
+ * @nsect    Number of sectors to read.
+ * @nents    Number of entries in scatter list for the read command.
+ * @tag      The tag of this read command.
+ * @callback Pointer to the function that should be called
+ *	     when the read completes.
+ * @data     Callback data passed to the callback function
+ *	     when the read completes.
+ * @dir      Direction (read or write)
+ *
+ * return value
+ *	None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
+			      int nsect, int nents, int tag, void *callback,
+			      void *data, int dir)
+{
+	struct host_to_dev_fis	*fis;
+	struct mtip_port *port = dd->port;
+	struct mtip_cmd *command = &port->commands[tag];
+	int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	/* Map the scatter list for DMA access */
+	nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+
+	command->scatter_ents = nents;
+
+	/*
+	 * The number of retries for this command before it is
+	 * reported as a failure to the upper layers.
+	 */
+	command->retries = MTIP_MAX_RETRIES;
+
+	/* Fill out fis */
+	fis = command->command;
+	fis->type        = 0x27;
+	fis->opts        = 1 << 7;
+	fis->command     =
+		(dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE);
+	*((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF);
+	*((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF);
+	fis->device	 = 1 << 6;
+	fis->features    = nsect & 0xFF;
+	fis->features_ex = (nsect >> 8) & 0xFF;
+	fis->sect_count  = ((tag << 3) | (tag >> 5));
+	fis->sect_cnt_ex = 0;
+	fis->control     = 0;
+	fis->res2        = 0;
+	fis->res3        = 0;
+	fill_command_sg(dd, command, nents);
+
+	/* Populate the command header */
+	command->command_header->opts =
+			__force_bit2int cpu_to_le32(
+				(nents << 16) | 5 | AHCI_CMD_PREFETCH);
+	command->command_header->byte_count = 0;
+
+	/*
+	 * Set the completion function and data for the command
+	 * within this layer.
+	 */
+	command->comp_data = dd;
+	command->comp_func = mtip_async_complete;
+	command->direction = dma_dir;
+
+	/*
+	 * Set the completion function and data for the command passed
+	 * from the upper layer.
+	 */
+	command->async_data = data;
+	command->async_callback = callback;
+
+	/*
+	 * To prevent this command from being issued
+	 * if an internal command is in progress or error handling is active.
+	 */
+	if (port->flags & MTIP_PF_PAUSE_IO) {
+		set_bit(tag, port->cmds_to_issue);
+		set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		return;
+	}
+
+	/* Issue the command to the hardware */
+	mtip_issue_ncq_command(port, tag);
+
+	return;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @tag Slot tag
+ *
+ * return value
+ *      None
+ */
+static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
+{
+	release_slot(dd->port, tag);
+}
+
+/*
+ * Obtain a command slot and return its associated scatter list.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @tag Pointer to an int that will receive the allocated command
+ *            slot tag.
+ *
+ * return value
+ *	Pointer to the scatter list for the allocated command slot
+ *	or NULL if no command slots are available.
+ */
+static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
+						   int *tag)
+{
+	/*
+	 * It is possible that, even with this semaphore, a thread
+	 * may think that no command slots are available. Therefore, we
+	 * need to make an attempt to get_slot().
+	 */
+	down(&dd->port->cmd_slot);
+	*tag = get_slot(dd->port);
+
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+		up(&dd->port->cmd_slot);
+		return NULL;
+	}
+	if (unlikely(*tag < 0)) {
+		up(&dd->port->cmd_slot);
+		return NULL;
+	}
+
+	return dd->port->commands[*tag].sg;
+}
+
+/*
+ * Sysfs status dump.
+ *
+ * @dev  Pointer to the device structure, passed by the kernrel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf  Pointer to the char buffer that will receive the stats info.
+ *
+ * return value
+ *	The size, in bytes, of the data copied into buf.
+ */
+static ssize_t mtip_hw_show_status(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct driver_data *dd = dev_to_disk(dev)->private_data;
+	int size = 0;
+
+	if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+		size += sprintf(buf, "%s", "thermal_shutdown\n");
+	else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+		size += sprintf(buf, "%s", "write_protect\n");
+	else
+		size += sprintf(buf, "%s", "online\n");
+
+	return size;
+}
+
+static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+
+static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
+				  size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	u32 group_allocated;
+	int size = *offset;
+	int n;
+
+	if (!len || size)
+		return 0;
+
+	size += sprintf(&buf[size], "H/ S ACTive      : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+					 readl(dd->port->s_active[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ Command Issue : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+					readl(dd->port->cmd_issue[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ Completed     : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+				readl(dd->port->completed[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->port->mmio + PORT_IRQ_STAT));
+	size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->mmio + HOST_IRQ_STAT));
+	size += sprintf(&buf[size], "\n");
+
+	size += sprintf(&buf[size], "L/ Allocated     : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--) {
+		if (sizeof(long) > sizeof(u32))
+			group_allocated =
+				dd->port->allocated[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->allocated[n];
+		size += sprintf(&buf[size], "%08X ", group_allocated);
+	}
+	size += sprintf(&buf[size], "]\n");
+
+	size += sprintf(&buf[size], "L/ Commands in Q : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--) {
+		if (sizeof(long) > sizeof(u32))
+			group_allocated =
+				dd->port->cmds_to_issue[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->cmds_to_issue[n];
+		size += sprintf(&buf[size], "%08X ", group_allocated);
+	}
+	size += sprintf(&buf[size], "]\n");
+
+	*offset = size <= len ? size : len;
+	size = copy_to_user(ubuf, buf, *offset);
+	if (size)
+		return -EFAULT;
+
+	return *offset;
+}
+
+static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
+				  size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	int size = *offset;
+
+	if (!len || size)
+		return 0;
+
+	size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
+							dd->port->flags);
+	size += sprintf(&buf[size], "Flag-dd   : [ %08lX ]\n",
+							dd->dd_flag);
+
+	*offset = size <= len ? size : len;
+	size = copy_to_user(ubuf, buf, *offset);
+	if (size)
+		return -EFAULT;
+
+	return *offset;
+}
+
+static const struct file_operations mtip_regs_fops = {
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_registers,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations mtip_flags_fops = {
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_flags,
+	.llseek = no_llseek,
+};
+
+/*
+ * Create the sysfs related attributes.
+ *
+ * @dd   Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ *	0	Operation completed successfully.
+ *	-EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
+{
+	if (!kobj || !dd)
+		return -EINVAL;
+
+	if (sysfs_create_file(kobj, &dev_attr_status.attr))
+		dev_warn(&dd->pdev->dev,
+			"Error creating 'status' sysfs entry\n");
+	return 0;
+}
+
+/*
+ * Remove the sysfs related attributes.
+ *
+ * @dd   Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ *	0	Operation completed successfully.
+ *	-EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
+{
+	if (!kobj || !dd)
+		return -EINVAL;
+
+	sysfs_remove_file(kobj, &dev_attr_status.attr);
+
+	return 0;
+}
+
+static int mtip_hw_debugfs_init(struct driver_data *dd)
+{
+	if (!dfs_parent)
+		return -1;
+
+	dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
+	if (IS_ERR_OR_NULL(dd->dfs_node)) {
+		dev_warn(&dd->pdev->dev,
+			"Error creating node %s under debugfs\n",
+						dd->disk->disk_name);
+		dd->dfs_node = NULL;
+		return -1;
+	}
+
+	debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
+							&mtip_flags_fops);
+	debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
+							&mtip_regs_fops);
+
+	return 0;
+}
+
+static void mtip_hw_debugfs_exit(struct driver_data *dd)
+{
+	debugfs_remove_recursive(dd->dfs_node);
+}
+
+
+/*
+ * Perform any init/resume time hardware setup
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+	u32 hwdata;
+	hwdata = readl(dd->mmio + HOST_HSORG);
+
+	/* interrupt bug workaround: use only 1 IS bit.*/
+	writel(hwdata |
+		HSORG_DISABLE_SLOTGRP_INTR |
+		HSORG_DISABLE_SLOTGRP_PXIS,
+		dd->mmio + HOST_HSORG);
+}
+
+/*
+ * Detect the details of the product, and store anything needed
+ * into the driver data structure.  This includes product type and
+ * version and number of slot groups.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+	u32 hwdata;
+	unsigned int rev, slotgroups;
+
+	/*
+	 * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+	 * info register:
+	 * [15:8] hardware/software interface rev#
+	 * [   3] asic-style interface
+	 * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+	 */
+	hwdata = readl(dd->mmio + HOST_HSORG);
+
+	dd->product_type = MTIP_PRODUCT_UNKNOWN;
+	dd->slot_groups = 1;
+
+	if (hwdata & 0x8) {
+		dd->product_type = MTIP_PRODUCT_ASICFPGA;
+		rev = (hwdata & HSORG_HWREV) >> 8;
+		slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1;
+		dev_info(&dd->pdev->dev,
+			"ASIC-FPGA design, HS rev 0x%x, "
+			"%i slot groups [%i slots]\n",
+			 rev,
+			 slotgroups,
+			 slotgroups * 32);
+
+		if (slotgroups > MTIP_MAX_SLOT_GROUPS) {
+			dev_warn(&dd->pdev->dev,
+				"Warning: driver only supports "
+				"%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+			slotgroups = MTIP_MAX_SLOT_GROUPS;
+		}
+		dd->slot_groups = slotgroups;
+		return;
+	}
+
+	dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	0	FTL rebuild completed successfully
+ *	-EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+	unsigned long timeout, cnt = 0, start;
+
+	dev_warn(&dd->pdev->dev,
+		"FTL rebuild in progress. Polling for completion.\n");
+
+	start = jiffies;
+	timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+	do {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			return -EFAULT;
+		if (mtip_check_surprise_removal(dd->pdev))
+			return -EFAULT;
+
+		if (mtip_get_identify(dd->port, NULL) < 0)
+			return -EFAULT;
+
+		if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+			MTIP_FTL_REBUILD_MAGIC) {
+			ssleep(1);
+			/* Print message every 3 minutes */
+			if (cnt++ >= 180) {
+				dev_warn(&dd->pdev->dev,
+				"FTL rebuild in progress (%d secs).\n",
+				jiffies_to_msecs(jiffies - start) / 1000);
+				cnt = 0;
+			}
+		} else {
+			dev_warn(&dd->pdev->dev,
+				"FTL rebuild complete (%d secs).\n",
+			jiffies_to_msecs(jiffies - start) / 1000);
+			mtip_block_initialize(dd);
+			return 0;
+		}
+		ssleep(10);
+	} while (time_before(jiffies, timeout));
+
+	/* Check for timeout */
+	dev_err(&dd->pdev->dev,
+		"Timed out waiting for FTL rebuild to complete (%d secs).\n",
+		jiffies_to_msecs(jiffies - start) / 1000);
+	return -EFAULT;
+}
+
+/*
+ * service thread to issue queued commands
+ *
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+
+static int mtip_service_thread(void *data)
+{
+	struct driver_data *dd = (struct driver_data *)data;
+	unsigned long slot, slot_start, slot_wrap;
+	unsigned int num_cmd_slots = dd->slot_groups * 32;
+	struct mtip_port *port = dd->port;
+
+	while (1) {
+		/*
+		 * the condition is to check neither an internal command is
+		 * is in progress nor error handling is active
+		 */
+		wait_event_interruptible(port->svc_wait, (port->flags) &&
+			!(port->flags & MTIP_PF_PAUSE_IO));
+
+		if (kthread_should_stop())
+			break;
+
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			break;
+
+		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+			slot = 1;
+			/* used to restrict the loop to one iteration */
+			slot_start = num_cmd_slots;
+			slot_wrap = 0;
+			while (1) {
+				slot = find_next_bit(port->cmds_to_issue,
+						num_cmd_slots, slot);
+				if (slot_wrap == 1) {
+					if ((slot_start >= slot) ||
+						(slot >= num_cmd_slots))
+						break;
+				}
+				if (unlikely(slot_start == num_cmd_slots))
+					slot_start = slot;
+
+				if (unlikely(slot == num_cmd_slots)) {
+					slot = 1;
+					slot_wrap = 1;
+					continue;
+				}
+
+				/* Issue the command to the hardware */
+				mtip_issue_ncq_command(port, slot);
+
+				clear_bit(slot, port->cmds_to_issue);
+			}
+
+			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		} else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+			if (!mtip_ftl_rebuild_poll(dd))
+				set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
+							&dd->dd_flag);
+			clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
+		}
+		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Called once for each card.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+	int i;
+	int rv;
+	unsigned int num_command_slots;
+	unsigned long timeout, timetaken;
+	unsigned char *buf;
+	struct smart_attr attr242;
+
+	dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+	mtip_detect_product(dd);
+	if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+		rv = -EIO;
+		goto out1;
+	}
+	num_command_slots = dd->slot_groups * 32;
+
+	hba_setup(dd);
+
+	tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd);
+
+	dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL);
+	if (!dd->port) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: port structure\n");
+		return -ENOMEM;
+	}
+
+	/* Counting semaphore to track command slot usage */
+	sema_init(&dd->port->cmd_slot, num_command_slots - 1);
+
+	/* Spinlock to prevent concurrent issue */
+	spin_lock_init(&dd->port->cmd_issue_lock);
+
+	/* Set the port mmio base address. */
+	dd->port->mmio	= dd->mmio + PORT_OFFSET;
+	dd->port->dd	= dd;
+
+	/* Allocate memory for the command list. */
+	dd->port->command_list =
+		dmam_alloc_coherent(&dd->pdev->dev,
+			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+			&dd->port->command_list_dma,
+			GFP_KERNEL);
+	if (!dd->port->command_list) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: command list\n");
+		rv = -ENOMEM;
+		goto out1;
+	}
+
+	/* Clear the memory we have allocated. */
+	memset(dd->port->command_list,
+		0,
+		HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
+
+	/* Setup the addresse of the RX FIS. */
+	dd->port->rxfis	    = dd->port->command_list + HW_CMD_SLOT_SZ;
+	dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
+
+	/* Setup the address of the command tables. */
+	dd->port->command_table	  = dd->port->rxfis + AHCI_RX_FIS_SZ;
+	dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
+
+	/* Setup the address of the identify data. */
+	dd->port->identify     = dd->port->command_table +
+					HW_CMD_TBL_AR_SZ;
+	dd->port->identify_dma = dd->port->command_tbl_dma +
+					HW_CMD_TBL_AR_SZ;
+
+	/* Setup the address of the sector buffer - for some non-ncq cmds */
+	dd->port->sector_buffer	= (void *) dd->port->identify + ATA_SECT_SIZE;
+	dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
+
+	/* Setup the address of the log buf - for read log command */
+	dd->port->log_buf = (void *)dd->port->sector_buffer  + ATA_SECT_SIZE;
+	dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
+
+	/* Setup the address of the smart buf - for smart read data command */
+	dd->port->smart_buf = (void *)dd->port->log_buf  + ATA_SECT_SIZE;
+	dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
+
+
+	/* Point the command headers at the command tables. */
+	for (i = 0; i < num_command_slots; i++) {
+		dd->port->commands[i].command_header =
+					dd->port->command_list +
+					(sizeof(struct mtip_cmd_hdr) * i);
+		dd->port->commands[i].command_header_dma =
+					dd->port->command_list_dma +
+					(sizeof(struct mtip_cmd_hdr) * i);
+
+		dd->port->commands[i].command =
+			dd->port->command_table + (HW_CMD_TBL_SZ * i);
+		dd->port->commands[i].command_dma =
+			dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
+
+		if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
+			dd->port->commands[i].command_header->ctbau =
+			__force_bit2int cpu_to_le32(
+			(dd->port->commands[i].command_dma >> 16) >> 16);
+		dd->port->commands[i].command_header->ctba =
+			__force_bit2int cpu_to_le32(
+			dd->port->commands[i].command_dma & 0xFFFFFFFF);
+
+		/*
+		 * If this is not done, a bug is reported by the stock
+		 * FC11 i386. Due to the fact that it has lots of kernel
+		 * debugging enabled.
+		 */
+		sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
+
+		/* Mark all commands as currently inactive.*/
+		atomic_set(&dd->port->commands[i].active, 0);
+	}
+
+	/* Setup the pointers to the extended s_active and CI registers. */
+	for (i = 0; i < dd->slot_groups; i++) {
+		dd->port->s_active[i] =
+			dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+		dd->port->cmd_issue[i] =
+			dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+		dd->port->completed[i] =
+			dd->port->mmio + i*0x80 + PORT_SDBV;
+	}
+
+	timetaken = jiffies;
+	timeout = jiffies + msecs_to_jiffies(30000);
+	while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+		 time_before(jiffies, timeout)) {
+		mdelay(100);
+	}
+	if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Surprise removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -ENODEV;
+		goto out2 ;
+	}
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -EFAULT;
+		goto out2;
+	}
+
+	/* Conditionally reset the HBA. */
+	if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+		if (mtip_hba_reset(dd) < 0) {
+			dev_err(&dd->pdev->dev,
+				"Card did not reset within timeout\n");
+			rv = -EIO;
+			goto out2;
+		}
+	} else {
+		/* Clear any pending interrupts on the HBA */
+		writel(readl(dd->mmio + HOST_IRQ_STAT),
+			dd->mmio + HOST_IRQ_STAT);
+	}
+
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Setup the ISR and enable interrupts. */
+	rv = devm_request_irq(&dd->pdev->dev,
+				dd->pdev->irq,
+				mtip_irq_handler,
+				IRQF_SHARED,
+				dev_driver_string(&dd->pdev->dev),
+				dd);
+
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate IRQ %d\n", dd->pdev->irq);
+		goto out2;
+	}
+
+	/* Enable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+					dd->mmio + HOST_CTL);
+
+	init_timer(&dd->port->cmd_timer);
+	init_waitqueue_head(&dd->port->svc_wait);
+
+	dd->port->cmd_timer.data = (unsigned long int) dd->port;
+	dd->port->cmd_timer.function = mtip_timeout_function;
+	mod_timer(&dd->port->cmd_timer,
+		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+		rv = -EFAULT;
+		goto out3;
+	}
+
+	if (mtip_get_identify(dd->port, NULL) < 0) {
+		rv = -EFAULT;
+		goto out3;
+	}
+
+	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+		MTIP_FTL_REBUILD_MAGIC) {
+		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+		return MTIP_FTL_REBUILD_MAGIC;
+	}
+	mtip_dump_identify(dd->port);
+
+	/* check write protect, over temp and rebuild statuses */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xBF) {
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+			/* TODO */
+		}
+	}
+
+	/* get write protect progess */
+	memset(&attr242, 0, sizeof(struct smart_attr));
+	if (mtip_get_smart_attr(dd->port, 242, &attr242))
+		dev_warn(&dd->pdev->dev,
+				"Unable to check write protect progress\n");
+	else
+		dev_info(&dd->pdev->dev,
+				"Write protect progress: %u%% (%u blocks)\n",
+				attr242.cur, le32_to_cpu(attr242.data));
+	return rv;
+
+out3:
+	del_timer_sync(&dd->port->cmd_timer);
+
+	/* Disable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	/*Release the IRQ. */
+	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+out2:
+	mtip_deinit_port(dd->port);
+
+	/* Free the command/command header memory. */
+	dmam_free_coherent(&dd->pdev->dev,
+				HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+				dd->port->command_list,
+				dd->port->command_list_dma);
+out1:
+	/* Free the memory allocated for the for structure. */
+	kfree(dd->port);
+
+	return rv;
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
+
+		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
+			if (mtip_standby_immediate(dd->port))
+				dev_warn(&dd->pdev->dev,
+					"STANDBY IMMEDIATE failed\n");
+
+		/* de-initialize the port. */
+		mtip_deinit_port(dd->port);
+
+		/* Disable interrupts on the HBA. */
+		writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+				dd->mmio + HOST_CTL);
+	}
+
+	del_timer_sync(&dd->port->cmd_timer);
+
+	/* Release the IRQ. */
+	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+	/* Stop the bottom half tasklet. */
+	tasklet_kill(&dd->tasklet);
+
+	/* Free the command/command header memory. */
+	dmam_free_coherent(&dd->pdev->dev,
+			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+			dd->port->command_list,
+			dd->port->command_list_dma);
+	/* Free the memory allocated for the for structure. */
+	kfree(dd->port);
+
+	return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	mtip_standby_immediate(dd->port);
+
+	return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Suspend was successful
+ *	-EFAULT Suspend was not successful
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive
+	 * so that it saves its state.
+	 */
+	if (mtip_standby_immediate(dd->port) != 0) {
+		dev_err(&dd->pdev->dev,
+			"Failed standby-immediate command\n");
+		return -EFAULT;
+	}
+
+	/* Disable interrupts on the HBA.*/
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+	mtip_deinit_port(dd->port);
+
+	return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Resume was successful
+ *      -EFAULT Resume was not successful
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+	/* Perform any needed hardware setup steps */
+	hba_setup(dd);
+
+	/* Reset the HBA */
+	if (mtip_hba_reset(dd) != 0) {
+		dev_err(&dd->pdev->dev,
+			"Unable to reset the HBA\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * Enable the port, DMA engine, and FIS reception specific
+	 * h/w in controller.
+	 */
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Enable interrupts on the HBA.*/
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	return 0;
+}
+
+/*
+ * Helper function for reusing disk name
+ * upon hot insertion.
+ */
+static int rssd_disk_name_format(char *prefix,
+				 int index,
+				 char *buf,
+				 int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported or invalid driver data
+ *                 structure pointer.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported or invalid driver data
+ *                 structure pointer.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	case HDIO_DRIVE_TASKFILE: {
+		struct mtip_compat_ide_task_request_s __user *compat_req_task;
+		ide_task_request_t req_task;
+		int compat_tasksize, outtotal, ret;
+
+		compat_tasksize =
+			sizeof(struct mtip_compat_ide_task_request_s);
+
+		compat_req_task =
+			(struct mtip_compat_ide_task_request_s __user *) arg;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+			compat_tasksize - (2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (get_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (get_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+				compat_tasksize -
+				(2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (put_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (put_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		return ret;
+	}
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example still used CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affects performance.
+ *
+ * @dev Pointer to the block_device strucutre.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ *	0       Operation completed successfully.
+ *	-ENOTTY An error occurred while reading the drive capacity.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+				struct hd_geometry *geo)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+	sector_t capacity;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not get drive capacity.\n");
+		return -ENOTTY;
+	}
+
+	geo->heads = 224;
+	geo->sectors = 56;
+	sector_div(capacity, (geo->heads * geo->sectors));
+	geo->cylinders = capacity;
+	return 0;
+}
+
+/*
+ * Block device operation function.
+ *
+ * This structure contains pointers to the functions required by the block
+ * layer.
+ */
+static const struct block_device_operations mtip_block_ops = {
+	.ioctl		= mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= mtip_block_compat_ioctl,
+#endif
+	.getgeo		= mtip_block_getgeo,
+	.owner		= THIS_MODULE
+};
+
+/*
+ * Block layer make request function.
+ *
+ * This function is called by the kernel to process a BIO for
+ * the P320 device.
+ *
+ * @queue Pointer to the request queue. Unused other than to obtain
+ *              the driver data structure.
+ * @bio   Pointer to the BIO.
+ *
+ */
+static void mtip_make_request(struct request_queue *queue, struct bio *bio)
+{
+	struct driver_data *dd = queue->queuedata;
+	struct scatterlist *sg;
+	struct bio_vec *bvec;
+	int nents = 0;
+	int tag = 0;
+
+	if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag))) {
+			bio_endio(bio, -ENXIO);
+			return;
+		}
+		if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
+			bio_endio(bio, -ENODATA);
+			return;
+		}
+		if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
+							&dd->dd_flag) &&
+				bio_data_dir(bio))) {
+			bio_endio(bio, -ENODATA);
+			return;
+		}
+		if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) {
+			bio_endio(bio, -ENODATA);
+			return;
+		}
+	}
+
+	if (unlikely(!bio_has_data(bio))) {
+		blk_queue_flush(queue, 0);
+		bio_endio(bio, 0);
+		return;
+	}
+
+	sg = mtip_hw_get_scatterlist(dd, &tag);
+	if (likely(sg != NULL)) {
+		blk_queue_bounce(queue, &bio);
+
+		if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) {
+			dev_warn(&dd->pdev->dev,
+				"Maximum number of SGL entries exceeded\n");
+			bio_io_error(bio);
+			mtip_hw_release_scatterlist(dd, tag);
+			return;
+		}
+
+		/* Create the scatter list for this bio. */
+		bio_for_each_segment(bvec, bio, nents) {
+			sg_set_page(&sg[nents],
+					bvec->bv_page,
+					bvec->bv_len,
+					bvec->bv_offset);
+		}
+
+		/* Issue the read/write. */
+		mtip_hw_submit_io(dd,
+				bio->bi_sector,
+				bio_sectors(bio),
+				nents,
+				tag,
+				bio_endio,
+				bio,
+				bio_data_dir(bio));
+	} else
+		bio_io_error(bio);
+}
+
+/*
+ * Block layer initialization function.
+ *
+ * This function is called once by the PCI layer for each P320
+ * device that is connected to the system.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+	int rv = 0, wait_for_rebuild = 0;
+	sector_t capacity;
+	unsigned int index = 0;
+	struct kobject *kobj;
+	unsigned char thd_name[16];
+
+	if (dd->disk)
+		goto skip_create_disk; /* hw init done, before rebuild */
+
+	/* Initialize the protocol layer. */
+	wait_for_rebuild = mtip_hw_init(dd);
+	if (wait_for_rebuild < 0) {
+		dev_err(&dd->pdev->dev,
+			"Protocol layer initialization failed\n");
+		rv = -EINVAL;
+		goto protocol_init_error;
+	}
+
+	dd->disk = alloc_disk(MTIP_MAX_MINORS);
+	if (dd->disk  == NULL) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate gendisk structure\n");
+		rv = -EINVAL;
+		goto alloc_disk_error;
+	}
+
+	/* Generate the disk name, implemented same as in sd.c */
+	do {
+		if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL))
+			goto ida_get_error;
+
+		spin_lock(&rssd_index_lock);
+		rv = ida_get_new(&rssd_index_ida, &index);
+		spin_unlock(&rssd_index_lock);
+	} while (rv == -EAGAIN);
+
+	if (rv)
+		goto ida_get_error;
+
+	rv = rssd_disk_name_format("rssd",
+				index,
+				dd->disk->disk_name,
+				DISK_NAME_LEN);
+	if (rv)
+		goto disk_index_error;
+
+	dd->disk->driverfs_dev	= &dd->pdev->dev;
+	dd->disk->major		= dd->major;
+	dd->disk->first_minor	= dd->instance * MTIP_MAX_MINORS;
+	dd->disk->fops		= &mtip_block_ops;
+	dd->disk->private_data	= dd;
+	dd->index		= index;
+
+	/*
+	 * if rebuild pending, start the service thread, and delay the block
+	 * queue creation and add_disk()
+	 */
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		goto start_service_thread;
+
+skip_create_disk:
+	/* Allocate the request queue. */
+	dd->queue = blk_alloc_queue(GFP_KERNEL);
+	if (dd->queue == NULL) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		rv = -ENOMEM;
+		goto block_queue_alloc_init_error;
+	}
+
+	/* Attach our request function to the request queue. */
+	blk_queue_make_request(dd->queue, mtip_make_request);
+
+	dd->disk->queue		= dd->queue;
+	dd->queue->queuedata	= dd;
+
+	/* Set device limits. */
+	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+	blk_queue_physical_block_size(dd->queue, 4096);
+	blk_queue_max_hw_sectors(dd->queue, 0xffff);
+	blk_queue_max_segment_size(dd->queue, 0x400000);
+	blk_queue_io_min(dd->queue, 4096);
+
+	/*
+	 * write back cache is not supported in the device. FUA depends on
+	 * write back cache support, hence setting flush support to zero.
+	 */
+	blk_queue_flush(dd->queue, 0);
+
+	/* Set the capacity of the device in 512 byte sectors. */
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not read drive capacity\n");
+		rv = -EIO;
+		goto read_capacity_error;
+	}
+	set_capacity(dd->disk, capacity);
+
+	/* Enable the block device and add it to /dev */
+	add_disk(dd->disk);
+
+	/*
+	 * Now that the disk is active, initialize any sysfs attributes
+	 * managed by the protocol layer.
+	 */
+	kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+	if (kobj) {
+		mtip_hw_sysfs_init(dd, kobj);
+		kobject_put(kobj);
+	}
+	mtip_hw_debugfs_init(dd);
+
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+		return rv; /* service thread created for handling rebuild */
+	}
+
+start_service_thread:
+	sprintf(thd_name, "mtip_svc_thd_%02d", index);
+
+	dd->mtip_svc_handler = kthread_run(mtip_service_thread,
+						dd, thd_name);
+
+	if (IS_ERR(dd->mtip_svc_handler)) {
+		dev_err(&dd->pdev->dev, "service thread failed to start\n");
+		dd->mtip_svc_handler = NULL;
+		rv = -EFAULT;
+		goto kthread_run_error;
+	}
+
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		rv = wait_for_rebuild;
+
+	return rv;
+
+kthread_run_error:
+	mtip_hw_debugfs_exit(dd);
+
+	/* Delete our gendisk. This also removes the device from /dev */
+	del_gendisk(dd->disk);
+
+read_capacity_error:
+	blk_cleanup_queue(dd->queue);
+
+block_queue_alloc_init_error:
+disk_index_error:
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, index);
+	spin_unlock(&rssd_index_lock);
+
+ida_get_error:
+	put_disk(dd->disk);
+
+alloc_disk_error:
+	mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+
+protocol_init_error:
+	return rv;
+}
+
+/*
+ * Block layer deinitialization function.
+ *
+ * Called by the PCI layer as each P320 device is removed.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_remove(struct driver_data *dd)
+{
+	struct kobject *kobj;
+
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+		wake_up_interruptible(&dd->port->svc_wait);
+		kthread_stop(dd->mtip_svc_handler);
+	}
+
+	/* Clean up the sysfs attributes, if created */
+	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+		kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+		if (kobj) {
+			mtip_hw_sysfs_exit(dd, kobj);
+			kobject_put(kobj);
+		}
+	}
+	mtip_hw_debugfs_exit(dd);
+
+	/*
+	 * Delete our gendisk structure. This also removes the device
+	 * from /dev
+	 */
+	del_gendisk(dd->disk);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+
+	blk_cleanup_queue(dd->queue);
+	dd->disk  = NULL;
+	dd->queue = NULL;
+
+	/* De-initialize the protocol layer. */
+	mtip_hw_exit(dd);
+
+	return 0;
+}
+
+/*
+ * Function called by the PCI layer when just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev,
+		"Shutting down %s ...\n", dd->disk->disk_name);
+
+	/* Delete our gendisk structure, and cleanup the blk queue. */
+	del_gendisk(dd->disk);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+
+	blk_cleanup_queue(dd->queue);
+	dd->disk  = NULL;
+	dd->queue = NULL;
+
+	mtip_hw_shutdown(dd);
+	return 0;
+}
+
+static int mtip_block_suspend(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev,
+		"Suspending %s ...\n", dd->disk->disk_name);
+	mtip_hw_suspend(dd);
+	return 0;
+}
+
+static int mtip_block_resume(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev, "Resuming %s ...\n",
+		dd->disk->disk_name);
+	mtip_hw_resume(dd);
+	return 0;
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+			const struct pci_device_id *ent)
+{
+	int rv = 0;
+	struct driver_data *dd = NULL;
+
+	/* Allocate memory for this devices private data. */
+	dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL);
+	if (dd == NULL) {
+		dev_err(&pdev->dev,
+			"Unable to allocate memory for driver data\n");
+		return -ENOMEM;
+	}
+
+	/* Attach the private data to this PCI device.  */
+	pci_set_drvdata(pdev, dd);
+
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to enable device\n");
+		goto iomap_err;
+	}
+
+	/* Map BAR5 to memory. */
+	rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to map regions\n");
+		goto iomap_err;
+	}
+
+	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+
+		if (rv) {
+			rv = pci_set_consistent_dma_mask(pdev,
+						DMA_BIT_MASK(32));
+			if (rv) {
+				dev_warn(&pdev->dev,
+					"64-bit DMA enable failed\n");
+				goto setmask_err;
+			}
+		}
+	}
+
+	pci_set_master(pdev);
+
+	if (pci_enable_msi(pdev)) {
+		dev_warn(&pdev->dev,
+			"Unable to enable MSI interrupt.\n");
+		goto block_initialize_err;
+	}
+
+	/* Copy the info we may need later into the private data structure. */
+	dd->major	= mtip_major;
+	dd->instance	= instance;
+	dd->pdev	= pdev;
+
+	/* Initialize the block layer. */
+	rv = mtip_block_initialize(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Unable to initialize block layer\n");
+		goto block_initialize_err;
+	}
+
+	/*
+	 * Increment the instance count so that each device has a unique
+	 * instance number.
+	 */
+	instance++;
+	if (rv != MTIP_FTL_REBUILD_MAGIC)
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+	goto done;
+
+block_initialize_err:
+	pci_disable_msi(pdev);
+
+setmask_err:
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+	kfree(dd);
+	pci_set_drvdata(pdev, NULL);
+	return rv;
+done:
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ *	None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	int counter = 0;
+
+	set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
+	if (mtip_check_surprise_removal(pdev)) {
+		while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
+			counter++;
+			msleep(20);
+			if (counter == 10) {
+				/* Cleanup the outstanding commands */
+				mtip_command_cleanup(dd);
+				break;
+			}
+		}
+	}
+
+	/* Clean up the block layer. */
+	mtip_block_remove(dd);
+
+	pci_disable_msi(pdev);
+
+	kfree(dd);
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ *	0  Success
+ *	<0 Error
+ */
+static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
+{
+	int rv = 0;
+	struct driver_data *dd = pci_get_drvdata(pdev);
+
+	if (!dd) {
+		dev_err(&pdev->dev,
+			"Driver private datastructure is NULL\n");
+		return -EFAULT;
+	}
+
+	set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+	/* Disable ports & interrupts then send standby immediate */
+	rv = mtip_block_suspend(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Failed to suspend controller\n");
+		return rv;
+	}
+
+	/*
+	 * Save the pci config space to pdev structure &
+	 * disable the device
+	 */
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+
+	/* Move to Low power state*/
+	pci_set_power_state(pdev, PCI_D3hot);
+
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ *      0  Success
+ *      <0 Error
+ */
+static int mtip_pci_resume(struct pci_dev *pdev)
+{
+	int rv = 0;
+	struct driver_data *dd;
+
+	dd = pci_get_drvdata(pdev);
+	if (!dd) {
+		dev_err(&pdev->dev,
+			"Driver private datastructure is NULL\n");
+		return -EFAULT;
+	}
+
+	/* Move the device to active State */
+	pci_set_power_state(pdev, PCI_D0);
+
+	/* Restore PCI configuration space */
+	pci_restore_state(pdev);
+
+	/* Enable the PCI device*/
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Failed to enable card during resume\n");
+		goto err;
+	}
+	pci_set_master(pdev);
+
+	/*
+	 * Calls hbaReset, initPort, & startPort function
+	 * then enables interrupts
+	 */
+	rv = mtip_block_resume(dd);
+	if (rv < 0)
+		dev_err(&pdev->dev, "Unable to resume\n");
+
+err:
+	clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+	return rv;
+}
+
+/*
+ * Shutdown routine
+ *
+ * return value
+ *      None
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	if (dd)
+		mtip_block_shutdown(dd);
+}
+
+/* Table of device ids supported by this driver. */
+static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) },
+	{ 0 }
+};
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+	.name			= MTIP_DRV_NAME,
+	.id_table		= mtip_pci_tbl,
+	.probe			= mtip_pci_probe,
+	.remove			= mtip_pci_remove,
+	.suspend		= mtip_pci_suspend,
+	.resume			= mtip_pci_resume,
+	.shutdown		= mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ *      0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+	int error;
+
+	pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+	/* Allocate a major block device number to use with this driver. */
+	error = register_blkdev(0, MTIP_DRV_NAME);
+	if (error <= 0) {
+		pr_err("Unable to register block device (%d)\n",
+		error);
+		return -EBUSY;
+	}
+	mtip_major = error;
+
+	if (!dfs_parent) {
+		dfs_parent = debugfs_create_dir("rssd", NULL);
+		if (IS_ERR_OR_NULL(dfs_parent)) {
+			pr_warn("Error creating debugfs parent\n");
+			dfs_parent = NULL;
+		}
+	}
+
+	/* Register our PCI operations. */
+	error = pci_register_driver(&mtip_pci_driver);
+	if (error) {
+		debugfs_remove(dfs_parent);
+		unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+	}
+
+	return error;
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ *      none
+ */
+static void __exit mtip_exit(void)
+{
+	debugfs_remove_recursive(dfs_parent);
+
+	/* Release the allocated major block device number. */
+	unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+	/* Unregister the PCI driver. */
+	pci_unregister_driver(&mtip_pci_driver);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 000000000000..18627a1d04c5
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,463 @@
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/genhd.h>
+
+/* Offset of Subsystem Device ID in pci confoguration space */
+#define PCI_SUBSYSTEM_DEVICEID	0x2E
+
+/* offset of Device Control register in PCIe extended capabilites space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET	0x48
+
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES	2
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_COMMAND_TIMEOUT_MS       5000
+#define MTIP_IOCTL_COMMAND_TIMEOUT_MS     5000
+#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS  5000
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD	500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET		142
+#define MTIP_FTL_REBUILD_MAGIC		0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS	2400000
+
+/* Macro to extract the tag bit number from a tag value. */
+#define MTIP_TAG_BIT(tag)	(tag & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag)	(tag >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG		128
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS	8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL	0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON    0x1344
+#define P320H_DEVICE_ID		0x5150
+#define P320M_DEVICE_ID		0x5151
+#define P320S_DEVICE_ID		0x5152
+#define P325M_DEVICE_ID		0x5153
+#define P420H_DEVICE_ID		0x5160
+#define P420M_DEVICE_ID		0x5161
+#define P425M_DEVICE_ID		0x5163
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME		"mtip32xx"
+#define MTIP_DRV_VERSION	"1.2.6os3"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS		16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS	(MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG	(sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+					(U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR		5
+
+#ifdef DEBUG
+ #define dbg_printk(format, arg...)	\
+	printk(pr_fmt(format), ##arg);
+#else
+ #define dbg_printk(format, arg...)
+#endif
+
+#define MTIP_DFS_MAX_BUF_SIZE 1024
+
+#define __force_bit2int (unsigned int __force)
+
+enum {
+	/* below are bit numbers in 'flags' defined in mtip_port */
+	MTIP_PF_IC_ACTIVE_BIT       = 0, /* pio/ioctl */
+	MTIP_PF_EH_ACTIVE_BIT       = 1, /* error handling */
+	MTIP_PF_SE_ACTIVE_BIT       = 2, /* secure erase */
+	MTIP_PF_DM_ACTIVE_BIT       = 3, /* download microcde */
+	MTIP_PF_PAUSE_IO      =	((1 << MTIP_PF_IC_ACTIVE_BIT) | \
+				(1 << MTIP_PF_EH_ACTIVE_BIT) | \
+				(1 << MTIP_PF_SE_ACTIVE_BIT) | \
+				(1 << MTIP_PF_DM_ACTIVE_BIT)),
+
+	MTIP_PF_SVC_THD_ACTIVE_BIT  = 4,
+	MTIP_PF_ISSUE_CMDS_BIT      = 5,
+	MTIP_PF_REBUILD_BIT         = 6,
+	MTIP_PF_SVC_THD_STOP_BIT    = 8,
+
+	/* below are bit numbers in 'dd_flag' defined in driver_data */
+	MTIP_DDF_SEC_LOCK_BIT	    = 0,
+	MTIP_DDF_REMOVE_PENDING_BIT = 1,
+	MTIP_DDF_OVER_TEMP_BIT      = 2,
+	MTIP_DDF_WRITE_PROTECT_BIT  = 3,
+	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \
+				(1 << MTIP_DDF_SEC_LOCK_BIT) | \
+				(1 << MTIP_DDF_OVER_TEMP_BIT) | \
+				(1 << MTIP_DDF_WRITE_PROTECT_BIT)),
+
+	MTIP_DDF_CLEANUP_BIT        = 5,
+	MTIP_DDF_RESUME_BIT         = 6,
+	MTIP_DDF_INIT_DONE_BIT      = 7,
+	MTIP_DDF_REBUILD_FAILED_BIT = 8,
+};
+
+__packed struct smart_attr{
+	u8 attr_id;
+	u16 flags;
+	u8 cur;
+	u8 worst;
+	u32 data;
+	u8 res[3];
+};
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+	/*
+	 * FIS type.
+	 * - 27h Register FIS, host to device.
+	 * - 34h Register FIS, device to host.
+	 * - 39h DMA Activate FIS, device to host.
+	 * - 41h DMA Setup FIS, bi-directional.
+	 * - 46h Data FIS, bi-directional.
+	 * - 58h BIST Activate FIS, bi-directional.
+	 * - 5Fh PIO Setup FIS, device to host.
+	 * - A1h Set Device Bits FIS, device to host.
+	 */
+	unsigned char type;
+	unsigned char opts;
+	unsigned char command;
+	unsigned char features;
+
+	union {
+		unsigned char lba_low;
+		unsigned char sector;
+	};
+	union {
+		unsigned char lba_mid;
+		unsigned char cyl_low;
+	};
+	union {
+		unsigned char lba_hi;
+		unsigned char cyl_hi;
+	};
+	union {
+		unsigned char device;
+		unsigned char head;
+	};
+
+	union {
+		unsigned char lba_low_ex;
+		unsigned char sector_ex;
+	};
+	union {
+		unsigned char lba_mid_ex;
+		unsigned char cyl_low_ex;
+	};
+	union {
+		unsigned char lba_hi_ex;
+		unsigned char cyl_hi_ex;
+	};
+	unsigned char features_ex;
+
+	unsigned char sect_count;
+	unsigned char sect_cnt_ex;
+	unsigned char res2;
+	unsigned char control;
+
+	unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+	/*
+	 * Command options.
+	 * - Bits 31:16 Number of PRD entries.
+	 * - Bits 15:8 Unused in this implementation.
+	 * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+	 * - Bit 6 Write bit, should be set when writing data to the device.
+	 * - Bit 5 Unused in this implementation.
+	 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+	 */
+	unsigned int opts;
+	/* This field is unsed when using NCQ. */
+	union {
+		unsigned int byte_count;
+		unsigned int status;
+	};
+	/*
+	 * Lower 32 bits of the command table address associated with this
+	 * header. The command table addresses must be 128 byte aligned.
+	 */
+	unsigned int ctba;
+	/*
+	 * If 64 bit addressing is used this field is the upper 32 bits
+	 * of the command table address associated with this command.
+	 */
+	unsigned int ctbau;
+	/* Reserved and unused. */
+	unsigned int res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+	/*
+	 * Low 32 bits of the data buffer address. For P320 this
+	 * address must be 8 byte aligned signified by bits 2:0 being
+	 * set to 0.
+	 */
+	unsigned int dba;
+	/*
+	 * When 64 bit addressing is used this field is the upper
+	 * 32 bits of the data buffer address.
+	 */
+	unsigned int dba_upper;
+	/* Unused. */
+	unsigned int reserved;
+	/*
+	 * Bit 31: interrupt when this data block has been transferred.
+	 * Bits 30..22: reserved
+	 * Bits 21..0: byte count (minus 1).  For P320 the byte count must be
+	 * 8 byte aligned signified by bits 2:0 being set to 1.
+	 */
+	unsigned int info;
+};
+struct mtip_port;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+
+	struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
+
+	dma_addr_t command_header_dma; /* corresponding physical address */
+
+	void *command; /* ptr to command table entry */
+
+	dma_addr_t command_dma; /* corresponding physical address */
+
+	void *comp_data; /* data passed to completion function comp_func() */
+	/*
+	 * Completion function called by the ISR upon completion of
+	 * a command.
+	 */
+	void (*comp_func)(struct mtip_port *port,
+				int tag,
+				void *data,
+				int status);
+	/* Additional callback function that may be called by comp_func() */
+	void (*async_callback)(void *data, int status);
+
+	void *async_data; /* Addl. data passed to async_callback() */
+
+	int scatter_ents; /* Number of scatter list entries used */
+
+	struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+
+	int retries; /* The number of retries left for this command. */
+
+	int direction; /* Data transfer direction */
+
+	unsigned long comp_time; /* command completion time, in jiffies */
+
+	atomic_t active; /* declares if this command sent to the drive. */
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+	/* Pointer back to the driver data for this port. */
+	struct driver_data *dd;
+	/*
+	 * Used to determine if the data pointed to by the
+	 * identify field is valid.
+	 */
+	unsigned long identify_valid;
+	/* Base address of the memory mapped IO for the port. */
+	void __iomem *mmio;
+	/* Array of pointers to the memory mapped s_active registers. */
+	void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped completed registers. */
+	void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped Command Issue registers. */
+	void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the driver.
+	 */
+	void *command_list;
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t command_list_dma;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the driver.
+	 */
+	void *rxfis;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t rxfis_dma;
+	/*
+	 * Pointer to the beginning of the command table memory as used
+	 * by the driver.
+	 */
+	void *command_table;
+	/*
+	 * Pointer to the beginning of the command table memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t command_tbl_dma;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the driver.
+	 */
+	u16 *identify;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t identify_dma;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the driver when issuing internal commands.
+	 */
+	u16 *sector_buffer;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the DMA when the driver issues internal commands.
+	 */
+	dma_addr_t sector_buffer_dma;
+	/*
+	 * Bit significant, used to determine if a command slot has
+	 * been allocated. i.e. the slot is in use.  Bits are cleared
+	 * when the command slot and all associated data structures
+	 * are no longer needed.
+	 */
+	u16 *log_buf;
+	dma_addr_t log_buf_dma;
+
+	u8 *smart_buf;
+	dma_addr_t smart_buf_dma;
+
+	unsigned long allocated[SLOTBITS_IN_LONGS];
+	/*
+	 * used to queue commands when an internal command is in progress
+	 * or error handling is active
+	 */
+	unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+	/*
+	 * Array of command slots. Structure includes pointers to the
+	 * command header and command table, and completion function and data
+	 * pointers.
+	 */
+	struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
+	/* Used by mtip_service_thread to wait for an event */
+	wait_queue_head_t svc_wait;
+	/*
+	 * indicates the state of the port. Also, helps the service thread
+	 * to determine its action on wake up.
+	 */
+	unsigned long flags;
+	/*
+	 * Timer used to complete commands that have been active for too long.
+	 */
+	struct timer_list cmd_timer;
+	unsigned long ic_pause_timer;
+	/*
+	 * Semaphore used to block threads if there are no
+	 * command slots available.
+	 */
+	struct semaphore cmd_slot;
+	/* Spinlock for working around command-issue bug. */
+	spinlock_t cmd_issue_lock;
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+	void __iomem *mmio; /* Base address of the HBA registers. */
+
+	int major; /* Major device number. */
+
+	int instance; /* Instance number. First device probed is 0, ... */
+
+	struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+	struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+	struct request_queue *queue; /* Our request queue. */
+
+	struct mtip_port *port; /* Pointer to the port data structure. */
+
+	/* Tasklet used to process the bottom half of the ISR. */
+	struct tasklet_struct tasklet;
+
+	unsigned product_type; /* magic value declaring the product type */
+
+	unsigned slot_groups; /* number of slot groups the product supports */
+
+	unsigned long index; /* Index to determine the disk name */
+
+	unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
+
+	struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+
+	struct dentry *dfs_node;
+};
+
+#endif
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f533f3375e24..0c03411c59eb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -34,12 +34,11 @@
 #include <linux/kthread.h>
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <asm/types.h>
 
 #include <linux/nbd.h>
 
-#define LO_MAGIC 0x68797548
+#define NBD_MAGIC 0x68797548
 
 #ifdef NDEBUG
 #define dprintk(flags, fmt...)
@@ -116,7 +115,7 @@ static void nbd_end_request(struct request *req)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void sock_shutdown(struct nbd_device *lo, int lock)
+static void sock_shutdown(struct nbd_device *nbd, int lock)
 {
 	/* Forcibly shutdown the socket causing all listeners
 	 * to error
@@ -125,15 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock)
 	 * there should be a more generic interface rather than
 	 * calling socket ops directly here */
 	if (lock)
-		mutex_lock(&lo->tx_lock);
-	if (lo->sock) {
-		printk(KERN_WARNING "%s: shutting down socket\n",
-			lo->disk->disk_name);
-		kernel_sock_shutdown(lo->sock, SHUT_RDWR);
-		lo->sock = NULL;
+		mutex_lock(&nbd->tx_lock);
+	if (nbd->sock) {
+		dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+		nbd->sock = NULL;
 	}
 	if (lock)
-		mutex_unlock(&lo->tx_lock);
+		mutex_unlock(&nbd->tx_lock);
 }
 
 static void nbd_xmit_timeout(unsigned long arg)
@@ -148,18 +146,20 @@ static void nbd_xmit_timeout(unsigned long arg)
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 		int msg_flags)
 {
-	struct socket *sock = lo->sock;
+	struct socket *sock = nbd->sock;
 	int result;
 	struct msghdr msg;
 	struct kvec iov;
 	sigset_t blocked, oldset;
+	unsigned long pflags = current->flags;
 
 	if (unlikely(!sock)) {
-		printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n",
-		       lo->disk->disk_name, (send ? "send" : "recv"));
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted %s on closed socket in sock_xmit\n",
+			(send ? "send" : "recv"));
 		return -EINVAL;
 	}
 
@@ -168,8 +168,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	siginitsetinv(&blocked, sigmask(SIGKILL));
 	sigprocmask(SIG_SETMASK, &blocked, &oldset);
 
+	current->flags |= PF_MEMALLOC;
 	do {
-		sock->sk->sk_allocation = GFP_NOIO;
+		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 		iov.iov_base = buf;
 		iov.iov_len = size;
 		msg.msg_name = NULL;
@@ -181,15 +182,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 		if (send) {
 			struct timer_list ti;
 
-			if (lo->xmit_timeout) {
+			if (nbd->xmit_timeout) {
 				init_timer(&ti);
 				ti.function = nbd_xmit_timeout;
 				ti.data = (unsigned long)current;
-				ti.expires = jiffies + lo->xmit_timeout;
+				ti.expires = jiffies + nbd->xmit_timeout;
 				add_timer(&ti);
 			}
 			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
-			if (lo->xmit_timeout)
+			if (nbd->xmit_timeout)
 				del_timer_sync(&ti);
 		} else
 			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
@@ -201,7 +202,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 				task_pid_nr(current), current->comm,
 				dequeue_signal_lock(current, &current->blocked, &info));
 			result = -EINTR;
-			sock_shutdown(lo, !send);
+			sock_shutdown(nbd, !send);
 			break;
 		}
 
@@ -215,22 +216,24 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	} while (size > 0);
 
 	sigprocmask(SIG_SETMASK, &oldset, NULL);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 
 	return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec,
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
 		int flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags);
+	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+			   bvec->bv_len, flags);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *lo, struct request *req)
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
@@ -243,15 +246,15 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 	memcpy(request.handle, &req, sizeof(req));
 
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
-			lo->disk->disk_name, req,
+			nbd->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
 			(unsigned long long)blk_rq_pos(req) << 9,
 			blk_rq_bytes(req));
-	result = sock_xmit(lo, 1, &request, sizeof(request),
+	result = sock_xmit(nbd, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
-		printk(KERN_ERR "%s: Send control failed (result %d)\n",
-				lo->disk->disk_name, result);
+		dev_err(disk_to_dev(nbd->disk),
+			"Send control failed (result %d)\n", result);
 		goto error_out;
 	}
 
@@ -267,11 +270,12 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 			if (!rq_iter_last(req, iter))
 				flags = MSG_MORE;
 			dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
-					lo->disk->disk_name, req, bvec->bv_len);
-			result = sock_send_bvec(lo, bvec, flags);
+					nbd->disk->disk_name, req, bvec->bv_len);
+			result = sock_send_bvec(nbd, bvec, flags);
 			if (result <= 0) {
-				printk(KERN_ERR "%s: Send data failed (result %d)\n",
-						lo->disk->disk_name, result);
+				dev_err(disk_to_dev(nbd->disk),
+					"Send data failed (result %d)\n",
+					result);
 				goto error_out;
 			}
 		}
@@ -282,25 +286,25 @@ error_out:
 	return -EIO;
 }
 
-static struct request *nbd_find_request(struct nbd_device *lo,
+static struct request *nbd_find_request(struct nbd_device *nbd,
 					struct request *xreq)
 {
 	struct request *req, *tmp;
 	int err;
 
-	err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
 	if (unlikely(err))
 		goto out;
 
-	spin_lock(&lo->queue_lock);
-	list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) {
+	spin_lock(&nbd->queue_lock);
+	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
 		if (req != xreq)
 			continue;
 		list_del_init(&req->queuelist);
-		spin_unlock(&lo->queue_lock);
+		spin_unlock(&nbd->queue_lock);
 		return req;
 	}
-	spin_unlock(&lo->queue_lock);
+	spin_unlock(&nbd->queue_lock);
 
 	err = -ENOENT;
 
@@ -308,79 +312,78 @@ out:
 	return ERR_PTR(err);
 }
 
-static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
 			MSG_WAITALL);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *lo)
+static struct request *nbd_read_stat(struct nbd_device *nbd)
 {
 	int result;
 	struct nbd_reply reply;
 	struct request *req;
 
 	reply.magic = 0;
-	result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL);
+	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
 	if (result <= 0) {
-		printk(KERN_ERR "%s: Receive control failed (result %d)\n",
-				lo->disk->disk_name, result);
+		dev_err(disk_to_dev(nbd->disk),
+			"Receive control failed (result %d)\n", result);
 		goto harderror;
 	}
 
 	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
-		printk(KERN_ERR "%s: Wrong magic (0x%lx)\n",
-				lo->disk->disk_name,
+		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 				(unsigned long)ntohl(reply.magic));
 		result = -EPROTO;
 		goto harderror;
 	}
 
-	req = nbd_find_request(lo, *(struct request **)reply.handle);
+	req = nbd_find_request(nbd, *(struct request **)reply.handle);
 	if (IS_ERR(req)) {
 		result = PTR_ERR(req);
 		if (result != -ENOENT)
 			goto harderror;
 
-		printk(KERN_ERR "%s: Unexpected reply (%p)\n",
-				lo->disk->disk_name, reply.handle);
+		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
+			reply.handle);
 		result = -EBADR;
 		goto harderror;
 	}
 
 	if (ntohl(reply.error)) {
-		printk(KERN_ERR "%s: Other side returned error (%d)\n",
-				lo->disk->disk_name, ntohl(reply.error));
+		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
+			ntohl(reply.error));
 		req->errors++;
 		return req;
 	}
 
 	dprintk(DBG_RX, "%s: request %p: got reply\n",
-			lo->disk->disk_name, req);
+			nbd->disk->disk_name, req);
 	if (nbd_cmd(req) == NBD_CMD_READ) {
 		struct req_iterator iter;
 		struct bio_vec *bvec;
 
 		rq_for_each_segment(bvec, req, iter) {
-			result = sock_recv_bvec(lo, bvec);
+			result = sock_recv_bvec(nbd, bvec);
 			if (result <= 0) {
-				printk(KERN_ERR "%s: Receive data failed (result %d)\n",
-						lo->disk->disk_name, result);
+				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
+					result);
 				req->errors++;
 				return req;
 			}
 			dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
-				lo->disk->disk_name, req, bvec->bv_len);
+				nbd->disk->disk_name, req, bvec->bv_len);
 		}
 	}
 	return req;
 harderror:
-	lo->harderror = result;
+	nbd->harderror = result;
 	return NULL;
 }
 
@@ -398,48 +401,57 @@ static struct device_attribute pid_attr = {
 	.show = pid_show,
 };
 
-static int nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *nbd)
 {
 	struct request *req;
 	int ret;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	lo->pid = current->pid;
-	ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
+	sk_set_memalloc(nbd->sock->sk);
+	nbd->pid = task_pid_nr(current);
+	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 	if (ret) {
-		printk(KERN_ERR "nbd: sysfs_create_file failed!");
-		lo->pid = 0;
+		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+		nbd->pid = 0;
 		return ret;
 	}
 
-	while ((req = nbd_read_stat(lo)) != NULL)
+	while ((req = nbd_read_stat(nbd)) != NULL)
 		nbd_end_request(req);
 
-	sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
-	lo->pid = 0;
+	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+	nbd->pid = 0;
 	return 0;
 }
 
-static void nbd_clear_que(struct nbd_device *lo)
+static void nbd_clear_que(struct nbd_device *nbd)
 {
 	struct request *req;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/*
-	 * Because we have set lo->sock to NULL under the tx_lock, all
+	 * Because we have set nbd->sock to NULL under the tx_lock, all
 	 * modifications to the list must have completed by now.  For
 	 * the same reason, the active_req must be NULL.
 	 *
 	 * As a consequence, we don't need to take the spin lock while
 	 * purging the list here.
 	 */
-	BUG_ON(lo->sock);
-	BUG_ON(lo->active_req);
+	BUG_ON(nbd->sock);
+	BUG_ON(nbd->active_req);
 
-	while (!list_empty(&lo->queue_head)) {
-		req = list_entry(lo->queue_head.next, struct request,
+	while (!list_empty(&nbd->queue_head)) {
+		req = list_entry(nbd->queue_head.next, struct request,
+				 queuelist);
+		list_del_init(&req->queuelist);
+		req->errors++;
+		nbd_end_request(req);
+	}
+
+	while (!list_empty(&nbd->waiting_queue)) {
+		req = list_entry(nbd->waiting_queue.next, struct request,
 				 queuelist);
 		list_del_init(&req->queuelist);
 		req->errors++;
@@ -448,7 +460,7 @@ static void nbd_clear_que(struct nbd_device *lo)
 }
 
 
-static void nbd_handle_req(struct nbd_device *lo, struct request *req)
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 {
 	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;
@@ -456,39 +468,38 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
 	nbd_cmd(req) = NBD_CMD_READ;
 	if (rq_data_dir(req) == WRITE) {
 		nbd_cmd(req) = NBD_CMD_WRITE;
-		if (lo->flags & NBD_READ_ONLY) {
-			printk(KERN_ERR "%s: Write on read-only\n",
-					lo->disk->disk_name);
+		if (nbd->flags & NBD_READ_ONLY) {
+			dev_err(disk_to_dev(nbd->disk),
+				"Write on read-only\n");
 			goto error_out;
 		}
 	}
 
 	req->errors = 0;
 
-	mutex_lock(&lo->tx_lock);
-	if (unlikely(!lo->sock)) {
-		mutex_unlock(&lo->tx_lock);
-		printk(KERN_ERR "%s: Attempted send on closed socket\n",
-		       lo->disk->disk_name);
+	mutex_lock(&nbd->tx_lock);
+	if (unlikely(!nbd->sock)) {
+		mutex_unlock(&nbd->tx_lock);
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on closed socket\n");
 		goto error_out;
 	}
 
-	lo->active_req = req;
+	nbd->active_req = req;
 
-	if (nbd_send_req(lo, req) != 0) {
-		printk(KERN_ERR "%s: Request send failed\n",
-				lo->disk->disk_name);
+	if (nbd_send_req(nbd, req) != 0) {
+		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
 		nbd_end_request(req);
 	} else {
-		spin_lock(&lo->queue_lock);
-		list_add(&req->queuelist, &lo->queue_head);
-		spin_unlock(&lo->queue_lock);
+		spin_lock(&nbd->queue_lock);
+		list_add_tail(&req->queuelist, &nbd->queue_head);
+		spin_unlock(&nbd->queue_lock);
 	}
 
-	lo->active_req = NULL;
-	mutex_unlock(&lo->tx_lock);
-	wake_up_all(&lo->active_wq);
+	nbd->active_req = NULL;
+	mutex_unlock(&nbd->tx_lock);
+	wake_up_all(&nbd->active_wq);
 
 	return;
 
@@ -499,28 +510,28 @@ error_out:
 
 static int nbd_thread(void *data)
 {
-	struct nbd_device *lo = data;
+	struct nbd_device *nbd = data;
 	struct request *req;
 
 	set_user_nice(current, -20);
-	while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) {
+	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
 		/* wait for something to do */
-		wait_event_interruptible(lo->waiting_wq,
+		wait_event_interruptible(nbd->waiting_wq,
 					 kthread_should_stop() ||
-					 !list_empty(&lo->waiting_queue));
+					 !list_empty(&nbd->waiting_queue));
 
 		/* extract request */
-		if (list_empty(&lo->waiting_queue))
+		if (list_empty(&nbd->waiting_queue))
 			continue;
 
-		spin_lock_irq(&lo->queue_lock);
-		req = list_entry(lo->waiting_queue.next, struct request,
+		spin_lock_irq(&nbd->queue_lock);
+		req = list_entry(nbd->waiting_queue.next, struct request,
 				 queuelist);
 		list_del_init(&req->queuelist);
-		spin_unlock_irq(&lo->queue_lock);
+		spin_unlock_irq(&nbd->queue_lock);
 
 		/* handle request */
-		nbd_handle_req(lo, req);
+		nbd_handle_req(nbd, req);
 	}
 	return 0;
 }
@@ -528,7 +539,7 @@ static int nbd_thread(void *data)
 /*
  * We always wait for result of write, for now. It would be nice to make it optional
  * in future
- * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
  *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
  */
 
@@ -537,31 +548,31 @@ static void do_nbd_request(struct request_queue *q)
 	struct request *req;
 	
 	while ((req = blk_fetch_request(q)) != NULL) {
-		struct nbd_device *lo;
+		struct nbd_device *nbd;
 
 		spin_unlock_irq(q->queue_lock);
 
 		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
 				req->rq_disk->disk_name, req, req->cmd_type);
 
-		lo = req->rq_disk->private_data;
+		nbd = req->rq_disk->private_data;
 
-		BUG_ON(lo->magic != LO_MAGIC);
+		BUG_ON(nbd->magic != NBD_MAGIC);
 
-		if (unlikely(!lo->sock)) {
-			printk(KERN_ERR "%s: Attempted send on closed socket\n",
-				lo->disk->disk_name);
+		if (unlikely(!nbd->sock)) {
+			dev_err(disk_to_dev(nbd->disk),
+				"Attempted send on closed socket\n");
 			req->errors++;
 			nbd_end_request(req);
 			spin_lock_irq(q->queue_lock);
 			continue;
 		}
 
-		spin_lock_irq(&lo->queue_lock);
-		list_add_tail(&req->queuelist, &lo->waiting_queue);
-		spin_unlock_irq(&lo->queue_lock);
+		spin_lock_irq(&nbd->queue_lock);
+		list_add_tail(&req->queuelist, &nbd->waiting_queue);
+		spin_unlock_irq(&nbd->queue_lock);
 
-		wake_up(&lo->waiting_wq);
+		wake_up(&nbd->waiting_wq);
 
 		spin_lock_irq(q->queue_lock);
 	}
@@ -569,32 +580,33 @@ static void do_nbd_request(struct request_queue *q)
 
 /* Must be called with tx_lock held */
 
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		       unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 	case NBD_DISCONNECT: {
 		struct request sreq;
 
-	        printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name);
+		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
 
 		blk_rq_init(NULL, &sreq);
 		sreq.cmd_type = REQ_TYPE_SPECIAL;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
-		if (!lo->sock)
+		if (!nbd->sock)
 			return -EINVAL;
-		nbd_send_req(lo, &sreq);
+		nbd_send_req(nbd, &sreq);
                 return 0;
 	}
  
 	case NBD_CLEAR_SOCK: {
 		struct file *file;
 
-		lo->sock = NULL;
-		file = lo->file;
-		lo->file = NULL;
-		nbd_clear_que(lo);
-		BUG_ON(!list_empty(&lo->queue_head));
+		nbd->sock = NULL;
+		file = nbd->file;
+		nbd->file = NULL;
+		nbd_clear_que(nbd);
+		BUG_ON(!list_empty(&nbd->queue_head));
+		BUG_ON(!list_empty(&nbd->waiting_queue));
 		if (file)
 			fput(file);
 		return 0;
@@ -602,14 +614,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 
 	case NBD_SET_SOCK: {
 		struct file *file;
-		if (lo->file)
+		if (nbd->file)
 			return -EBUSY;
 		file = fget(arg);
 		if (file) {
 			struct inode *inode = file->f_path.dentry->d_inode;
 			if (S_ISSOCK(inode->i_mode)) {
-				lo->file = file;
-				lo->sock = SOCKET_I(inode);
+				nbd->file = file;
+				nbd->sock = SOCKET_I(inode);
 				if (max_part > 0)
 					bdev->bd_invalidated = 1;
 				return 0;
@@ -621,29 +633,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 	}
 
 	case NBD_SET_BLKSIZE:
-		lo->blksize = arg;
-		lo->bytesize &= ~(lo->blksize-1);
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->blksize = arg;
+		nbd->bytesize &= ~(nbd->blksize-1);
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_SET_SIZE:
-		lo->bytesize = arg & ~(lo->blksize-1);
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->bytesize = arg & ~(nbd->blksize-1);
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_SET_TIMEOUT:
-		lo->xmit_timeout = arg * HZ;
+		nbd->xmit_timeout = arg * HZ;
 		return 0;
 
 	case NBD_SET_SIZE_BLOCKS:
-		lo->bytesize = ((u64) arg) * lo->blksize;
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->bytesize = ((u64) arg) * nbd->blksize;
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_DO_IT: {
@@ -651,38 +663,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		struct file *file;
 		int error;
 
-		if (lo->pid)
+		if (nbd->pid)
 			return -EBUSY;
-		if (!lo->file)
+		if (!nbd->file)
 			return -EINVAL;
 
-		mutex_unlock(&lo->tx_lock);
+		mutex_unlock(&nbd->tx_lock);
 
-		thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+		thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
 		if (IS_ERR(thread)) {
-			mutex_lock(&lo->tx_lock);
+			mutex_lock(&nbd->tx_lock);
 			return PTR_ERR(thread);
 		}
 		wake_up_process(thread);
-		error = nbd_do_it(lo);
+		error = nbd_do_it(nbd);
 		kthread_stop(thread);
 
-		mutex_lock(&lo->tx_lock);
+		mutex_lock(&nbd->tx_lock);
 		if (error)
 			return error;
-		sock_shutdown(lo, 0);
-		file = lo->file;
-		lo->file = NULL;
-		nbd_clear_que(lo);
-		printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
+		sock_shutdown(nbd, 0);
+		file = nbd->file;
+		nbd->file = NULL;
+		nbd_clear_que(nbd);
+		dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
 		if (file)
 			fput(file);
-		lo->bytesize = 0;
+		nbd->bytesize = 0;
 		bdev->bd_inode->i_size = 0;
-		set_capacity(lo->disk, 0);
+		set_capacity(nbd->disk, 0);
 		if (max_part > 0)
 			ioctl_by_bdev(bdev, BLKRRPART, 0);
-		return lo->harderror;
+		return nbd->harderror;
 	}
 
 	case NBD_CLEAR_QUE:
@@ -690,14 +702,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		 * This is for compatibility only.  The queue is always cleared
 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
 		 */
-		BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
+		BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
 		return 0;
 
 	case NBD_PRINT_DEBUG:
-		printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
-			bdev->bd_disk->disk_name,
-			lo->queue_head.next, lo->queue_head.prev,
-			&lo->queue_head);
+		dev_info(disk_to_dev(nbd->disk),
+			"next = %p, prev = %p, head = %p\n",
+			nbd->queue_head.next, nbd->queue_head.prev,
+			&nbd->queue_head);
 		return 0;
 	}
 	return -ENOTTY;
@@ -706,21 +718,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 		     unsigned int cmd, unsigned long arg)
 {
-	struct nbd_device *lo = bdev->bd_disk->private_data;
+	struct nbd_device *nbd = bdev->bd_disk->private_data;
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/* Anyone capable of this syscall can do *real bad* things */
 	dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
-			lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+		nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
 
-	mutex_lock(&lo->tx_lock);
-	error = __nbd_ioctl(bdev, lo, cmd, arg);
-	mutex_unlock(&lo->tx_lock);
+	mutex_lock(&nbd->tx_lock);
+	error = __nbd_ioctl(bdev, nbd, cmd, arg);
+	mutex_unlock(&nbd->tx_lock);
 
 	return error;
 }
@@ -745,7 +757,7 @@ static int __init nbd_init(void)
 	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
 
 	if (max_part < 0) {
-		printk(KERN_CRIT "nbd: max_part must be >= 0\n");
+		printk(KERN_ERR "nbd: max_part must be >= 0\n");
 		return -EINVAL;
 	}
 
@@ -806,7 +818,7 @@ static int __init nbd_init(void)
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].file = NULL;
-		nbd_dev[i].magic = LO_MAGIC;
+		nbd_dev[i].magic = NBD_MAGIC;
 		nbd_dev[i].flags = 0;
 		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
 		spin_lock_init(&nbd_dev[i].queue_lock);
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index f9ad514c9227..ad16c68c8645 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -39,7 +39,8 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include <linux/version.h>
+
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
 
 #define NVME_Q_DEPTH 1024
 #define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
@@ -614,11 +615,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 	return result;
 }
 
-/*
- * NB: return value of non-zero would mean that we were a stacking driver.
- * make_request must always succeed.
- */
-static int nvme_make_request(struct request_queue *q, struct bio *bio)
+static void nvme_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct nvme_ns *ns = q->queuedata;
 	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
@@ -635,8 +632,6 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio)
 
 	spin_unlock_irq(&nvmeq->q_lock);
 	put_nvmeq(nvmeq);
-
-	return 0;
 }
 
 static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index ad124525ac23..ec64e7f5d1ce 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -20,9 +20,6 @@
 */
 
 
-/* PARAMETERS */
-static int verbose; /* set this to 1 to see debugging messages and whatnot */
-
 #define BACKPACK_VERSION "2.0.2"
 
 #include <linux/module.h>
@@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */
 #include "ppc6lnx.c"
 #include "paride.h"
 
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
  
 
 #define PPCSTRUCT(pi) ((Interface *)(pi->private))
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 46b8136c31bb..ba2b6b5e5910 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -144,7 +144,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
 static DEFINE_MUTEX(pcd_mutex);
 static DEFINE_SPINLOCK(pcd_lock);
 
-module_param(verbose, bool, 0644);
+module_param(verbose, int, 0644);
 module_param(major, int, 0);
 module_param(name, charp, 0);
 module_param(nice, int, 0);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 869e7676d46f..831e3ac156e6 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -124,8 +124,9 @@
    by default.
 
 */
+#include <linux/types.h>
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PD_MAJOR;
 static char *name = PD_NAME;
 static int cluster = 64;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f21b520ef419..ec8f9ed6326e 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -118,13 +118,15 @@
 #define PF_NAME		"pf"
 #define PF_UNITS	4
 
+#include <linux/types.h>
+
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is off
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PF_MAJOR;
 static char *name = PF_NAME;
 static int cluster = 64;
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 6b9a2000d56a..4a27b1de5fcb 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -130,13 +130,14 @@
 #define PI_PG	4
 #endif
 
+#include <linux/types.h>
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is 0
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PG_MAJOR;
 static char *name = PG_NAME;
 static int disable = 0;
@@ -630,6 +631,7 @@ static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t
 		if (dev->status & 0x10)
 			return -ETIME;
 
+	memset(&hdr, 0, sizeof(hdr));
 	hdr.magic = PG_MAGIC;
 	hdr.dlen = dev->dlen;
 	copy = 0;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 7179f79d7468..2596042eb987 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -109,13 +109,15 @@
 #define PT_NAME		"pt"
 #define PT_UNITS	4
 
+#include <linux/types.h>
+
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is on
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PT_MAJOR;
 static char *name = PT_NAME;
 static int disable = 0;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index e133f094ab08..ba66e4445f41 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -987,14 +987,14 @@ static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct pag
 
 	while (copy_size > 0) {
 		struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-		void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) +
+		void *vfrom = kmap_atomic(src_bvl->bv_page) +
 			src_bvl->bv_offset + offs;
 		void *vto = page_address(dst_page) + dst_offs;
 		int len = min_t(int, copy_size, src_bvl->bv_len - offs);
 
 		BUG_ON(len < 0);
 		memcpy(vto, vfrom, len);
-		kunmap_atomic(vfrom, KM_USER0);
+		kunmap_atomic(vfrom);
 
 		seg++;
 		offs = 0;
@@ -1019,10 +1019,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
 	offs = 0;
 	for (f = 0; f < pkt->frames; f++) {
 		if (bvec[f].bv_page != pkt->pages[p]) {
-			void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset;
+			void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
 			void *vto = page_address(pkt->pages[p]) + offs;
 			memcpy(vto, vfrom, CD_FRAMESIZE);
-			kunmap_atomic(vfrom, KM_USER0);
+			kunmap_atomic(vfrom);
 			bvec[f].bv_page = pkt->pages[p];
 			bvec[f].bv_offset = offs;
 		} else {
@@ -2444,7 +2444,7 @@ static void pkt_end_io_read_cloned(struct bio *bio, int err)
 	pkt_bio_finished(pd);
 }
 
-static int pkt_make_request(struct request_queue *q, struct bio *bio)
+static void pkt_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct pktcdvd_device *pd;
 	char b[BDEVNAME_SIZE];
@@ -2473,7 +2473,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 		cloned_bio->bi_end_io = pkt_end_io_read_cloned;
 		pd->stats.secs_r += bio->bi_size >> 9;
 		pkt_queue_bio(pd, cloned_bio);
-		return 0;
+		return;
 	}
 
 	if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
@@ -2509,7 +2509,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 			pkt_make_request(q, &bp->bio1);
 			pkt_make_request(q, &bp->bio2);
 			bio_pair_release(bp);
-			return 0;
+			return;
 		}
 	}
 
@@ -2533,7 +2533,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 				}
 				spin_unlock(&pkt->lock);
 				spin_unlock(&pd->cdrw.active_list_lock);
-				return 0;
+				return;
 			} else {
 				blocked_bio = 1;
 			}
@@ -2584,10 +2584,9 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 		 */
 		wake_up(&pd->wqueue);
 	}
-	return 0;
+	return;
 end_io:
 	bio_io_error(bio);
-	return 0;
 }
 
 
@@ -2818,7 +2817,7 @@ static const struct block_device_operations pktcdvd_ops = {
 	.check_events =		pkt_check_events,
 };
 
-static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
+static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name);
 }
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 8e1ce2e2916a..da0abc1838c1 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -21,6 +21,7 @@
 #include <linux/ata.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/module.h>
 
 #include <asm/lv1call.h>
 #include <asm/ps3stor.h>
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index b3bdb8af89cf..f58cdcfb305f 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -10,6 +10,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
@@ -596,7 +597,7 @@ out:
 	return next;
 }
 
-static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
+static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct ps3_system_bus_device *dev = q->queuedata;
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -610,13 +611,11 @@ static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
 	spin_unlock_irq(&priv->lock);
 
 	if (busy)
-		return 0;
+		return;
 
 	do {
 		bio = ps3vram_do_bio(dev, bio);
 	} while (bio);
-
-	return 0;
 }
 
 static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 15f65b5f3fc7..54a55f03115d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -41,19 +41,33 @@
 
 #include "rbd_types.h"
 
-#define DRV_NAME "rbd"
-#define DRV_NAME_LONG "rbd (rados block device)"
+/*
+ * The basic unit of block I/O is a sector.  It is interpreted in a
+ * number of contexts in Linux (blk, bio, genhd), but the default is
+ * universally 512 bytes.  These symbols are just slightly more
+ * meaningful than the bare numbers they represent.
+ */
+#define	SECTOR_SHIFT	9
+#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
+
+#define RBD_DRV_NAME "rbd"
+#define RBD_DRV_NAME_LONG "rbd (rados block device)"
 
 #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
 
-#define RBD_MAX_MD_NAME_LEN	(96 + sizeof(RBD_SUFFIX))
-#define RBD_MAX_POOL_NAME_LEN	64
 #define RBD_MAX_SNAP_NAME_LEN	32
 #define RBD_MAX_OPT_LEN		1024
 
 #define RBD_SNAP_HEAD_NAME	"-"
 
+/*
+ * An RBD device name will be "rbd#", where the "rbd" comes from
+ * RBD_DRV_NAME above, and # is a unique integer identifier.
+ * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
+ * enough to hold all possible device names.
+ */
 #define DEV_NAME_LEN		32
+#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
 
 #define RBD_NOTIFY_TIMEOUT_DEFAULT 10
 
@@ -62,14 +76,12 @@
  */
 struct rbd_image_header {
 	u64 image_size;
-	char block_name[32];
+	char *object_prefix;
 	__u8 obj_order;
 	__u8 crypt_type;
 	__u8 comp_type;
-	struct rw_semaphore snap_rwsem;
 	struct ceph_snap_context *snapc;
 	size_t snap_names_len;
-	u64 snap_seq;
 	u32 total_snaps;
 
 	char *snap_names;
@@ -83,7 +95,7 @@ struct rbd_options {
 };
 
 /*
- * an instance of the client.  multiple devices may share a client.
+ * an instance of the client.  multiple devices may share an rbd client.
  */
 struct rbd_client {
 	struct ceph_client	*client;
@@ -92,20 +104,9 @@ struct rbd_client {
 	struct list_head	node;
 };
 
-struct rbd_req_coll;
-
 /*
- * a single io request
+ * a request completion status
  */
-struct rbd_request {
-	struct request		*rq;		/* blk layer request */
-	struct bio		*bio;		/* cloned bio */
-	struct page		**pages;	/* list of used pages */
-	u64			len;
-	int			coll_index;
-	struct rbd_req_coll	*coll;
-};
-
 struct rbd_req_status {
 	int done;
 	int rc;
@@ -122,10 +123,22 @@ struct rbd_req_coll {
 	struct rbd_req_status	status[0];
 };
 
+/*
+ * a single io request
+ */
+struct rbd_request {
+	struct request		*rq;		/* blk layer request */
+	struct bio		*bio;		/* cloned bio */
+	struct page		**pages;	/* list of used pages */
+	u64			len;
+	int			coll_index;
+	struct rbd_req_coll	*coll;
+};
+
 struct rbd_snap {
 	struct	device		dev;
 	const char		*name;
-	size_t			size;
+	u64			size;
 	struct list_head	node;
 	u64			id;
 };
@@ -134,13 +147,12 @@ struct rbd_snap {
  * a single device
  */
 struct rbd_device {
-	int			id;		/* blkdev unique id */
+	int			dev_id;		/* blkdev unique id */
 
 	int			major;		/* blkdev assigned major */
 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
 	struct request_queue	*q;
 
-	struct ceph_client	*client;
 	struct rbd_client	*rbd_client;
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -148,19 +160,24 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
-	int			obj_len;
-	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
-	char			pool_name[RBD_MAX_POOL_NAME_LEN];
-	int			poolid;
+	char			*image_name;
+	size_t			image_name_len;
+	char			*header_name;
+	char			*pool_name;
+	int			pool_id;
 
 	struct ceph_osd_event   *watch_event;
 	struct ceph_osd_request *watch_request;
 
-	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
-	u32 cur_snap;	/* index+1 of current snapshot within snap context
-			   0 - for the head */
-	int read_only;
+	/* protects updating the header */
+	struct rw_semaphore     header_rwsem;
+	/* name of the snapshot this device reads from */
+	char                    *snap_name;
+	/* id of the snapshot this device reads from */
+	u64                     snap_id;	/* current snapshot id */
+	/* whether the snap_id this device reads from still exists */
+	bool                    snap_exists;
+	int                     read_only;
 
 	struct list_head	node;
 
@@ -171,35 +188,48 @@ struct rbd_device {
 	struct device		dev;
 };
 
-static struct bus_type rbd_bus_type = {
-	.name		= "rbd",
-};
-
-static spinlock_t node_lock;      /* protects client get/put */
-
 static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
+
 static LIST_HEAD(rbd_dev_list);    /* devices */
-static LIST_HEAD(rbd_client_list);      /* clients */
+static DEFINE_SPINLOCK(rbd_dev_list_lock);
+
+static LIST_HEAD(rbd_client_list);		/* clients */
+static DEFINE_SPINLOCK(rbd_client_list_lock);
 
 static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
 static void rbd_dev_release(struct device *dev);
-static ssize_t rbd_snap_rollback(struct device *dev,
-				 struct device_attribute *attr,
-				 const char *buf,
-				 size_t size);
 static ssize_t rbd_snap_add(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf,
 			    size_t count);
-static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
-				  struct rbd_snap *snap);;
+static void __rbd_remove_snap_dev(struct rbd_snap *snap);
 
+static ssize_t rbd_add(struct bus_type *bus, const char *buf,
+		       size_t count);
+static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
+			  size_t count);
 
-static struct rbd_device *dev_to_rbd(struct device *dev)
+static struct bus_attribute rbd_bus_attrs[] = {
+	__ATTR(add, S_IWUSR, NULL, rbd_add),
+	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
+	__ATTR_NULL
+};
+
+static struct bus_type rbd_bus_type = {
+	.name		= "rbd",
+	.bus_attrs	= rbd_bus_attrs,
+};
+
+static void rbd_root_dev_release(struct device *dev)
 {
-	return container_of(dev, struct rbd_device, dev);
 }
 
+static struct device rbd_root_dev = {
+	.init_name =    "rbd",
+	.release =      rbd_root_dev_release,
+};
+
+
 static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
 {
 	return get_device(&rbd_dev->dev);
@@ -210,20 +240,18 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
 	put_device(&rbd_dev->dev);
 }
 
-static int __rbd_update_snaps(struct rbd_device *rbd_dev);
+static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
-	struct gendisk *disk = bdev->bd_disk;
-	struct rbd_device *rbd_dev = disk->private_data;
-
-	rbd_get_dev(rbd_dev);
-
-	set_device_ro(bdev, rbd_dev->read_only);
+	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 
 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
 		return -EROFS;
 
+	rbd_get_dev(rbd_dev);
+	set_device_ro(bdev, rbd_dev->read_only);
+
 	return 0;
 }
 
@@ -244,9 +272,9 @@ static const struct block_device_operations rbd_bd_ops = {
 
 /*
  * Initialize an rbd client instance.
- * We own *opt.
+ * We own *ceph_opts.
  */
-static struct rbd_client *rbd_client_create(struct ceph_options *opt,
+static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
 					    struct rbd_options *rbd_opts)
 {
 	struct rbd_client *rbdc;
@@ -260,10 +288,12 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
 	kref_init(&rbdc->kref);
 	INIT_LIST_HEAD(&rbdc->node);
 
-	rbdc->client = ceph_create_client(opt, rbdc);
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 	if (IS_ERR(rbdc->client))
-		goto out_rbdc;
-	opt = NULL; /* Now rbdc->client is responsible for opt */
+		goto out_mutex;
+	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 
 	ret = ceph_open_session(rbdc->client);
 	if (ret < 0)
@@ -271,35 +301,38 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
 
 	rbdc->rbd_opts = rbd_opts;
 
-	spin_lock(&node_lock);
+	spin_lock(&rbd_client_list_lock);
 	list_add_tail(&rbdc->node, &rbd_client_list);
-	spin_unlock(&node_lock);
+	spin_unlock(&rbd_client_list_lock);
+
+	mutex_unlock(&ctl_mutex);
 
 	dout("rbd_client_create created %p\n", rbdc);
 	return rbdc;
 
 out_err:
 	ceph_destroy_client(rbdc->client);
-out_rbdc:
+out_mutex:
+	mutex_unlock(&ctl_mutex);
 	kfree(rbdc);
 out_opt:
-	if (opt)
-		ceph_destroy_options(opt);
+	if (ceph_opts)
+		ceph_destroy_options(ceph_opts);
 	return ERR_PTR(ret);
 }
 
 /*
  * Find a ceph client with specific addr and configuration.
  */
-static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
 {
 	struct rbd_client *client_node;
 
-	if (opt->flags & CEPH_OPT_NOSHARE)
+	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 		return NULL;
 
 	list_for_each_entry(client_node, &rbd_client_list, node)
-		if (ceph_compare_options(opt, client_node->client) == 0)
+		if (!ceph_compare_options(ceph_opts, client_node->client))
 			return client_node;
 	return NULL;
 }
@@ -315,7 +348,7 @@ enum {
 	/* string args above */
 };
 
-static match_table_t rbdopt_tokens = {
+static match_table_t rbd_opts_tokens = {
 	{Opt_notify_timeout, "notify_timeout=%d"},
 	/* int args above */
 	/* string args above */
@@ -324,11 +357,11 @@ static match_table_t rbdopt_tokens = {
 
 static int parse_rbd_opts_token(char *c, void *private)
 {
-	struct rbd_options *rbdopt = private;
+	struct rbd_options *rbd_opts = private;
 	substring_t argstr[MAX_OPT_ARGS];
 	int token, intval, ret;
 
-	token = match_token((char *)c, rbdopt_tokens, argstr);
+	token = match_token(c, rbd_opts_tokens, argstr);
 	if (token < 0)
 		return -EINVAL;
 
@@ -349,7 +382,7 @@ static int parse_rbd_opts_token(char *c, void *private)
 
 	switch (token) {
 	case Opt_notify_timeout:
-		rbdopt->notify_timeout = intval;
+		rbd_opts->notify_timeout = intval;
 		break;
 	default:
 		BUG_ON(token);
@@ -361,64 +394,63 @@ static int parse_rbd_opts_token(char *c, void *private)
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it.
  */
-static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
-			  char *options)
+static struct rbd_client *rbd_get_client(const char *mon_addr,
+					 size_t mon_addr_len,
+					 char *options)
 {
 	struct rbd_client *rbdc;
-	struct ceph_options *opt;
-	int ret;
+	struct ceph_options *ceph_opts;
 	struct rbd_options *rbd_opts;
 
 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
 	if (!rbd_opts)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
 
-	ret = ceph_parse_options(&opt, options, mon_addr,
-				 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
-	if (ret < 0)
-		goto done_err;
+	ceph_opts = ceph_parse_options(options, mon_addr,
+					mon_addr + mon_addr_len,
+					parse_rbd_opts_token, rbd_opts);
+	if (IS_ERR(ceph_opts)) {
+		kfree(rbd_opts);
+		return ERR_CAST(ceph_opts);
+	}
 
-	spin_lock(&node_lock);
-	rbdc = __rbd_client_find(opt);
+	spin_lock(&rbd_client_list_lock);
+	rbdc = __rbd_client_find(ceph_opts);
 	if (rbdc) {
-		ceph_destroy_options(opt);
-
 		/* using an existing client */
 		kref_get(&rbdc->kref);
-		rbd_dev->rbd_client = rbdc;
-		rbd_dev->client = rbdc->client;
-		spin_unlock(&node_lock);
-		return 0;
-	}
-	spin_unlock(&node_lock);
+		spin_unlock(&rbd_client_list_lock);
 
-	rbdc = rbd_client_create(opt, rbd_opts);
-	if (IS_ERR(rbdc)) {
-		ret = PTR_ERR(rbdc);
-		goto done_err;
+		ceph_destroy_options(ceph_opts);
+		kfree(rbd_opts);
+
+		return rbdc;
 	}
+	spin_unlock(&rbd_client_list_lock);
 
-	rbd_dev->rbd_client = rbdc;
-	rbd_dev->client = rbdc->client;
-	return 0;
-done_err:
-	kfree(rbd_opts);
-	return ret;
+	rbdc = rbd_client_create(ceph_opts, rbd_opts);
+
+	if (IS_ERR(rbdc))
+		kfree(rbd_opts);
+
+	return rbdc;
 }
 
 /*
  * Destroy ceph client
+ *
+ * Caller must hold rbd_client_list_lock.
  */
 static void rbd_client_release(struct kref *kref)
 {
 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 
 	dout("rbd_release_client %p\n", rbdc);
-	spin_lock(&node_lock);
+	spin_lock(&rbd_client_list_lock);
 	list_del(&rbdc->node);
-	spin_unlock(&node_lock);
+	spin_unlock(&rbd_client_list_lock);
 
 	ceph_destroy_client(rbdc->client);
 	kfree(rbdc->rbd_opts);
@@ -433,7 +465,6 @@ static void rbd_put_client(struct rbd_device *rbd_dev)
 {
 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
 	rbd_dev->rbd_client = NULL;
-	rbd_dev->client = NULL;
 }
 
 /*
@@ -448,28 +479,37 @@ static void rbd_coll_release(struct kref *kref)
 	kfree(coll);
 }
 
+static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
+{
+	return !memcmp(&ondisk->text,
+			RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
+}
+
 /*
  * Create a new header structure, translate header format from the on-disk
  * header.
  */
 static int rbd_header_from_disk(struct rbd_image_header *header,
 				 struct rbd_image_header_ondisk *ondisk,
-				 int allocated_snaps,
-				 gfp_t gfp_flags)
+				 u32 allocated_snaps)
 {
-	int i;
-	u32 snap_count = le32_to_cpu(ondisk->snap_count);
-	int ret = -ENOMEM;
+	u32 snap_count;
 
-	init_rwsem(&header->snap_rwsem);
-	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+	if (!rbd_dev_ondisk_valid(ondisk))
+		return -ENXIO;
+
+	snap_count = le32_to_cpu(ondisk->snap_count);
+	if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
+				 / sizeof (u64))
+		return -EINVAL;
 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
-				snap_count *
-				 sizeof(struct rbd_image_snap_ondisk),
-				gfp_flags);
+				snap_count * sizeof(u64),
+				GFP_KERNEL);
 	if (!header->snapc)
 		return -ENOMEM;
+
 	if (snap_count) {
+		header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 		header->snap_names = kmalloc(header->snap_names_len,
 					     GFP_KERNEL);
 		if (!header->snap_names)
@@ -479,11 +519,20 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		if (!header->snap_sizes)
 			goto err_names;
 	} else {
+		WARN_ON(ondisk->snap_names_len);
+		header->snap_names_len = 0;
 		header->snap_names = NULL;
 		header->snap_sizes = NULL;
 	}
-	memcpy(header->block_name, ondisk->block_name,
+
+	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
+					GFP_KERNEL);
+	if (!header->object_prefix)
+		goto err_sizes;
+
+	memcpy(header->object_prefix, ondisk->block_name,
 	       sizeof(ondisk->block_name));
+	header->object_prefix[sizeof (ondisk->block_name)] = '\0';
 
 	header->image_size = le64_to_cpu(ondisk->image_size);
 	header->obj_order = ondisk->options.order;
@@ -491,12 +540,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	header->comp_type = ondisk->options.comp_type;
 
 	atomic_set(&header->snapc->nref, 1);
-	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 	header->snapc->num_snaps = snap_count;
 	header->total_snaps = snap_count;
 
-	if (snap_count &&
-	    allocated_snaps == snap_count) {
+	if (snap_count && allocated_snaps == snap_count) {
+		int i;
+
 		for (i = 0; i < snap_count; i++) {
 			header->snapc->snaps[i] =
 				le64_to_cpu(ondisk->snaps[i].id);
@@ -505,32 +555,23 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 		}
 
 		/* copy snapshot names */
-		memcpy(header->snap_names, &ondisk->snaps[i],
+		memcpy(header->snap_names, &ondisk->snaps[snap_count],
 			header->snap_names_len);
 	}
 
 	return 0;
 
+err_sizes:
+	kfree(header->snap_sizes);
+	header->snap_sizes = NULL;
 err_names:
 	kfree(header->snap_names);
+	header->snap_names = NULL;
 err_snapc:
 	kfree(header->snapc);
-	return ret;
-}
-
-static int snap_index(struct rbd_image_header *header, int snap_num)
-{
-	return header->total_snaps - snap_num;
-}
-
-static u64 cur_snap_id(struct rbd_device *rbd_dev)
-{
-	struct rbd_image_header *header = &rbd_dev->header;
-
-	if (!rbd_dev->cur_snap)
-		return 0;
+	header->snapc = NULL;
 
-	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+	return -ENOMEM;
 }
 
 static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
@@ -539,70 +580,66 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
 	int i;
 	char *p = header->snap_names;
 
-	for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
-		if (strcmp(snap_name, p) == 0)
-			break;
-	}
-	if (i == header->total_snaps)
-		return -ENOENT;
-	if (seq)
-		*seq = header->snapc->snaps[i];
+	for (i = 0; i < header->total_snaps; i++) {
+		if (!strcmp(snap_name, p)) {
 
-	if (size)
-		*size = header->snap_sizes[i];
+			/* Found it.  Pass back its id and/or size */
 
-	return i;
+			if (seq)
+				*seq = header->snapc->snaps[i];
+			if (size)
+				*size = header->snap_sizes[i];
+			return i;
+		}
+		p += strlen(p) + 1;	/* Skip ahead to the next name */
+	}
+	return -ENOENT;
 }
 
-static int rbd_header_set_snap(struct rbd_device *dev,
-			       const char *snap_name,
-			       u64 *size)
-{
-	struct rbd_image_header *header = &dev->header;
-	struct ceph_snap_context *snapc = header->snapc;
-	int ret = -ENOENT;
-
-	down_write(&header->snap_rwsem);
-
-	if (!snap_name ||
-	    !*snap_name ||
-	    strcmp(snap_name, "-") == 0 ||
-	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
-		if (header->total_snaps)
-			snapc->seq = header->snap_seq;
-		else
-			snapc->seq = 0;
-		dev->cur_snap = 0;
-		dev->read_only = 0;
+static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
+{
+	int ret;
+
+	down_write(&rbd_dev->header_rwsem);
+
+	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+		    sizeof (RBD_SNAP_HEAD_NAME))) {
+		rbd_dev->snap_id = CEPH_NOSNAP;
+		rbd_dev->snap_exists = false;
+		rbd_dev->read_only = 0;
 		if (size)
-			*size = header->image_size;
+			*size = rbd_dev->header.image_size;
 	} else {
-		ret = snap_by_name(header, snap_name, &snapc->seq, size);
+		u64 snap_id = 0;
+
+		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
+					&snap_id, size);
 		if (ret < 0)
 			goto done;
-
-		dev->cur_snap = header->total_snaps - ret;
-		dev->read_only = 1;
+		rbd_dev->snap_id = snap_id;
+		rbd_dev->snap_exists = true;
+		rbd_dev->read_only = 1;
 	}
 
 	ret = 0;
 done:
-	up_write(&header->snap_rwsem);
+	up_write(&rbd_dev->header_rwsem);
 	return ret;
 }
 
 static void rbd_header_free(struct rbd_image_header *header)
 {
-	kfree(header->snapc);
-	kfree(header->snap_names);
+	kfree(header->object_prefix);
 	kfree(header->snap_sizes);
+	kfree(header->snap_names);
+	ceph_put_snap_context(header->snapc);
 }
 
 /*
  * get the actual striped segment name, offset and length
  */
 static u64 rbd_get_segment(struct rbd_image_header *header,
-			   const char *block_name,
+			   const char *object_prefix,
 			   u64 ofs, u64 len,
 			   char *seg_name, u64 *segofs)
 {
@@ -610,7 +647,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header,
 
 	if (seg_name)
 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
-			 "%s.%012llx", block_name, seg);
+			 "%s.%012llx", object_prefix, seg);
 
 	ofs = ofs & ((1 << header->obj_order) - 1);
 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
@@ -708,13 +745,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
 			 * split_bio will BUG_ON if this is not the case
 			 */
 			dout("bio_chain_clone split! total=%d remaining=%d"
-			     "bi_size=%d\n",
-			     (int)total, (int)len-total,
-			     (int)old_chain->bi_size);
+			     "bi_size=%u\n",
+			     total, len - total, old_chain->bi_size);
 
 			/* split the bio. We'll release it either in the next
 			   call, or it will have to be released outside */
-			bp = bio_split(old_chain, (len - total) / 512ULL);
+			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
 			if (!bp)
 				goto err_out;
 
@@ -759,22 +795,24 @@ err_out:
 /*
  * helpers for osd request op vectors.
  */
-static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
-			    int num_ops,
-			    int opcode,
-			    u32 payload_len)
-{
-	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
-		       GFP_NOIO);
-	if (!*ops)
-		return -ENOMEM;
-	(*ops)[0].op = opcode;
+static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
+					int opcode, u32 payload_len)
+{
+	struct ceph_osd_req_op *ops;
+
+	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
+	if (!ops)
+		return NULL;
+
+	ops[0].op = opcode;
+
 	/*
 	 * op extent offset and length will be set later on
 	 * in calc_raw_layout()
 	 */
-	(*ops)[0].payload_len = payload_len;
-	return 0;
+	ops[0].payload_len = payload_len;
+
+	return ops;
 }
 
 static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
@@ -790,8 +828,8 @@ static void rbd_coll_end_req_index(struct request *rq,
 	struct request_queue *q;
 	int min, max, i;
 
-	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
-	     coll, index, ret, len);
+	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
+	     coll, index, ret, (unsigned long long) len);
 
 	if (!rq)
 		return;
@@ -830,16 +868,15 @@ static void rbd_coll_end_req(struct rbd_request *req,
  * Send ceph osd request
  */
 static int rbd_do_request(struct request *rq,
-			  struct rbd_device *dev,
+			  struct rbd_device *rbd_dev,
 			  struct ceph_snap_context *snapc,
 			  u64 snapid,
-			  const char *obj, u64 ofs, u64 len,
+			  const char *object_name, u64 ofs, u64 len,
 			  struct bio *bio,
 			  struct page **pages,
 			  int num_pages,
 			  int flags,
 			  struct ceph_osd_req_op *ops,
-			  int num_reply,
 			  struct rbd_req_coll *coll,
 			  int coll_index,
 			  void (*rbd_cb)(struct ceph_osd_request *req,
@@ -854,7 +891,7 @@ static int rbd_do_request(struct request *rq,
 	struct timespec mtime = CURRENT_TIME;
 	struct rbd_request *req_data;
 	struct ceph_osd_request_head *reqhead;
-	struct rbd_image_header *header = &dev->header;
+	struct ceph_osd_client *osdc;
 
 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
 	if (!req_data) {
@@ -869,17 +906,13 @@ static int rbd_do_request(struct request *rq,
 		req_data->coll_index = coll_index;
 	}
 
-	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
+	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
+		(unsigned long long) ofs, (unsigned long long) len);
 
-	down_read(&header->snap_rwsem);
-
-	req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
-				      snapc,
-				      ops,
-				      false,
-				      GFP_NOIO, pages, bio);
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+					false, GFP_NOIO, pages, bio);
 	if (!req) {
-		up_read(&header->snap_rwsem);
 		ret = -ENOMEM;
 		goto done_pages;
 	}
@@ -896,7 +929,7 @@ static int rbd_do_request(struct request *rq,
 	reqhead = req->r_request->front.iov_base;
 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
 
-	strncpy(req->r_oid, obj, sizeof(req->r_oid));
+	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
 	req->r_oid_len = strlen(req->r_oid);
 
 	layout = &req->r_file_layout;
@@ -904,33 +937,32 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_preferred = cpu_to_le32(-1);
-	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
-	ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
-			     ofs, &len, &bno, req, ops);
+	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
+	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
+				req, ops);
 
 	ceph_osdc_build_request(req, ofs, &len,
 				ops,
 				snapc,
 				&mtime,
 				req->r_oid, req->r_oid_len);
-	up_read(&header->snap_rwsem);
 
 	if (linger_req) {
-		ceph_osdc_set_request_linger(&dev->client->osdc, req);
+		ceph_osdc_set_request_linger(osdc, req);
 		*linger_req = req;
 	}
 
-	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
 		goto done_err;
 
 	if (!rbd_cb) {
-		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+		ret = ceph_osdc_wait_request(osdc, req);
 		if (ver)
 			*ver = le64_to_cpu(req->r_reassert_version.version);
-		dout("reassert_ver=%lld\n",
-		     le64_to_cpu(req->r_reassert_version.version));
+		dout("reassert_ver=%llu\n",
+			(unsigned long long)
+				le64_to_cpu(req->r_reassert_version.version));
 		ceph_osdc_put_request(req);
 	}
 	return ret;
@@ -962,9 +994,10 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 	op = (void *)(replyhead + 1);
 	rc = le32_to_cpu(replyhead->result);
 	bytes = le64_to_cpu(op->extent.length);
-	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
 
-	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
+		(unsigned long long) bytes, read_op, (int) rc);
 
 	if (rc == -ENOENT && read_op) {
 		zero_bio_chain(req_data->bio, 0);
@@ -991,14 +1024,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg
 /*
  * Do a synchronous ceph osd operation
  */
-static int rbd_req_sync_op(struct rbd_device *dev,
+static int rbd_req_sync_op(struct rbd_device *rbd_dev,
 			   struct ceph_snap_context *snapc,
 			   u64 snapid,
-			   int opcode,
 			   int flags,
-			   struct ceph_osd_req_op *orig_ops,
-			   int num_reply,
-			   const char *obj,
+			   struct ceph_osd_req_op *ops,
+			   const char *object_name,
 			   u64 ofs, u64 len,
 			   char *buf,
 			   struct ceph_osd_request **linger_req,
@@ -1007,45 +1038,28 @@ static int rbd_req_sync_op(struct rbd_device *dev,
 	int ret;
 	struct page **pages;
 	int num_pages;
-	struct ceph_osd_req_op *ops = orig_ops;
-	u32 payload_len;
+
+	BUG_ON(ops == NULL);
 
 	num_pages = calc_pages_for(ofs , len);
 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	if (!orig_ops) {
-		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
-		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
-		if (ret < 0)
-			goto done;
-
-		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
-			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
-			if (ret < 0)
-				goto done_ops;
-		}
-	}
-
-	ret = rbd_do_request(NULL, dev, snapc, snapid,
-			  obj, ofs, len, NULL,
+	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
+			  object_name, ofs, len, NULL,
 			  pages, num_pages,
 			  flags,
 			  ops,
-			  2,
 			  NULL, 0,
 			  NULL,
 			  linger_req, ver);
 	if (ret < 0)
-		goto done_ops;
+		goto done;
 
 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
 
-done_ops:
-	if (!orig_ops)
-		rbd_destroy_ops(ops);
 done:
 	ceph_release_page_vector(pages, num_pages);
 	return ret;
@@ -1055,10 +1069,10 @@ done:
  * Do an asynchronous ceph osd operation
  */
 static int rbd_do_op(struct request *rq,
-		     struct rbd_device *rbd_dev ,
+		     struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
 		     u64 snapid,
-		     int opcode, int flags, int num_reply,
+		     int opcode, int flags,
 		     u64 ofs, u64 len,
 		     struct bio *bio,
 		     struct rbd_req_coll *coll,
@@ -1076,14 +1090,15 @@ static int rbd_do_op(struct request *rq,
 		return -ENOMEM;
 
 	seg_len = rbd_get_segment(&rbd_dev->header,
-				  rbd_dev->header.block_name,
+				  rbd_dev->header.object_prefix,
 				  ofs, len,
 				  seg_name, &seg_ofs);
 
 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
 
-	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
-	if (ret < 0)
+	ret = -ENOMEM;
+	ops = rbd_create_rw_ops(1, opcode, payload_len);
+	if (!ops)
 		goto done;
 
 	/* we've taken care of segment sizes earlier when we
@@ -1097,7 +1112,6 @@ static int rbd_do_op(struct request *rq,
 			     NULL, 0,
 			     flags,
 			     ops,
-			     num_reply,
 			     coll, coll_index,
 			     rbd_req_cb, 0, NULL);
 
@@ -1121,7 +1135,6 @@ static int rbd_req_write(struct request *rq,
 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
 			 CEPH_OSD_OP_WRITE,
 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			 2,
 			 ofs, len, bio, coll, coll_index);
 }
 
@@ -1137,58 +1150,61 @@ static int rbd_req_read(struct request *rq,
 			 int coll_index)
 {
 	return rbd_do_op(rq, rbd_dev, NULL,
-			 (snapid ? snapid : CEPH_NOSNAP),
+			 snapid,
 			 CEPH_OSD_OP_READ,
 			 CEPH_OSD_FLAG_READ,
-			 2,
 			 ofs, len, bio, coll, coll_index);
 }
 
 /*
  * Request sync osd read
  */
-static int rbd_req_sync_read(struct rbd_device *dev,
-			  struct ceph_snap_context *snapc,
+static int rbd_req_sync_read(struct rbd_device *rbd_dev,
 			  u64 snapid,
-			  const char *obj,
+			  const char *object_name,
 			  u64 ofs, u64 len,
 			  char *buf,
 			  u64 *ver)
 {
-	return rbd_req_sync_op(dev, NULL,
-			       (snapid ? snapid : CEPH_NOSNAP),
-			       CEPH_OSD_OP_READ,
+	struct ceph_osd_req_op *ops;
+	int ret;
+
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
+	if (!ops)
+		return -ENOMEM;
+
+	ret = rbd_req_sync_op(rbd_dev, NULL,
+			       snapid,
 			       CEPH_OSD_FLAG_READ,
-			       NULL,
-			       1, obj, ofs, len, buf, NULL, ver);
+			       ops, object_name, ofs, len, buf, NULL, ver);
+	rbd_destroy_ops(ops);
+
+	return ret;
 }
 
 /*
  * Request sync osd watch
  */
-static int rbd_req_sync_notify_ack(struct rbd_device *dev,
+static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
 				   u64 ver,
-				   u64 notify_id,
-				   const char *obj)
+				   u64 notify_id)
 {
 	struct ceph_osd_req_op *ops;
-	struct page **pages = NULL;
 	int ret;
 
-	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
+	if (!ops)
+		return -ENOMEM;
 
-	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
+	ops[0].watch.ver = cpu_to_le64(ver);
 	ops[0].watch.cookie = notify_id;
 	ops[0].watch.flag = 0;
 
-	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
-			  obj, 0, 0, NULL,
-			  pages, 0,
+	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
+			  rbd_dev->header_name, 0, 0, NULL,
+			  NULL, 0,
 			  CEPH_OSD_FLAG_READ,
 			  ops,
-			  1,
 			  NULL, 0,
 			  rbd_simple_req_cb, 0, NULL);
 
@@ -1198,54 +1214,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
 
 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 {
-	struct rbd_device *dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	u64 hver;
 	int rc;
 
-	if (!dev)
+	if (!rbd_dev)
 		return;
 
-	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
-		notify_id, (int)opcode);
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rc = __rbd_update_snaps(dev);
-	mutex_unlock(&ctl_mutex);
+	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
+		rbd_dev->header_name, (unsigned long long) notify_id,
+		(unsigned int) opcode);
+	rc = rbd_refresh_header(rbd_dev, &hver);
 	if (rc)
-		pr_warning(DRV_NAME "%d got notification but failed to update"
-			   " snaps: %d\n", dev->major, rc);
+		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
+			   " update snaps: %d\n", rbd_dev->major, rc);
 
-	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
+	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
 }
 
 /*
  * Request sync osd watch
  */
-static int rbd_req_sync_watch(struct rbd_device *dev,
-			      const char *obj,
-			      u64 ver)
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
-	struct ceph_osd_client *osdc = &dev->client->osdc;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
+	if (!ops)
+		return -ENOMEM;
 
 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
-				     (void *)dev, &dev->watch_event);
+				     (void *)rbd_dev, &rbd_dev->watch_event);
 	if (ret < 0)
 		goto fail;
 
-	ops[0].watch.ver = cpu_to_le64(ver);
-	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
+	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 1;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			      CEPH_NOSNAP,
-			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      1, obj, 0, 0, NULL,
-			      &dev->watch_request, NULL);
+			      rbd_dev->header_name,
+			      0, 0, NULL,
+			      &rbd_dev->watch_request, NULL);
 
 	if (ret < 0)
 		goto fail_event;
@@ -1254,8 +1269,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
 	return 0;
 
 fail_event:
-	ceph_osdc_cancel_event(dev->watch_event);
-	dev->watch_event = NULL;
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
 fail:
 	rbd_destroy_ops(ops);
 	return ret;
@@ -1264,64 +1279,65 @@ fail:
 /*
  * Request sync osd unwatch
  */
-static int rbd_req_sync_unwatch(struct rbd_device *dev,
-				const char *obj)
+static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
+	int ret;
 
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
+	if (!ops)
+		return -ENOMEM;
 
 	ops[0].watch.ver = 0;
-	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
 	ops[0].watch.flag = 0;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			      CEPH_NOSNAP,
-			      0,
 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			      ops,
-			      1, obj, 0, 0, NULL, NULL, NULL);
+			      rbd_dev->header_name,
+			      0, 0, NULL, NULL, NULL);
+
 
 	rbd_destroy_ops(ops);
-	ceph_osdc_cancel_event(dev->watch_event);
-	dev->watch_event = NULL;
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
 	return ret;
 }
 
 struct rbd_notify_info {
-	struct rbd_device *dev;
+	struct rbd_device *rbd_dev;
 };
 
 static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 {
-	struct rbd_device *dev = (struct rbd_device *)data;
-	if (!dev)
+	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	if (!rbd_dev)
 		return;
 
-	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
-		notify_id, (int)opcode);
+	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
+			rbd_dev->header_name, (unsigned long long) notify_id,
+			(unsigned int) opcode);
 }
 
 /*
  * Request sync osd notify
  */
-static int rbd_req_sync_notify(struct rbd_device *dev,
-		          const char *obj)
+static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_req_op *ops;
-	struct ceph_osd_client *osdc = &dev->client->osdc;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct ceph_osd_event *event;
 	struct rbd_notify_info info;
 	int payload_len = sizeof(u32) + sizeof(u32);
 	int ret;
 
-	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
-	if (ret < 0)
-		return ret;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
+	if (!ops)
+		return -ENOMEM;
 
-	info.dev = dev;
+	info.rbd_dev = rbd_dev;
 
 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
 				     (void *)&info, &event);
@@ -1334,12 +1350,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
 	ops[0].watch.timeout = 12;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       CEPH_NOSNAP,
-			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, obj, 0, 0, NULL, NULL, NULL);
+			       rbd_dev->header_name,
+			       0, 0, NULL, NULL, NULL);
 	if (ret < 0)
 		goto fail_event;
 
@@ -1356,64 +1372,39 @@ fail:
 }
 
 /*
- * Request sync osd rollback
- */
-static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
-				     u64 snapid,
-				     const char *obj)
-{
-	struct ceph_osd_req_op *ops;
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
-	if (ret < 0)
-		return ret;
-
-	ops[0].snap.snapid = snapid;
-
-	ret = rbd_req_sync_op(dev, NULL,
-			       CEPH_NOSNAP,
-			       0,
-			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			       ops,
-			       1, obj, 0, 0, NULL, NULL, NULL);
-
-	rbd_destroy_ops(ops);
-
-	return ret;
-}
-
-/*
  * Request sync osd read
  */
-static int rbd_req_sync_exec(struct rbd_device *dev,
-			     const char *obj,
-			     const char *cls,
-			     const char *method,
+static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
+			     const char *object_name,
+			     const char *class_name,
+			     const char *method_name,
 			     const char *data,
 			     int len,
 			     u64 *ver)
 {
 	struct ceph_osd_req_op *ops;
-	int cls_len = strlen(cls);
-	int method_len = strlen(method);
-	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
-				    cls_len + method_len + len);
-	if (ret < 0)
-		return ret;
+	int class_name_len = strlen(class_name);
+	int method_name_len = strlen(method_name);
+	int ret;
 
-	ops[0].cls.class_name = cls;
-	ops[0].cls.class_len = (__u8)cls_len;
-	ops[0].cls.method_name = method;
-	ops[0].cls.method_len = (__u8)method_len;
+	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
+				    class_name_len + method_name_len + len);
+	if (!ops)
+		return -ENOMEM;
+
+	ops[0].cls.class_name = class_name;
+	ops[0].cls.class_len = (__u8) class_name_len;
+	ops[0].cls.method_name = method_name;
+	ops[0].cls.method_len = (__u8) method_name_len;
 	ops[0].cls.argc = 0;
 	ops[0].cls.indata = data;
 	ops[0].cls.indata_len = len;
 
-	ret = rbd_req_sync_op(dev, NULL,
+	ret = rbd_req_sync_op(rbd_dev, NULL,
 			       CEPH_NOSNAP,
-			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, obj, 0, 0, NULL, NULL, ver);
+			       object_name, 0, 0, NULL, NULL, ver);
 
 	rbd_destroy_ops(ops);
 
@@ -1444,16 +1435,16 @@ static void rbd_rq_fn(struct request_queue *q)
 	struct request *rq;
 	struct bio_pair *bp = NULL;
 
-	rq = blk_fetch_request(q);
-
-	while (1) {
+	while ((rq = blk_fetch_request(q))) {
 		struct bio *bio;
 		struct bio *rq_bio, *next_bio = NULL;
 		bool do_write;
-		int size, op_size = 0;
+		unsigned int size;
+		u64 op_size = 0;
 		u64 ofs;
 		int num_segs, cur_seg = 0;
 		struct rbd_req_coll *coll;
+		struct ceph_snap_context *snapc;
 
 		/* peek at request from block layer */
 		if (!rq)
@@ -1464,39 +1455,54 @@ static void rbd_rq_fn(struct request_queue *q)
 		/* filter out block requests we don't understand */
 		if ((rq->cmd_type != REQ_TYPE_FS)) {
 			__blk_end_request_all(rq, 0);
-			goto next;
+			continue;
 		}
 
 		/* deduce our operation (read, write) */
 		do_write = (rq_data_dir(rq) == WRITE);
 
 		size = blk_rq_bytes(rq);
-		ofs = blk_rq_pos(rq) * 512ULL;
+		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
 		rq_bio = rq->bio;
 		if (do_write && rbd_dev->read_only) {
 			__blk_end_request_all(rq, -EROFS);
-			goto next;
+			continue;
 		}
 
 		spin_unlock_irq(q->queue_lock);
 
+		down_read(&rbd_dev->header_rwsem);
+
+		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
+			up_read(&rbd_dev->header_rwsem);
+			dout("request for non-existent snapshot");
+			spin_lock_irq(q->queue_lock);
+			__blk_end_request_all(rq, -ENXIO);
+			continue;
+		}
+
+		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+
+		up_read(&rbd_dev->header_rwsem);
+
 		dout("%s 0x%x bytes at 0x%llx\n",
 		     do_write ? "write" : "read",
-		     size, blk_rq_pos(rq) * 512ULL);
+		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
 
 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
 		coll = rbd_alloc_coll(num_segs);
 		if (!coll) {
 			spin_lock_irq(q->queue_lock);
 			__blk_end_request_all(rq, -ENOMEM);
-			goto next;
+			ceph_put_snap_context(snapc);
+			continue;
 		}
 
 		do {
 			/* a bio clone to be passed down to OSD req */
-			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
 			op_size = rbd_get_segment(&rbd_dev->header,
-						  rbd_dev->header.block_name,
+						  rbd_dev->header.object_prefix,
 						  ofs, size,
 						  NULL, NULL);
 			kref_get(&coll->kref);
@@ -1512,13 +1518,13 @@ static void rbd_rq_fn(struct request_queue *q)
 			/* init OSD command: write or read */
 			if (do_write)
 				rbd_req_write(rq, rbd_dev,
-					      rbd_dev->header.snapc,
+					      snapc,
 					      ofs,
 					      op_size, bio,
 					      coll, cur_seg);
 			else
 				rbd_req_read(rq, rbd_dev,
-					     cur_snap_id(rbd_dev),
+					     rbd_dev->snap_id,
 					     ofs,
 					     op_size, bio,
 					     coll, cur_seg);
@@ -1535,8 +1541,8 @@ next_seg:
 		if (bp)
 			bio_pair_release(bp);
 		spin_lock_irq(q->queue_lock);
-next:
-		rq = blk_fetch_request(q);
+
+		ceph_put_snap_context(snapc);
 	}
 }
 
@@ -1549,13 +1555,17 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
 			  struct bio_vec *bvec)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
-	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
-	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
-	unsigned int bio_sectors = bmd->bi_size >> 9;
+	unsigned int chunk_sectors;
+	sector_t sector;
+	unsigned int bio_sectors;
 	int max;
 
+	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
+
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
-				 + bio_sectors)) << 9;
+				 + bio_sectors)) << SECTOR_SHIFT;
 	if (max < 0)
 		max = 0; /* bio_add cannot handle a negative return */
 	if (max <= bvec->bv_len && bio_sectors == 0)
@@ -1587,40 +1597,48 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 {
 	ssize_t rc;
 	struct rbd_image_header_ondisk *dh;
-	int snap_count = 0;
-	u64 snap_names_len = 0;
+	u32 snap_count = 0;
 	u64 ver;
+	size_t len;
 
+	/*
+	 * First reads the fixed-size header to determine the number
+	 * of snapshots, then re-reads it, along with all snapshot
+	 * records as well as their stored names.
+	 */
+	len = sizeof (*dh);
 	while (1) {
-		int len = sizeof(*dh) +
-			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
-			  snap_names_len;
-
-		rc = -ENOMEM;
 		dh = kmalloc(len, GFP_KERNEL);
 		if (!dh)
 			return -ENOMEM;
 
 		rc = rbd_req_sync_read(rbd_dev,
-				       NULL, CEPH_NOSNAP,
-				       rbd_dev->obj_md_name,
+				       CEPH_NOSNAP,
+				       rbd_dev->header_name,
 				       0, len,
 				       (char *)dh, &ver);
 		if (rc < 0)
 			goto out_dh;
 
-		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
-		if (rc < 0)
+		rc = rbd_header_from_disk(header, dh, snap_count);
+		if (rc < 0) {
+			if (rc == -ENXIO)
+				pr_warning("unrecognized header format"
+					   " for image %s\n",
+					   rbd_dev->image_name);
 			goto out_dh;
-
-		if (snap_count != header->total_snaps) {
-			snap_count = header->total_snaps;
-			snap_names_len = header->snap_names_len;
-			rbd_header_free(header);
-			kfree(dh);
-			continue;
 		}
-		break;
+
+		if (snap_count == header->total_snaps)
+			break;
+
+		snap_count = header->total_snaps;
+		len = sizeof (*dh) +
+			snap_count * sizeof(struct rbd_image_snap_ondisk) +
+			header->snap_names_len;
+
+		rbd_header_free(header);
+		kfree(dh);
 	}
 	header->obj_version = ver;
 
@@ -1632,7 +1650,7 @@ out_dh:
 /*
  * create a snapshot
  */
-static int rbd_header_add_snap(struct rbd_device *dev,
+static int rbd_header_add_snap(struct rbd_device *rbd_dev,
 			       const char *snap_name,
 			       gfp_t gfp_flags)
 {
@@ -1640,15 +1658,15 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	u64 new_snapid;
 	int ret;
 	void *data, *p, *e;
-	u64 ver;
+	struct ceph_mon_client *monc;
 
 	/* we should create a snapshot only if we're pointing at the head */
-	if (dev->cur_snap)
+	if (rbd_dev->snap_id != CEPH_NOSNAP)
 		return -EINVAL;
 
-	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
-				      &new_snapid);
-	dout("created snapid=%lld\n", new_snapid);
+	monc = &rbd_dev->rbd_client->client->monc;
+	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
+	dout("created snapid=%llu\n", (unsigned long long) new_snapid);
 	if (ret < 0)
 		return ret;
 
@@ -1662,17 +1680,13 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
 	ceph_encode_64_safe(&p, e, new_snapid, bad);
 
-	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
-				data, p - data, &ver);
+	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+				"rbd", "snap_add",
+				data, p - data, NULL);
 
 	kfree(data);
 
-	if (ret < 0)
-		return ret;
-
-	dev->header.snapc->seq =  new_snapid;
-
-	return 0;
+	return ret < 0 ? ret : 0;
 bad:
 	return -ERANGE;
 }
@@ -1680,56 +1694,67 @@ bad:
 static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 {
 	struct rbd_snap *snap;
+	struct rbd_snap *next;
 
-	while (!list_empty(&rbd_dev->snaps)) {
-		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
-		__rbd_remove_snap_dev(rbd_dev, snap);
-	}
+	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
+		__rbd_remove_snap_dev(snap);
 }
 
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
-static int __rbd_update_snaps(struct rbd_device *rbd_dev)
+static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 {
 	int ret;
 	struct rbd_image_header h;
-	u64 snap_seq;
-	int follow_seq = 0;
 
 	ret = rbd_read_header(rbd_dev, &h);
 	if (ret < 0)
 		return ret;
 
-	/* resized? */
-	set_capacity(rbd_dev->disk, h.image_size / 512ULL);
+	down_write(&rbd_dev->header_rwsem);
 
-	down_write(&rbd_dev->header.snap_rwsem);
+	/* resized? */
+	if (rbd_dev->snap_id == CEPH_NOSNAP) {
+		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
 
-	snap_seq = rbd_dev->header.snapc->seq;
-	if (rbd_dev->header.total_snaps &&
-	    rbd_dev->header.snapc->snaps[0] == snap_seq)
-		/* pointing at the head, will need to follow that
-		   if head moves */
-		follow_seq = 1;
+		dout("setting size to %llu sectors", (unsigned long long) size);
+		set_capacity(rbd_dev->disk, size);
+	}
 
-	kfree(rbd_dev->header.snapc);
-	kfree(rbd_dev->header.snap_names);
+	/* rbd_dev->header.object_prefix shouldn't change */
 	kfree(rbd_dev->header.snap_sizes);
+	kfree(rbd_dev->header.snap_names);
+	/* osd requests may still refer to snapc */
+	ceph_put_snap_context(rbd_dev->header.snapc);
 
+	if (hver)
+		*hver = h.obj_version;
+	rbd_dev->header.obj_version = h.obj_version;
+	rbd_dev->header.image_size = h.image_size;
 	rbd_dev->header.total_snaps = h.total_snaps;
 	rbd_dev->header.snapc = h.snapc;
 	rbd_dev->header.snap_names = h.snap_names;
 	rbd_dev->header.snap_names_len = h.snap_names_len;
 	rbd_dev->header.snap_sizes = h.snap_sizes;
-	if (follow_seq)
-		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
-	else
-		rbd_dev->header.snapc->seq = snap_seq;
+	/* Free the extra copy of the object prefix */
+	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
+	kfree(h.object_prefix);
 
 	ret = __rbd_init_snaps_header(rbd_dev);
 
-	up_write(&rbd_dev->header.snap_rwsem);
+	up_write(&rbd_dev->header_rwsem);
+
+	return ret;
+}
+
+static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+{
+	int ret;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	ret = __rbd_refresh_header(rbd_dev, hver);
+	mutex_unlock(&ctl_mutex);
 
 	return ret;
 }
@@ -1739,6 +1764,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	struct gendisk *disk;
 	struct request_queue *q;
 	int rc;
+	u64 segment_size;
 	u64 total_size = 0;
 
 	/* contact OSD, request size info about the object being mapped */
@@ -1751,7 +1777,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	if (rc)
 		return rc;
 
-	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+	rc = rbd_header_set_snap(rbd_dev, &total_size);
 	if (rc)
 		return rc;
 
@@ -1761,8 +1787,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	if (!disk)
 		goto out;
 
-	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
-		 rbd_dev->id);
+	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
+		 rbd_dev->dev_id);
 	disk->major = rbd_dev->major;
 	disk->first_minor = 0;
 	disk->fops = &rbd_bd_ops;
@@ -1774,11 +1800,15 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	if (!q)
 		goto out_disk;
 
+	/* We use the default size, but let's be explicit about it. */
+	blk_queue_physical_block_size(q, SECTOR_SIZE);
+
 	/* set io sizes to object size */
-	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
-	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
-	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
-	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));
+	segment_size = rbd_obj_bytes(&rbd_dev->header);
+	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+	blk_queue_max_segment_size(q, segment_size);
+	blk_queue_io_min(q, segment_size);
+	blk_queue_io_opt(q, segment_size);
 
 	blk_queue_merge_bvec(q, rbd_merge_bvec);
 	disk->queue = q;
@@ -1789,7 +1819,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	rbd_dev->q = q;
 
 	/* finally, announce the disk to the world */
-	set_capacity(disk, total_size / 512ULL);
+	set_capacity(disk, total_size / SECTOR_SIZE);
 	add_disk(disk);
 
 	pr_info("%s: added with size 0x%llx\n",
@@ -1806,18 +1836,28 @@ out:
   sysfs
 */
 
+static struct rbd_device *dev_to_rbd_dev(struct device *dev)
+{
+	return container_of(dev, struct rbd_device, dev);
+}
+
 static ssize_t rbd_size_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	sector_t size;
+
+	down_read(&rbd_dev->header_rwsem);
+	size = get_capacity(rbd_dev->disk);
+	up_read(&rbd_dev->header_rwsem);
 
-	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
+	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
 }
 
 static ssize_t rbd_major_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
 	return sprintf(buf, "%d\n", rbd_dev->major);
 }
@@ -1825,32 +1865,41 @@ static ssize_t rbd_major_show(struct device *dev,
 static ssize_t rbd_client_id_show(struct device *dev,
 				  struct device_attribute *attr, char *buf)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
+	return sprintf(buf, "client%lld\n",
+			ceph_client_id(rbd_dev->rbd_client->client));
 }
 
 static ssize_t rbd_pool_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
 }
 
+static ssize_t rbd_pool_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%d\n", rbd_dev->pool_id);
+}
+
 static ssize_t rbd_name_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->obj);
+	return sprintf(buf, "%s\n", rbd_dev->image_name);
 }
 
 static ssize_t rbd_snap_show(struct device *dev,
 			     struct device_attribute *attr,
 			     char *buf)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
 }
@@ -1860,40 +1909,34 @@ static ssize_t rbd_image_refresh(struct device *dev,
 				 const char *buf,
 				 size_t size)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
-	int rc;
-	int ret = size;
-
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	int ret;
 
-	rc = __rbd_update_snaps(rbd_dev);
-	if (rc < 0)
-		ret = rc;
+	ret = rbd_refresh_header(rbd_dev, NULL);
 
-	mutex_unlock(&ctl_mutex);
-	return ret;
+	return ret < 0 ? ret : size;
 }
 
 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
 static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
-static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);
 
 static struct attribute *rbd_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_major.attr,
 	&dev_attr_client_id.attr,
 	&dev_attr_pool.attr,
+	&dev_attr_pool_id.attr,
 	&dev_attr_name.attr,
 	&dev_attr_current_snap.attr,
 	&dev_attr_refresh.attr,
 	&dev_attr_create_snap.attr,
-	&dev_attr_rollback_snap.attr,
 	NULL
 };
 
@@ -1927,7 +1970,7 @@ static ssize_t rbd_snap_size_show(struct device *dev,
 {
 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 
-	return sprintf(buf, "%lld\n", (long long)snap->size);
+	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
 }
 
 static ssize_t rbd_snap_id_show(struct device *dev,
@@ -1936,7 +1979,7 @@ static ssize_t rbd_snap_id_show(struct device *dev,
 {
 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 
-	return sprintf(buf, "%lld\n", (long long)snap->id);
+	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
 }
 
 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
@@ -1969,15 +2012,13 @@ static struct device_type rbd_snap_device_type = {
 	.release	= rbd_snap_dev_release,
 };
 
-static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
-				  struct rbd_snap *snap)
+static void __rbd_remove_snap_dev(struct rbd_snap *snap)
 {
 	list_del(&snap->node);
 	device_unregister(&snap->dev);
 }
 
-static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
-				  struct rbd_snap *snap,
+static int rbd_register_snap_dev(struct rbd_snap *snap,
 				  struct device *parent)
 {
 	struct device *dev = &snap->dev;
@@ -1992,29 +2033,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
 	return ret;
 }
 
-static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
-			      int i, const char *name,
-			      struct rbd_snap **snapp)
+static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
+					      int i, const char *name)
 {
+	struct rbd_snap *snap;
 	int ret;
-	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
+
+	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
 	if (!snap)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+
+	ret = -ENOMEM;
 	snap->name = kstrdup(name, GFP_KERNEL);
+	if (!snap->name)
+		goto err;
+
 	snap->size = rbd_dev->header.snap_sizes[i];
 	snap->id = rbd_dev->header.snapc->snaps[i];
 	if (device_is_registered(&rbd_dev->dev)) {
-		ret = rbd_register_snap_dev(rbd_dev, snap,
-					     &rbd_dev->dev);
+		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
 		if (ret < 0)
 			goto err;
 	}
-	*snapp = snap;
-	return 0;
+
+	return snap;
+
 err:
 	kfree(snap->name);
 	kfree(snap);
-	return ret;
+
+	return ERR_PTR(ret);
 }
 
 /*
@@ -2047,7 +2095,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 	const char *name, *first_name;
 	int i = rbd_dev->header.total_snaps;
 	struct rbd_snap *snap, *old_snap = NULL;
-	int ret;
 	struct list_head *p, *n;
 
 	first_name = rbd_dev->header.snap_names;
@@ -2062,8 +2109,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			cur_id = rbd_dev->header.snapc->snaps[i - 1];
 
 		if (!i || old_snap->id < cur_id) {
-			/* old_snap->id was skipped, thus was removed */
-			__rbd_remove_snap_dev(rbd_dev, old_snap);
+			/*
+			 * old_snap->id was skipped, thus was
+			 * removed.  If this rbd_dev is mapped to
+			 * the removed snapshot, record that it no
+			 * longer exists, to prevent further I/O.
+			 */
+			if (rbd_dev->snap_id == old_snap->id)
+				rbd_dev->snap_exists = false;
+			__rbd_remove_snap_dev(old_snap);
 			continue;
 		}
 		if (old_snap->id == cur_id) {
@@ -2083,9 +2137,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			if (cur_id >= old_snap->id)
 				break;
 			/* a new snapshot */
-			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
-			if (ret < 0)
-				return ret;
+			snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
+			if (IS_ERR(snap))
+				return PTR_ERR(snap);
 
 			/* note that we add it backward so using n and not p */
 			list_add(&snap->node, n);
@@ -2099,28 +2153,18 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
 			WARN_ON(1);
 			return -EINVAL;
 		}
-		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
-		if (ret < 0)
-			return ret;
+		snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
+		if (IS_ERR(snap))
+			return PTR_ERR(snap);
 		list_add(&snap->node, &rbd_dev->snaps);
 	}
 
 	return 0;
 }
 
-
-static void rbd_root_dev_release(struct device *dev)
-{
-}
-
-static struct device rbd_root_dev = {
-	.init_name =    "rbd",
-	.release =      rbd_root_dev_release,
-};
-
 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
 {
-	int ret = -ENOMEM;
+	int ret;
 	struct device *dev;
 	struct rbd_snap *snap;
 
@@ -2131,21 +2175,17 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
 	dev->type = &rbd_device_type;
 	dev->parent = &rbd_root_dev;
 	dev->release = rbd_dev_release;
-	dev_set_name(dev, "%d", rbd_dev->id);
+	dev_set_name(dev, "%d", rbd_dev->dev_id);
 	ret = device_register(dev);
 	if (ret < 0)
-		goto done_free;
+		goto out;
 
 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
-		ret = rbd_register_snap_dev(rbd_dev, snap,
-					     &rbd_dev->dev);
+		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
 		if (ret < 0)
 			break;
 	}
-
-	mutex_unlock(&ctl_mutex);
-	return 0;
-done_free:
+out:
 	mutex_unlock(&ctl_mutex);
 	return ret;
 }
@@ -2160,12 +2200,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	int ret, rc;
 
 	do {
-		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
-					 rbd_dev->header.obj_version);
+		ret = rbd_req_sync_watch(rbd_dev);
 		if (ret == -ERANGE) {
-			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-			rc = __rbd_update_snaps(rbd_dev);
-			mutex_unlock(&ctl_mutex);
+			rc = rbd_refresh_header(rbd_dev, NULL);
 			if (rc < 0)
 				return rc;
 		}
@@ -2174,102 +2211,305 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	return ret;
 }
 
+static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
+
+/*
+ * Get a unique rbd identifier for the given new rbd_dev, and add
+ * the rbd_dev to the global list.  The minimum rbd id is 1.
+ */
+static void rbd_id_get(struct rbd_device *rbd_dev)
+{
+	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
+
+	spin_lock(&rbd_dev_list_lock);
+	list_add_tail(&rbd_dev->node, &rbd_dev_list);
+	spin_unlock(&rbd_dev_list_lock);
+}
+
+/*
+ * Remove an rbd_dev from the global list, and record that its
+ * identifier is no longer in use.
+ */
+static void rbd_id_put(struct rbd_device *rbd_dev)
+{
+	struct list_head *tmp;
+	int rbd_id = rbd_dev->dev_id;
+	int max_id;
+
+	BUG_ON(rbd_id < 1);
+
+	spin_lock(&rbd_dev_list_lock);
+	list_del_init(&rbd_dev->node);
+
+	/*
+	 * If the id being "put" is not the current maximum, there
+	 * is nothing special we need to do.
+	 */
+	if (rbd_id != atomic64_read(&rbd_id_max)) {
+		spin_unlock(&rbd_dev_list_lock);
+		return;
+	}
+
+	/*
+	 * We need to update the current maximum id.  Search the
+	 * list to find out what it is.  We're more likely to find
+	 * the maximum at the end, so search the list backward.
+	 */
+	max_id = 0;
+	list_for_each_prev(tmp, &rbd_dev_list) {
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		if (rbd_id > max_id)
+			max_id = rbd_id;
+	}
+	spin_unlock(&rbd_dev_list_lock);
+
+	/*
+	 * The max id could have been updated by rbd_id_get(), in
+	 * which case it now accurately reflects the new maximum.
+	 * Be careful not to overwrite the maximum value in that
+	 * case.
+	 */
+	atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
+}
+
+/*
+ * Skips over white space at *buf, and updates *buf to point to the
+ * first found non-space character (if any). Returns the length of
+ * the token (string of non-white space characters) found.  Note
+ * that *buf must be terminated with '\0'.
+ */
+static inline size_t next_token(const char **buf)
+{
+        /*
+        * These are the characters that produce nonzero for
+        * isspace() in the "C" and "POSIX" locales.
+        */
+        const char *spaces = " \f\n\r\t\v";
+
+        *buf += strspn(*buf, spaces);	/* Find start of token */
+
+	return strcspn(*buf, spaces);   /* Return token length */
+}
+
+/*
+ * Finds the next token in *buf, and if the provided token buffer is
+ * big enough, copies the found token into it.  The result, if
+ * copied, is guaranteed to be terminated with '\0'.  Note that *buf
+ * must be terminated with '\0' on entry.
+ *
+ * Returns the length of the token found (not including the '\0').
+ * Return value will be 0 if no token is found, and it will be >=
+ * token_size if the token would not fit.
+ *
+ * The *buf pointer will be updated to point beyond the end of the
+ * found token.  Note that this occurs even if the token buffer is
+ * too small to hold it.
+ */
+static inline size_t copy_token(const char **buf,
+				char *token,
+				size_t token_size)
+{
+        size_t len;
+
+	len = next_token(buf);
+	if (len < token_size) {
+		memcpy(token, *buf, len);
+		*(token + len) = '\0';
+	}
+	*buf += len;
+
+        return len;
+}
+
+/*
+ * Finds the next token in *buf, dynamically allocates a buffer big
+ * enough to hold a copy of it, and copies the token into the new
+ * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
+ * that a duplicate buffer is created even for a zero-length token.
+ *
+ * Returns a pointer to the newly-allocated duplicate, or a null
+ * pointer if memory for the duplicate was not available.  If
+ * the lenp argument is a non-null pointer, the length of the token
+ * (not including the '\0') is returned in *lenp.
+ *
+ * If successful, the *buf pointer will be updated to point beyond
+ * the end of the found token.
+ *
+ * Note: uses GFP_KERNEL for allocation.
+ */
+static inline char *dup_token(const char **buf, size_t *lenp)
+{
+	char *dup;
+	size_t len;
+
+	len = next_token(buf);
+	dup = kmalloc(len + 1, GFP_KERNEL);
+	if (!dup)
+		return NULL;
+
+	memcpy(dup, *buf, len);
+	*(dup + len) = '\0';
+	*buf += len;
+
+	if (lenp)
+		*lenp = len;
+
+	return dup;
+}
+
+/*
+ * This fills in the pool_name, image_name, image_name_len, snap_name,
+ * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
+ * on the list of monitor addresses and other options provided via
+ * /sys/bus/rbd/add.
+ *
+ * Note: rbd_dev is assumed to have been initially zero-filled.
+ */
+static int rbd_add_parse_args(struct rbd_device *rbd_dev,
+			      const char *buf,
+			      const char **mon_addrs,
+			      size_t *mon_addrs_size,
+			      char *options,
+			     size_t options_size)
+{
+	size_t len;
+	int ret;
+
+	/* The first four tokens are required */
+
+	len = next_token(&buf);
+	if (!len)
+		return -EINVAL;
+	*mon_addrs_size = len + 1;
+	*mon_addrs = buf;
+
+	buf += len;
+
+	len = copy_token(&buf, options, options_size);
+	if (!len || len >= options_size)
+		return -EINVAL;
+
+	ret = -ENOMEM;
+	rbd_dev->pool_name = dup_token(&buf, NULL);
+	if (!rbd_dev->pool_name)
+		goto out_err;
+
+	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
+	if (!rbd_dev->image_name)
+		goto out_err;
+
+	/* Create the name of the header object */
+
+	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
+						+ sizeof (RBD_SUFFIX),
+					GFP_KERNEL);
+	if (!rbd_dev->header_name)
+		goto out_err;
+	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
+
+	/*
+	 * The snapshot name is optional.  If none is is supplied,
+	 * we use the default value.
+	 */
+	rbd_dev->snap_name = dup_token(&buf, &len);
+	if (!rbd_dev->snap_name)
+		goto out_err;
+	if (!len) {
+		/* Replace the empty name with the default */
+		kfree(rbd_dev->snap_name);
+		rbd_dev->snap_name
+			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
+		if (!rbd_dev->snap_name)
+			goto out_err;
+
+		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+			sizeof (RBD_SNAP_HEAD_NAME));
+	}
+
+	return 0;
+
+out_err:
+	kfree(rbd_dev->header_name);
+	kfree(rbd_dev->image_name);
+	kfree(rbd_dev->pool_name);
+	rbd_dev->pool_name = NULL;
+
+	return ret;
+}
+
 static ssize_t rbd_add(struct bus_type *bus,
 		       const char *buf,
 		       size_t count)
 {
-	struct ceph_osd_client *osdc;
-	struct rbd_device *rbd_dev;
-	ssize_t rc = -ENOMEM;
-	int irc, new_id = 0;
-	struct list_head *tmp;
-	char *mon_dev_name;
 	char *options;
+	struct rbd_device *rbd_dev = NULL;
+	const char *mon_addrs = NULL;
+	size_t mon_addrs_size = 0;
+	struct ceph_osd_client *osdc;
+	int rc = -ENOMEM;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
-	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
-	if (!mon_dev_name)
-		goto err_out_mod;
-
-	options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+	options = kmalloc(count, GFP_KERNEL);
 	if (!options)
-		goto err_mon_dev;
-
-	/* new rbd_device object */
+		goto err_nomem;
 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
 	if (!rbd_dev)
-		goto err_out_opt;
+		goto err_nomem;
 
 	/* static rbd_device initialization */
 	spin_lock_init(&rbd_dev->lock);
 	INIT_LIST_HEAD(&rbd_dev->node);
 	INIT_LIST_HEAD(&rbd_dev->snaps);
+	init_rwsem(&rbd_dev->header_rwsem);
 
 	/* generate unique id: find highest unique id, add one */
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-	list_for_each(tmp, &rbd_dev_list) {
-		struct rbd_device *rbd_dev;
+	rbd_id_get(rbd_dev);
 
-		rbd_dev = list_entry(tmp, struct rbd_device, node);
-		if (rbd_dev->id >= new_id)
-			new_id = rbd_dev->id + 1;
-	}
-
-	rbd_dev->id = new_id;
-
-	/* add to global list */
-	list_add_tail(&rbd_dev->node, &rbd_dev_list);
+	/* Fill in the device name, now that we have its id. */
+	BUILD_BUG_ON(DEV_NAME_LEN
+			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
+	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
 
 	/* parse add command */
-	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
-		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
-		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
-		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
-		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
-		   mon_dev_name, options, rbd_dev->pool_name,
-		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
-		rc = -EINVAL;
-		goto err_out_slot;
-	}
-
-	if (rbd_dev->snap_name[0] == 0)
-		rbd_dev->snap_name[0] = '-';
-
-	rbd_dev->obj_len = strlen(rbd_dev->obj);
-	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
-		 rbd_dev->obj, RBD_SUFFIX);
-
-	/* initialize rest of new object */
-	snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
-	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
-	if (rc < 0)
-		goto err_out_slot;
+	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
+				options, count);
+	if (rc)
+		goto err_put_id;
 
-	mutex_unlock(&ctl_mutex);
+	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
+						options);
+	if (IS_ERR(rbd_dev->rbd_client)) {
+		rc = PTR_ERR(rbd_dev->rbd_client);
+		goto err_put_id;
+	}
 
 	/* pick the pool */
-	osdc = &rbd_dev->client->osdc;
+	osdc = &rbd_dev->rbd_client->client->osdc;
 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
 	if (rc < 0)
 		goto err_out_client;
-	rbd_dev->poolid = rc;
+	rbd_dev->pool_id = rc;
 
 	/* register our block device */
-	irc = register_blkdev(0, rbd_dev->name);
-	if (irc < 0) {
-		rc = irc;
+	rc = register_blkdev(0, rbd_dev->name);
+	if (rc < 0)
 		goto err_out_client;
-	}
-	rbd_dev->major = irc;
+	rbd_dev->major = rc;
 
 	rc = rbd_bus_add_dev(rbd_dev);
 	if (rc)
 		goto err_out_blkdev;
 
-	/* set up and announce blkdev mapping */
+	/*
+	 * At this point cleanup in the event of an error is the job
+	 * of the sysfs code (initiated by rbd_bus_del_dev()).
+	 *
+	 * Set up and announce blkdev mapping.
+	 */
 	rc = rbd_init_disk(rbd_dev);
 	if (rc)
 		goto err_out_bus;
@@ -2281,66 +2521,76 @@ static ssize_t rbd_add(struct bus_type *bus,
 	return count;
 
 err_out_bus:
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	list_del_init(&rbd_dev->node);
-	mutex_unlock(&ctl_mutex);
-
 	/* this will also clean up rest of rbd_dev stuff */
 
 	rbd_bus_del_dev(rbd_dev);
 	kfree(options);
-	kfree(mon_dev_name);
 	return rc;
 
 err_out_blkdev:
 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
 err_out_client:
 	rbd_put_client(rbd_dev);
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-err_out_slot:
-	list_del_init(&rbd_dev->node);
-	mutex_unlock(&ctl_mutex);
-
+err_put_id:
+	if (rbd_dev->pool_name) {
+		kfree(rbd_dev->snap_name);
+		kfree(rbd_dev->header_name);
+		kfree(rbd_dev->image_name);
+		kfree(rbd_dev->pool_name);
+	}
+	rbd_id_put(rbd_dev);
+err_nomem:
 	kfree(rbd_dev);
-err_out_opt:
 	kfree(options);
-err_mon_dev:
-	kfree(mon_dev_name);
-err_out_mod:
+
 	dout("Error adding device %s\n", buf);
 	module_put(THIS_MODULE);
-	return rc;
+
+	return (ssize_t) rc;
 }
 
-static struct rbd_device *__rbd_get_dev(unsigned long id)
+static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
 {
 	struct list_head *tmp;
 	struct rbd_device *rbd_dev;
 
+	spin_lock(&rbd_dev_list_lock);
 	list_for_each(tmp, &rbd_dev_list) {
 		rbd_dev = list_entry(tmp, struct rbd_device, node);
-		if (rbd_dev->id == id)
+		if (rbd_dev->dev_id == dev_id) {
+			spin_unlock(&rbd_dev_list_lock);
 			return rbd_dev;
+		}
 	}
+	spin_unlock(&rbd_dev_list_lock);
 	return NULL;
 }
 
 static void rbd_dev_release(struct device *dev)
 {
-	struct rbd_device *rbd_dev =
-			container_of(dev, struct rbd_device, dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	if (rbd_dev->watch_request) {
+		struct ceph_client *client = rbd_dev->rbd_client->client;
 
-	if (rbd_dev->watch_request)
-		ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
+		ceph_osdc_unregister_linger_request(&client->osdc,
 						    rbd_dev->watch_request);
+	}
 	if (rbd_dev->watch_event)
-		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
+		rbd_req_sync_unwatch(rbd_dev);
 
 	rbd_put_client(rbd_dev);
 
 	/* clean up and free blkdev */
 	rbd_free_disk(rbd_dev);
 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
+
+	/* done with the id, and with the rbd_dev */
+	kfree(rbd_dev->snap_name);
+	kfree(rbd_dev->header_name);
+	kfree(rbd_dev->pool_name);
+	kfree(rbd_dev->image_name);
+	rbd_id_put(rbd_dev);
 	kfree(rbd_dev);
 
 	/* release module ref */
@@ -2373,8 +2623,6 @@ static ssize_t rbd_remove(struct bus_type *bus,
 		goto done;
 	}
 
-	list_del_init(&rbd_dev->node);
-
 	__rbd_remove_all_snaps(rbd_dev);
 	rbd_bus_del_dev(rbd_dev);
 
@@ -2388,7 +2636,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 			    const char *buf,
 			    size_t count)
 {
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	int ret;
 	char *name = kmalloc(count + 1, GFP_KERNEL);
 	if (!name)
@@ -2403,7 +2651,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 	if (ret < 0)
 		goto err_unlock;
 
-	ret = __rbd_update_snaps(rbd_dev);
+	ret = __rbd_refresh_header(rbd_dev, NULL);
 	if (ret < 0)
 		goto err_unlock;
 
@@ -2412,7 +2660,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 	mutex_unlock(&ctl_mutex);
 
 	/* make a best effort, don't error if failed */
-	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
+	rbd_req_sync_notify(rbd_dev);
 
 	ret = count;
 	kfree(name);
@@ -2424,70 +2672,6 @@ err_unlock:
 	return ret;
 }
 
-static ssize_t rbd_snap_rollback(struct device *dev,
-				 struct device_attribute *attr,
-				 const char *buf,
-				 size_t count)
-{
-	struct rbd_device *rbd_dev = dev_to_rbd(dev);
-	int ret;
-	u64 snapid;
-	u64 cur_ofs;
-	char *seg_name = NULL;
-	char *snap_name = kmalloc(count + 1, GFP_KERNEL);
-	ret = -ENOMEM;
-	if (!snap_name)
-		return ret;
-
-	/* parse snaps add command */
-	snprintf(snap_name, count, "%s", buf);
-	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
-	if (!seg_name)
-		goto done;
-
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-	ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
-	if (ret < 0)
-		goto done_unlock;
-
-	dout("snapid=%lld\n", snapid);
-
-	cur_ofs = 0;
-	while (cur_ofs < rbd_dev->header.image_size) {
-		cur_ofs += rbd_get_segment(&rbd_dev->header,
-					   rbd_dev->obj,
-					   cur_ofs, (u64)-1,
-					   seg_name, NULL);
-		dout("seg_name=%s\n", seg_name);
-
-		ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
-		if (ret < 0)
-			pr_warning("could not roll back obj %s err=%d\n",
-				   seg_name, ret);
-	}
-
-	ret = __rbd_update_snaps(rbd_dev);
-	if (ret < 0)
-		goto done_unlock;
-
-	ret = count;
-
-done_unlock:
-	mutex_unlock(&ctl_mutex);
-done:
-	kfree(seg_name);
-	kfree(snap_name);
-
-	return ret;
-}
-
-static struct bus_attribute rbd_bus_attrs[] = {
-	__ATTR(add, S_IWUSR, NULL, rbd_add),
-	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
-	__ATTR_NULL
-};
-
 /*
  * create control files in sysfs
  * /sys/bus/rbd/...
@@ -2496,21 +2680,21 @@ static int rbd_sysfs_init(void)
 {
 	int ret;
 
-	rbd_bus_type.bus_attrs = rbd_bus_attrs;
-
-	ret = bus_register(&rbd_bus_type);
-	 if (ret < 0)
+	ret = device_register(&rbd_root_dev);
+	if (ret < 0)
 		return ret;
 
-	ret = device_register(&rbd_root_dev);
+	ret = bus_register(&rbd_bus_type);
+	if (ret < 0)
+		device_unregister(&rbd_root_dev);
 
 	return ret;
 }
 
 static void rbd_sysfs_cleanup(void)
 {
-	device_unregister(&rbd_root_dev);
 	bus_unregister(&rbd_bus_type);
+	device_unregister(&rbd_root_dev);
 }
 
 int __init rbd_init(void)
@@ -2520,8 +2704,7 @@ int __init rbd_init(void)
 	rc = rbd_sysfs_init();
 	if (rc)
 		return rc;
-	spin_lock_init(&node_lock);
-	pr_info("loaded " DRV_NAME_LONG "\n");
+	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 	return 0;
 }
 
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index fc6c678aa2cb..0924e9e41a60 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -31,7 +31,6 @@
 #define RBD_MIN_OBJ_ORDER       16
 #define RBD_MAX_OBJ_ORDER       30
 
-#define RBD_MAX_OBJ_NAME_LEN	96
 #define RBD_MAX_SEG_NAME_LEN	128
 
 #define RBD_COMP_NONE		0
@@ -41,10 +40,6 @@
 #define RBD_HEADER_SIGNATURE	"RBD"
 #define RBD_HEADER_VERSION	"001.005"
 
-struct rbd_info {
-	__le64 max_id;
-} __attribute__ ((packed));
-
 struct rbd_image_snap_ondisk {
 	__le64 id;
 	__le64 image_size;
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 48e8fee9f2d4..9dcf76a10bb6 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -839,10 +839,7 @@ static struct vio_driver vdc_port_driver = {
 	.id_table	= vdc_port_match,
 	.probe		= vdc_port_probe,
 	.remove		= vdc_port_remove,
-	.driver		= {
-		.name	= "vdc_port",
-		.owner	= THIS_MODULE,
-	}
+	.name		= "vdc_port",
 };
 
 static int __init vdc_init(void)
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index fd5adcd55944..6d5a914b9619 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -26,7 +26,6 @@
 #include <linux/delay.h>
 #include <linux/platform_device.h>
 
-#include <asm/macintosh.h>
 #include <asm/mac_via.h>
 
 #define CARDNAME "swim"
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index ae3e167e17ad..89ddab127e33 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -16,6 +16,8 @@
  * handle GCR disks
  */
 
+#undef DEBUG
+
 #include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -36,13 +38,11 @@
 #include <asm/machdep.h>
 #include <asm/pmac_feature.h>
 
-static DEFINE_MUTEX(swim3_mutex);
-static struct request_queue *swim3_queue;
-static struct gendisk *disks[2];
-static struct request *fd_req;
-
 #define MAX_FLOPPIES	2
 
+static DEFINE_MUTEX(swim3_mutex);
+static struct gendisk *disks[MAX_FLOPPIES];
+
 enum swim_state {
 	idle,
 	locating,
@@ -177,7 +177,6 @@ struct swim3 {
 
 struct floppy_state {
 	enum swim_state	state;
-	spinlock_t lock;
 	struct swim3 __iomem *swim3;	/* hardware registers */
 	struct dbdma_regs __iomem *dma;	/* DMA controller registers */
 	int	swim3_intr;	/* interrupt number for SWIM3 */
@@ -204,8 +203,20 @@ struct floppy_state {
 	int	wanted;
 	struct macio_dev *mdev;
 	char	dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
+	int	index;
+	struct request *cur_req;
 };
 
+#define swim3_err(fmt, arg...)	dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_warn(fmt, arg...)	dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_info(fmt, arg...)	dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+
+#ifdef DEBUG
+#define swim3_dbg(fmt, arg...)	dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#else
+#define swim3_dbg(fmt, arg...)	do { } while(0)
+#endif
+
 static struct floppy_state floppy_states[MAX_FLOPPIES];
 static int floppy_count = 0;
 static DEFINE_SPINLOCK(swim3_lock);
@@ -224,17 +235,8 @@ static unsigned short write_postamble[] = {
 	0, 0, 0, 0, 0, 0
 };
 
-static void swim3_select(struct floppy_state *fs, int sel);
-static void swim3_action(struct floppy_state *fs, int action);
-static int swim3_readbit(struct floppy_state *fs, int bit);
-static void do_fd_request(struct request_queue * q);
-static void start_request(struct floppy_state *fs);
-static void set_timeout(struct floppy_state *fs, int nticks,
-			void (*proc)(unsigned long));
-static void scan_track(struct floppy_state *fs);
 static void seek_track(struct floppy_state *fs, int n);
 static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count);
-static void setup_transfer(struct floppy_state *fs);
 static void act(struct floppy_state *fs);
 static void scan_timeout(unsigned long data);
 static void seek_timeout(unsigned long data);
@@ -254,18 +256,21 @@ static unsigned int floppy_check_events(struct gendisk *disk,
 					unsigned int clearing);
 static int floppy_revalidate(struct gendisk *disk);
 
-static bool swim3_end_request(int err, unsigned int nr_bytes)
+static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes)
 {
-	if (__blk_end_request(fd_req, err, nr_bytes))
-		return true;
+	struct request *req = fs->cur_req;
+	int rc;
 
-	fd_req = NULL;
-	return false;
-}
+	swim3_dbg("  end request, err=%d nr_bytes=%d, cur_req=%p\n",
+		  err, nr_bytes, req);
 
-static bool swim3_end_request_cur(int err)
-{
-	return swim3_end_request(err, blk_rq_cur_bytes(fd_req));
+	if (err)
+		nr_bytes = blk_rq_cur_bytes(req);
+	rc = __blk_end_request(req, err, nr_bytes);
+	if (rc)
+		return true;
+	fs->cur_req = NULL;
+	return false;
 }
 
 static void swim3_select(struct floppy_state *fs, int sel)
@@ -303,50 +308,53 @@ static int swim3_readbit(struct floppy_state *fs, int bit)
 	return (stat & DATA) == 0;
 }
 
-static void do_fd_request(struct request_queue * q)
-{
-	int i;
-
-	for(i=0; i<floppy_count; i++) {
-		struct floppy_state *fs = &floppy_states[i];
-		if (fs->mdev->media_bay &&
-		    check_media_bay(fs->mdev->media_bay) != MB_FD)
-			continue;
-		start_request(fs);
-	}
-}
-
 static void start_request(struct floppy_state *fs)
 {
 	struct request *req;
 	unsigned long x;
 
+	swim3_dbg("start request, initial state=%d\n", fs->state);
+
 	if (fs->state == idle && fs->wanted) {
 		fs->state = available;
 		wake_up(&fs->wait);
 		return;
 	}
 	while (fs->state == idle) {
-		if (!fd_req) {
-			fd_req = blk_fetch_request(swim3_queue);
-			if (!fd_req)
+		swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req);
+		if (!fs->cur_req) {
+			fs->cur_req = blk_fetch_request(disks[fs->index]->queue);
+			swim3_dbg("  fetched request %p\n", fs->cur_req);
+			if (!fs->cur_req)
 				break;
 		}
-		req = fd_req;
-#if 0
-		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
-		       req->rq_disk->disk_name, req->cmd,
-		       (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
-		printk("           errors=%d current_nr_sectors=%u\n",
-		       req->errors, blk_rq_cur_sectors(req));
+		req = fs->cur_req;
+
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD) {
+			swim3_dbg("%s", "  media bay absent, dropping req\n");
+			swim3_end_request(fs, -ENODEV, 0);
+			continue;
+		}
+
+#if 0 /* This is really too verbose */
+		swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
+			  req->rq_disk->disk_name, req->cmd,
+			  (long)blk_rq_pos(req), blk_rq_sectors(req),
+			  req->buffer);
+		swim3_dbg("           errors=%d current_nr_sectors=%u\n",
+			  req->errors, blk_rq_cur_sectors(req));
 #endif
 
 		if (blk_rq_pos(req) >= fs->total_secs) {
-			swim3_end_request_cur(-EIO);
+			swim3_dbg("  pos out of bounds (%ld, max is %ld)\n",
+				  (long)blk_rq_pos(req), (long)fs->total_secs);
+			swim3_end_request(fs, -EIO, 0);
 			continue;
 		}
 		if (fs->ejected) {
-			swim3_end_request_cur(-EIO);
+			swim3_dbg("%s", "  disk ejected\n");
+			swim3_end_request(fs, -EIO, 0);
 			continue;
 		}
 
@@ -354,7 +362,8 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				swim3_end_request_cur(-EIO);
+				swim3_dbg("%s", "  try to write, disk write protected\n");
+				swim3_end_request(fs, -EIO, 0);
 				continue;
 			}
 		}
@@ -369,7 +378,6 @@ static void start_request(struct floppy_state *fs)
 		x = ((long)blk_rq_pos(req)) % fs->secpercyl;
 		fs->head = x / fs->secpertrack;
 		fs->req_sector = x % fs->secpertrack + 1;
-		fd_req = req;
 		fs->state = do_transfer;
 		fs->retries = 0;
 
@@ -377,12 +385,14 @@ static void start_request(struct floppy_state *fs)
 	}
 }
 
+static void do_fd_request(struct request_queue * q)
+{
+	start_request(q->queuedata);
+}
+
 static void set_timeout(struct floppy_state *fs, int nticks,
 			void (*proc)(unsigned long))
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&fs->lock, flags);
 	if (fs->timeout_pending)
 		del_timer(&fs->timeout);
 	fs->timeout.expires = jiffies + nticks;
@@ -390,7 +400,6 @@ static void set_timeout(struct floppy_state *fs, int nticks,
 	fs->timeout.data = (unsigned long) fs;
 	add_timer(&fs->timeout);
 	fs->timeout_pending = 1;
-	spin_unlock_irqrestore(&fs->lock, flags);
 }
 
 static inline void scan_track(struct floppy_state *fs)
@@ -442,40 +451,45 @@ static inline void setup_transfer(struct floppy_state *fs)
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_cmd *cp = fs->dma_cmd;
 	struct dbdma_regs __iomem *dr = fs->dma;
+	struct request *req = fs->cur_req;
 
-	if (blk_rq_cur_sectors(fd_req) <= 0) {
-		printk(KERN_ERR "swim3: transfer 0 sectors?\n");
+	if (blk_rq_cur_sectors(req) <= 0) {
+		swim3_warn("%s", "Transfer 0 sectors ?\n");
 		return;
 	}
-	if (rq_data_dir(fd_req) == WRITE)
+	if (rq_data_dir(req) == WRITE)
 		n = 1;
 	else {
 		n = fs->secpertrack - fs->req_sector + 1;
-		if (n > blk_rq_cur_sectors(fd_req))
-			n = blk_rq_cur_sectors(fd_req);
+		if (n > blk_rq_cur_sectors(req))
+			n = blk_rq_cur_sectors(req);
 	}
+
+	swim3_dbg("  setup xfer at sect %d (of %d) head %d for %d\n",
+		  fs->req_sector, fs->secpertrack, fs->head, n);
+
 	fs->scount = n;
 	swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
 	out_8(&sw->sector, fs->req_sector);
 	out_8(&sw->nsect, n);
 	out_8(&sw->gap3, 0);
 	out_le32(&dr->cmdptr, virt_to_bus(cp));
-	if (rq_data_dir(fd_req) == WRITE) {
+	if (rq_data_dir(req) == WRITE) {
 		/* Set up 3 dma commands: write preamble, data, postamble */
 		init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
 		++cp;
-		init_dma(cp, OUTPUT_MORE, fd_req->buffer, 512);
+		init_dma(cp, OUTPUT_MORE, req->buffer, 512);
 		++cp;
 		init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
 	} else {
-		init_dma(cp, INPUT_LAST, fd_req->buffer, n * 512);
+		init_dma(cp, INPUT_LAST, req->buffer, n * 512);
 	}
 	++cp;
 	out_le16(&cp->command, DBDMA_STOP);
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
 	in_8(&sw->error);
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
-	if (rq_data_dir(fd_req) == WRITE)
+	if (rq_data_dir(req) == WRITE)
 		out_8(&sw->control_bis, WRITE_SECTORS);
 	in_8(&sw->intr);
 	out_le32(&dr->control, (RUN << 16) | RUN);
@@ -488,12 +502,16 @@ static inline void setup_transfer(struct floppy_state *fs)
 static void act(struct floppy_state *fs)
 {
 	for (;;) {
+		swim3_dbg("  act loop, state=%d, req_cyl=%d, cur_cyl=%d\n",
+			  fs->state, fs->req_cyl, fs->cur_cyl);
+
 		switch (fs->state) {
 		case idle:
 			return;		/* XXX shouldn't get here */
 
 		case locating:
 			if (swim3_readbit(fs, TRACK_ZERO)) {
+				swim3_dbg("%s", "    locate track 0\n");
 				fs->cur_cyl = 0;
 				if (fs->req_cyl == 0)
 					fs->state = do_transfer;
@@ -511,7 +529,7 @@ static void act(struct floppy_state *fs)
 				break;
 			}
 			if (fs->req_cyl == fs->cur_cyl) {
-				printk("whoops, seeking 0\n");
+				swim3_warn("%s", "Whoops, seeking 0\n");
 				fs->state = do_transfer;
 				break;
 			}
@@ -527,7 +545,9 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					swim3_end_request_cur(-EIO);
+					swim3_err("Wrong cylinder in transfer, want: %d got %d\n",
+						  fs->req_cyl, fs->cur_cyl);
+					swim3_end_request(fs, -EIO, 0);
 					fs->state = idle;
 					return;
 				}
@@ -542,7 +562,7 @@ static void act(struct floppy_state *fs)
 			return;
 
 		default:
-			printk(KERN_ERR"swim3: unknown state %d\n", fs->state);
+			swim3_err("Unknown state %d\n", fs->state);
 			return;
 		}
 	}
@@ -552,59 +572,75 @@ static void scan_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* scan timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		swim3_end_request_cur(-EIO);
+		swim3_end_request(fs, -EIO, 0);
 		fs->state = idle;
 		start_request(fs);
 	} else {
 		fs->state = jogging;
 		act(fs);
 	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void seek_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* seek timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_8(&sw->control_bic, DO_SEEK);
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
-	printk(KERN_ERR "swim3: seek timeout\n");
-	swim3_end_request_cur(-EIO);
+	swim3_err("%s", "Seek timeout\n");
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void settle_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* settle timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	if (swim3_readbit(fs, SEEK_COMPLETE)) {
 		out_8(&sw->select, RELAX);
 		fs->state = locating;
 		act(fs);
-		return;
+		goto unlock;
 	}
 	out_8(&sw->select, RELAX);
 	if (fs->settle_time < 2*HZ) {
 		++fs->settle_time;
 		set_timeout(fs, 1, settle_timeout);
-		return;
+		goto unlock;
 	}
-	printk(KERN_ERR "swim3: seek settle timeout\n");
-	swim3_end_request_cur(-EIO);
+	swim3_err("%s", "Seek settle timeout\n");
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+ unlock:
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void xfer_timeout(unsigned long data)
@@ -612,8 +648,12 @@ static void xfer_timeout(unsigned long data)
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_regs __iomem *dr = fs->dma;
+	unsigned long flags;
 	int n;
 
+	swim3_dbg("* xfer timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_le32(&dr->control, RUN << 16);
 	/* We must wait a bit for dbdma to stop */
@@ -622,12 +662,13 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
-	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
-	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
-	       (long)blk_rq_pos(fd_req));
-	swim3_end_request_cur(-EIO);
+	swim3_err("Timeout %sing sector %ld\n",
+	       (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"),
+	       (long)blk_rq_pos(fs->cur_req));
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static irqreturn_t swim3_interrupt(int irq, void *dev_id)
@@ -638,12 +679,17 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 	int stat, resid;
 	struct dbdma_regs __iomem *dr;
 	struct dbdma_cmd *cp;
+	unsigned long flags;
+	struct request *req = fs->cur_req;
+
+	swim3_dbg("* interrupt, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	intr = in_8(&sw->intr);
 	err = (intr & ERROR_INTR)? in_8(&sw->error): 0;
 	if ((intr & ERROR_INTR) && fs->state != do_transfer)
-		printk(KERN_ERR "swim3_interrupt, state=%d, dir=%x, intr=%x, err=%x\n",
-		       fs->state, rq_data_dir(fd_req), intr, err);
+		swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n",
+			  fs->state, rq_data_dir(req), intr, err);
 	switch (fs->state) {
 	case locating:
 		if (intr & SEEN_SECTOR) {
@@ -653,10 +699,10 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			del_timer(&fs->timeout);
 			fs->timeout_pending = 0;
 			if (sw->ctrack == 0xff) {
-				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
+				swim3_err("%s", "Seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					swim3_end_request_cur(-EIO);
+					swim3_end_request(fs, -EIO, 0);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -668,8 +714,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			fs->cur_cyl = sw->ctrack;
 			fs->cur_sector = sw->csect;
 			if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl)
-				printk(KERN_ERR "swim3: expected cyl %d, got %d\n",
-				       fs->expect_cyl, fs->cur_cyl);
+				swim3_err("Expected cyl %d, got %d\n",
+					  fs->expect_cyl, fs->cur_cyl);
 			fs->state = do_transfer;
 			act(fs);
 		}
@@ -704,7 +750,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		fs->timeout_pending = 0;
 		dr = fs->dma;
 		cp = fs->dma_cmd;
-		if (rq_data_dir(fd_req) == WRITE)
+		if (rq_data_dir(req) == WRITE)
 			++cp;
 		/*
 		 * Check that the main data transfer has finished.
@@ -729,31 +775,32 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		if (intr & ERROR_INTR) {
 			n = fs->scount - 1 - resid / 512;
 			if (n > 0) {
-				blk_update_request(fd_req, 0, n << 9);
+				blk_update_request(req, 0, n << 9);
 				fs->req_sector += n;
 			}
 			if (fs->retries < 5) {
 				++fs->retries;
 				act(fs);
 			} else {
-				printk("swim3: error %sing block %ld (err=%x)\n",
-				       rq_data_dir(fd_req) == WRITE? "writ": "read",
-				       (long)blk_rq_pos(fd_req), err);
-				swim3_end_request_cur(-EIO);
+				swim3_err("Error %sing block %ld (err=%x)\n",
+				       rq_data_dir(req) == WRITE? "writ": "read",
+				       (long)blk_rq_pos(req), err);
+				swim3_end_request(fs, -EIO, 0);
 				fs->state = idle;
 			}
 		} else {
 			if ((stat & ACTIVE) == 0 || resid != 0) {
 				/* musta been an error */
-				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
-				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
-				       fs->state, rq_data_dir(fd_req), intr, err);
-				swim3_end_request_cur(-EIO);
+				swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid);
+				swim3_err("  state=%d, dir=%x, intr=%x, err=%x\n",
+					  fs->state, rq_data_dir(req), intr, err);
+				swim3_end_request(fs, -EIO, 0);
 				fs->state = idle;
 				start_request(fs);
 				break;
 			}
-			if (swim3_end_request(0, fs->scount << 9)) {
+			fs->retries = 0;
+			if (swim3_end_request(fs, 0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
@@ -770,8 +817,9 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			start_request(fs);
 		break;
 	default:
-		printk(KERN_ERR "swim3: don't know what to do in state %d\n", fs->state);
+		swim3_err("Don't know what to do in state %d\n", fs->state);
 	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -781,26 +829,31 @@ static void fd_dma_interrupt(int irq, void *dev_id)
 }
 */
 
+/* Called under the mutex to grab exclusive access to a drive */
 static int grab_drive(struct floppy_state *fs, enum swim_state state,
 		      int interruptible)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&fs->lock, flags);
-	if (fs->state != idle) {
+	swim3_dbg("%s", "-> grab drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	if (fs->state != idle && fs->state != available) {
 		++fs->wanted;
 		while (fs->state != available) {
+			spin_unlock_irqrestore(&swim3_lock, flags);
 			if (interruptible && signal_pending(current)) {
 				--fs->wanted;
-				spin_unlock_irqrestore(&fs->lock, flags);
 				return -EINTR;
 			}
 			interruptible_sleep_on(&fs->wait);
+			spin_lock_irqsave(&swim3_lock, flags);
 		}
 		--fs->wanted;
 	}
 	fs->state = state;
-	spin_unlock_irqrestore(&fs->lock, flags);
+	spin_unlock_irqrestore(&swim3_lock, flags);
+
 	return 0;
 }
 
@@ -808,10 +861,12 @@ static void release_drive(struct floppy_state *fs)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&fs->lock, flags);
+	swim3_dbg("%s", "-> release drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->state = idle;
 	start_request(fs);
-	spin_unlock_irqrestore(&fs->lock, flags);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static int fd_eject(struct floppy_state *fs)
@@ -966,6 +1021,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	struct floppy_state *fs = disk->private_data;
 	struct swim3 __iomem *sw = fs->swim3;
+
 	mutex_lock(&swim3_mutex);
 	if (fs->ref_count > 0 && --fs->ref_count == 0) {
 		swim3_action(fs, MOTOR_OFF);
@@ -1031,30 +1087,48 @@ static const struct block_device_operations floppy_fops = {
 	.revalidate_disk= floppy_revalidate,
 };
 
+static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
+{
+	struct floppy_state *fs = macio_get_drvdata(mdev);
+	struct swim3 __iomem *sw = fs->swim3;
+
+	if (!fs)
+		return;
+	if (mb_state != MB_FD)
+		return;
+
+	/* Clear state */
+	out_8(&sw->intr_enable, 0);
+	in_8(&sw->intr);
+	in_8(&sw->error);
+}
+
 static int swim3_add_device(struct macio_dev *mdev, int index)
 {
 	struct device_node *swim = mdev->ofdev.dev.of_node;
 	struct floppy_state *fs = &floppy_states[index];
 	int rc = -EBUSY;
 
+	/* Do this first for message macros */
+	memset(fs, 0, sizeof(*fs));
+	fs->mdev = mdev;
+	fs->index = index;
+
 	/* Check & Request resources */
 	if (macio_resource_count(mdev) < 2) {
-		printk(KERN_WARNING "ifd%d: no address for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "No address in device-tree\n");
 		return -ENXIO;
 	}
-	if (macio_irq_count(mdev) < 2) {
-		printk(KERN_WARNING "fd%d: no intrs for device %s\n",
-			index, swim->full_name);
+	if (macio_irq_count(mdev) < 1) {
+		swim3_err("%s", "No interrupt in device-tree\n");
+		return -ENXIO;
 	}
 	if (macio_request_resource(mdev, 0, "swim3 (mmio)")) {
-		printk(KERN_ERR "fd%d: can't request mmio resource for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Can't request mmio resource\n");
 		return -EBUSY;
 	}
 	if (macio_request_resource(mdev, 1, "swim3 (dma)")) {
-		printk(KERN_ERR "fd%d: can't request dma resource for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Can't request dma resource\n");
 		macio_release_resource(mdev, 0);
 		return -EBUSY;
 	}
@@ -1063,22 +1137,18 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	if (mdev->media_bay == NULL)
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
 	
-	memset(fs, 0, sizeof(*fs));
-	spin_lock_init(&fs->lock);
 	fs->state = idle;
 	fs->swim3 = (struct swim3 __iomem *)
 		ioremap(macio_resource_start(mdev, 0), 0x200);
 	if (fs->swim3 == NULL) {
-		printk("fd%d: couldn't map registers for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Couldn't map mmio registers\n");
 		rc = -ENOMEM;
 		goto out_release;
 	}
 	fs->dma = (struct dbdma_regs __iomem *)
 		ioremap(macio_resource_start(mdev, 1), 0x200);
 	if (fs->dma == NULL) {
-		printk("fd%d: couldn't map DMA for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Couldn't map dma registers\n");
 		iounmap(fs->swim3);
 		rc = -ENOMEM;
 		goto out_release;
@@ -1090,31 +1160,25 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	fs->secpercyl = 36;
 	fs->secpertrack = 18;
 	fs->total_secs = 2880;
-	fs->mdev = mdev;
 	init_waitqueue_head(&fs->wait);
 
 	fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
 	memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd));
 	st_le16(&fs->dma_cmd[1].command, DBDMA_STOP);
 
+	if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD)
+		swim3_mb_event(mdev, MB_FD);
+
 	if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) {
-		printk(KERN_ERR "fd%d: couldn't request irq %d for %s\n",
-		       index, fs->swim3_intr, swim->full_name);
+		swim3_err("%s", "Couldn't request interrupt\n");
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0);
 		goto out_unmap;
 		return -EBUSY;
 	}
-/*
-	if (request_irq(fs->dma_intr, fd_dma_interrupt, 0, "SWIM3-dma", fs)) {
-		printk(KERN_ERR "Couldn't get irq %d for SWIM3 DMA",
-		       fs->dma_intr);
-		return -EBUSY;
-	}
-*/
 
 	init_timer(&fs->timeout);
 
-	printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count,
+	swim3_info("SWIM3 floppy controller %s\n",
 		mdev->media_bay ? "in media bay" : "");
 
 	return 0;
@@ -1132,41 +1196,42 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 
 static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
-	int i, rc;
 	struct gendisk *disk;
+	int index, rc;
+
+	index = floppy_count++;
+	if (index >= MAX_FLOPPIES)
+		return -ENXIO;
 
 	/* Add the drive */
-	rc = swim3_add_device(mdev, floppy_count);
+	rc = swim3_add_device(mdev, index);
 	if (rc)
 		return rc;
+	/* Now register that disk. Same comment about failure handling */
+	disk = disks[index] = alloc_disk(1);
+	if (disk == NULL)
+		return -ENOMEM;
+	disk->queue = blk_init_queue(do_fd_request, &swim3_lock);
+	if (disk->queue == NULL) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+	disk->queue->queuedata = &floppy_states[index];
 
-	/* Now create the queue if not there yet */
-	if (swim3_queue == NULL) {
+	if (index == 0) {
 		/* If we failed, there isn't much we can do as the driver is still
 		 * too dumb to remove the device, just bail out
 		 */
 		if (register_blkdev(FLOPPY_MAJOR, "fd"))
 			return 0;
-		swim3_queue = blk_init_queue(do_fd_request, &swim3_lock);
-		if (swim3_queue == NULL) {
-			unregister_blkdev(FLOPPY_MAJOR, "fd");
-			return 0;
-		}
 	}
 
-	/* Now register that disk. Same comment about failure handling */
-	i = floppy_count++;
-	disk = disks[i] = alloc_disk(1);
-	if (disk == NULL)
-		return 0;
-
 	disk->major = FLOPPY_MAJOR;
-	disk->first_minor = i;
+	disk->first_minor = index;
 	disk->fops = &floppy_fops;
-	disk->private_data = &floppy_states[i];
-	disk->queue = swim3_queue;
+	disk->private_data = &floppy_states[index];
 	disk->flags |= GENHD_FL_REMOVABLE;
-	sprintf(disk->disk_name, "fd%d", i);
+	sprintf(disk->disk_name, "fd%d", index);
 	set_capacity(disk, 2880);
 	add_disk(disk);
 
@@ -1194,6 +1259,9 @@ static struct macio_driver swim3_driver =
 		.of_match_table	= swim3_match,
 	},
 	.probe		= swim3_attach,
+#ifdef CONFIG_PMAC_MEDIABAY
+	.mediabay_event	= swim3_mb_event,
+#endif
 #if 0
 	.suspend	= swim3_suspend,
 	.resume		= swim3_resume,
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index b70f0fca9a42..3fb6ab4c8b4e 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	       host->state == HST_DEV_SCAN);
 	spin_unlock_irq(&host->lock);
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 
@@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	BUG_ON(rc < 0);
 	crq->msg_bucket = (u32) rc;
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 }
@@ -1116,7 +1120,7 @@ static inline void carm_handle_resp(struct carm_host *host,
 			break;
 		case MISC_GET_FW_VER: {
 			struct carm_fw_ver *ver = (struct carm_fw_ver *)
-				mem + sizeof(struct carm_msg_get_fw_ver);
+				(mem + sizeof(struct carm_msg_get_fw_ver));
 			if (!error) {
 				host->fw_ver = le32_to_cpu(ver->version);
 				host->flags |= (ver->features & FL_FW_VER_MASK);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 0e376d46bdd1..fcec0225ac76 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -119,43 +119,6 @@
 
 /*
  */
-
-/* command block wrapper */
-struct bulk_cb_wrap {
-	__le32	Signature;		/* contains 'USBC' */
-	u32	Tag;			/* unique per command id */
-	__le32	DataTransferLength;	/* size of data */
-	u8	Flags;			/* direction in bit 0 */
-	u8	Lun;			/* LUN */
-	u8	Length;			/* of of the CDB */
-	u8	CDB[UB_MAX_CDB_SIZE];	/* max command */
-};
-
-#define US_BULK_CB_WRAP_LEN	31
-#define US_BULK_CB_SIGN		0x43425355	/*spells out USBC */
-#define US_BULK_FLAG_IN		1
-#define US_BULK_FLAG_OUT	0
-
-/* command status wrapper */
-struct bulk_cs_wrap {
-	__le32	Signature;		/* should = 'USBS' */
-	u32	Tag;			/* same as original command */
-	__le32	Residue;		/* amount not transferred */
-	u8	Status;			/* see below */
-};
-
-#define US_BULK_CS_WRAP_LEN	13
-#define US_BULK_CS_SIGN		0x53425355	/* spells out 'USBS' */
-#define US_BULK_STAT_OK		0
-#define US_BULK_STAT_FAIL	1
-#define US_BULK_STAT_PHASE	2
-
-/* bulk-only class specific requests */
-#define US_BULK_RESET_REQUEST	0xff
-#define US_BULK_GET_MAX_LUN	0xfe
-
-/*
- */
 struct ub_dev;
 
 #define UB_MAX_REQ_SG	9	/* cdrecord requires 32KB and maybe a header */
@@ -1744,12 +1707,11 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
 static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned int cmd, unsigned long arg)
 {
-	struct gendisk *disk = bdev->bd_disk;
 	void __user *usermem = (void __user *) arg;
 	int ret;
 
 	mutex_lock(&ub_mutex);
-	ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
+	ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem);
 	mutex_unlock(&ub_mutex);
 
 	return ret;
@@ -2478,6 +2440,8 @@ static int __init ub_init(void)
 	int rc;
 	int i;
 
+	pr_info("'Low Performance USB Block' driver is deprecated. "
+			"Please switch to usb-storage\n");
 	for (i = 0; i < UB_QLOCK_NUM; i++)
 		spin_lock_init(&ub_qlockv[i]);
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 031ca720d926..eb0d8216f557 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -513,7 +513,22 @@ static void process_page(unsigned long data)
 	}
 }
 
-static int mm_make_request(struct request_queue *q, struct bio *bio)
+static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct cardinfo *card = cb->data;
+
+	spin_lock_irq(&card->lock);
+	activate(card);
+	spin_unlock_irq(&card->lock);
+	kfree(cb);
+}
+
+static int mm_check_plugged(struct cardinfo *card)
+{
+	return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb));
+}
+
+static void mm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct cardinfo *card = q->queuedata;
 	pr_debug("mm_make_request %llu %u\n",
@@ -523,9 +538,11 @@ static int mm_make_request(struct request_queue *q, struct bio *bio)
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
+	if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card))
+		activate(card);
 	spin_unlock_irq(&card->lock);
 
-	return 0;
+	return;
 }
 
 static irqreturn_t mm_interrupt(int irq, void *__card)
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
deleted file mode 100644
index 9a5b2a2d616d..000000000000
--- a/drivers/block/viodasd.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/* -*- linux-c -*-
- * viodasd.c
- *  Authors: Dave Boutcher <boutcher@us.ibm.com>
- *           Ryan Arnold <ryanarn@us.ibm.com>
- *           Colin Devilbiss <devilbis@us.ibm.com>
- *           Stephen Rothwell
- *
- * (C) Copyright 2000-2004 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * This routine provides access to disk space (termed "DASD" in historical
- * IBM terms) owned and managed by an OS/400 partition running on the
- * same box as this Linux partition.
- *
- * All disk operations are performed by sending messages back and forth to
- * the OS/400 partition.
- */
-
-#define pr_fmt(fmt) "viod: " fmt
-
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/device.h>
-#include <linux/scatterlist.h>
-
-#include <asm/uaccess.h>
-#include <asm/vio.h>
-#include <asm/iseries/hv_types.h>
-#include <asm/iseries/hv_lp_event.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/iseries/vio.h>
-#include <asm/firmware.h>
-
-MODULE_DESCRIPTION("iSeries Virtual DASD");
-MODULE_AUTHOR("Dave Boutcher");
-MODULE_LICENSE("GPL");
-
-/*
- * We only support 7 partitions per physical disk....so with minor
- * numbers 0-255 we get a maximum of 32 disks.
- */
-#define VIOD_GENHD_NAME		"iseries/vd"
-
-#define VIOD_VERS		"1.64"
-
-enum {
-	PARTITION_SHIFT = 3,
-	MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS,
-	MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
-};
-
-static DEFINE_MUTEX(viodasd_mutex);
-static DEFINE_SPINLOCK(viodasd_spinlock);
-
-#define VIOMAXREQ		16
-
-#define DEVICE_NO(cell)	((struct viodasd_device *)(cell) - &viodasd_devices[0])
-
-struct viodasd_waitevent {
-	struct completion	com;
-	int			rc;
-	u16			sub_result;
-	int			max_disk;	/* open */
-};
-
-static const struct vio_error_entry viodasd_err_table[] = {
-	{ 0x0201, EINVAL, "Invalid Range" },
-	{ 0x0202, EINVAL, "Invalid Token" },
-	{ 0x0203, EIO, "DMA Error" },
-	{ 0x0204, EIO, "Use Error" },
-	{ 0x0205, EIO, "Release Error" },
-	{ 0x0206, EINVAL, "Invalid Disk" },
-	{ 0x0207, EBUSY, "Can't Lock" },
-	{ 0x0208, EIO, "Already Locked" },
-	{ 0x0209, EIO, "Already Unlocked" },
-	{ 0x020A, EIO, "Invalid Arg" },
-	{ 0x020B, EIO, "Bad IFS File" },
-	{ 0x020C, EROFS, "Read Only Device" },
-	{ 0x02FF, EIO, "Internal Error" },
-	{ 0x0000, 0, NULL },
-};
-
-/*
- * Figure out the biggest I/O request (in sectors) we can accept
- */
-#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA)
-
-/*
- * Number of disk I/O requests we've sent to OS/400
- */
-static int num_req_outstanding;
-
-/*
- * This is our internal structure for keeping track of disk devices
- */
-struct viodasd_device {
-	u16		cylinders;
-	u16		tracks;
-	u16		sectors;
-	u16		bytes_per_sector;
-	u64		size;
-	int		read_only;
-	spinlock_t	q_lock;
-	struct gendisk	*disk;
-	struct device	*dev;
-} viodasd_devices[MAX_DISKNO];
-
-/*
- * External open entry point.
- */
-static int viodasd_open(struct block_device *bdev, fmode_t mode)
-{
-	struct viodasd_device *d = bdev->bd_disk->private_data;
-	HvLpEvent_Rc hvrc;
-	struct viodasd_waitevent we;
-	u16 flags = 0;
-
-	if (d->read_only) {
-		if (mode & FMODE_WRITE)
-			return -EROFS;
-		flags = vioblockflags_ro;
-	}
-
-	init_completion(&we.com);
-
-	/* Send the open event to OS/400 */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockopen,
-			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			(u64)(unsigned long)&we, VIOVERSION << 16,
-			((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		pr_warning("HV open failed %d\n", (int)hvrc);
-		return -EIO;
-	}
-
-	wait_for_completion(&we.com);
-
-	/* Check the return code */
-	if (we.rc != 0) {
-		const struct vio_error_entry *err =
-			vio_lookup_rc(viodasd_err_table, we.sub_result);
-
-		pr_warning("bad rc opening disk: %d:0x%04x (%s)\n",
-			   (int)we.rc, we.sub_result, err->msg);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
-{
-	int ret;
-
-	mutex_lock(&viodasd_mutex);
-	ret = viodasd_open(bdev, mode);
-	mutex_unlock(&viodasd_mutex);
-
-	return ret;
-}
-
-
-/*
- * External release entry point.
- */
-static int viodasd_release(struct gendisk *disk, fmode_t mode)
-{
-	struct viodasd_device *d = disk->private_data;
-	HvLpEvent_Rc hvrc;
-
-	mutex_lock(&viodasd_mutex);
-	/* Send the event to OS/400.  We DON'T expect a response */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockclose,
-			HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			0, VIOVERSION << 16,
-			((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */,
-			0, 0, 0);
-	if (hvrc != 0)
-		pr_warning("HV close call failed %d\n", (int)hvrc);
-
-	mutex_unlock(&viodasd_mutex);
-
-	return 0;
-}
-
-
-/* External ioctl entry point.
- */
-static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	struct viodasd_device *d = disk->private_data;
-
-	geo->sectors = d->sectors ? d->sectors : 32;
-	geo->heads = d->tracks ? d->tracks  : 64;
-	geo->cylinders = d->cylinders ? d->cylinders :
-		get_capacity(disk) / (geo->sectors * geo->heads);
-
-	return 0;
-}
-
-/*
- * Our file operations table
- */
-static const struct block_device_operations viodasd_fops = {
-	.owner = THIS_MODULE,
-	.open = viodasd_unlocked_open,
-	.release = viodasd_release,
-	.getgeo = viodasd_getgeo,
-};
-
-/*
- * End a request
- */
-static void viodasd_end_request(struct request *req, int error,
-		int num_sectors)
-{
-	__blk_end_request(req, error, num_sectors << 9);
-}
-
-/*
- * Send an actual I/O request to OS/400
- */
-static int send_request(struct request *req)
-{
-	u64 start;
-	int direction;
-	int nsg;
-	u16 viocmd;
-	HvLpEvent_Rc hvrc;
-	struct vioblocklpevent *bevent;
-	struct HvLpEvent *hev;
-	struct scatterlist sg[VIOMAXBLOCKDMA];
-	int sgindex;
-	struct viodasd_device *d;
-	unsigned long flags;
-
-	start = (u64)blk_rq_pos(req) << 9;
-
-	if (rq_data_dir(req) == READ) {
-		direction = DMA_FROM_DEVICE;
-		viocmd = viomajorsubtype_blockio | vioblockread;
-	} else {
-		direction = DMA_TO_DEVICE;
-		viocmd = viomajorsubtype_blockio | vioblockwrite;
-	}
-
-        d = req->rq_disk->private_data;
-
-	/* Now build the scatter-gather list */
-	sg_init_table(sg, VIOMAXBLOCKDMA);
-	nsg = blk_rq_map_sg(req->q, req, sg);
-	nsg = dma_map_sg(d->dev, sg, nsg, direction);
-
-	spin_lock_irqsave(&viodasd_spinlock, flags);
-	num_req_outstanding++;
-
-	/* This optimization handles a single DMA block */
-	if (nsg == 1)
-		hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-				HvLpEvent_Type_VirtualIo, viocmd,
-				HvLpEvent_AckInd_DoAck,
-				HvLpEvent_AckType_ImmediateAck,
-				viopath_sourceinst(viopath_hostLp),
-				viopath_targetinst(viopath_hostLp),
-				(u64)(unsigned long)req, VIOVERSION << 16,
-				((u64)DEVICE_NO(d) << 48), start,
-				((u64)sg_dma_address(&sg[0])) << 32,
-				sg_dma_len(&sg[0]));
-	else {
-		bevent = (struct vioblocklpevent *)
-			vio_get_event_buffer(viomajorsubtype_blockio);
-		if (bevent == NULL) {
-			pr_warning("error allocating disk event buffer\n");
-			goto error_ret;
-		}
-
-		/*
-		 * Now build up the actual request.  Note that we store
-		 * the pointer to the request in the correlation
-		 * token so we can match the response up later
-		 */
-		memset(bevent, 0, sizeof(struct vioblocklpevent));
-		hev = &bevent->event;
-		hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK |
-			HV_LP_EVENT_INT;
-		hev->xType = HvLpEvent_Type_VirtualIo;
-		hev->xSubtype = viocmd;
-		hev->xSourceLp = HvLpConfig_getLpIndex();
-		hev->xTargetLp = viopath_hostLp;
-		hev->xSizeMinus1 =
-			offsetof(struct vioblocklpevent, u.rw_data.dma_info) +
-			(sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1;
-		hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp);
-		hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp);
-		hev->xCorrelationToken = (u64)req;
-		bevent->version = VIOVERSION;
-		bevent->disk = DEVICE_NO(d);
-		bevent->u.rw_data.offset = start;
-
-		/*
-		 * Copy just the dma information from the sg list
-		 * into the request
-		 */
-		for (sgindex = 0; sgindex < nsg; sgindex++) {
-			bevent->u.rw_data.dma_info[sgindex].token =
-				sg_dma_address(&sg[sgindex]);
-			bevent->u.rw_data.dma_info[sgindex].len =
-				sg_dma_len(&sg[sgindex]);
-		}
-
-		/* Send the request */
-		hvrc = HvCallEvent_signalLpEvent(&bevent->event);
-		vio_free_event_buffer(viomajorsubtype_blockio, bevent);
-	}
-
-	if (hvrc != HvLpEvent_Rc_Good) {
-		pr_warning("error sending disk event to OS/400 (rc %d)\n",
-			   (int)hvrc);
-		goto error_ret;
-	}
-	spin_unlock_irqrestore(&viodasd_spinlock, flags);
-	return 0;
-
-error_ret:
-	num_req_outstanding--;
-	spin_unlock_irqrestore(&viodasd_spinlock, flags);
-	dma_unmap_sg(d->dev, sg, nsg, direction);
-	return -1;
-}
-
-/*
- * This is the external request processing routine
- */
-static void do_viodasd_request(struct request_queue *q)
-{
-	struct request *req;
-
-	/*
-	 * If we already have the maximum number of requests
-	 * outstanding to OS/400 just bail out. We'll come
-	 * back later.
-	 */
-	while (num_req_outstanding < VIOMAXREQ) {
-		req = blk_fetch_request(q);
-		if (req == NULL)
-			return;
-		/* check that request contains a valid command */
-		if (req->cmd_type != REQ_TYPE_FS) {
-			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
-			continue;
-		}
-		/* Try sending the request */
-		if (send_request(req) != 0)
-			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
-	}
-}
-
-/*
- * Probe a single disk and fill in the viodasd_device structure
- * for it.
- */
-static int probe_disk(struct viodasd_device *d)
-{
-	HvLpEvent_Rc hvrc;
-	struct viodasd_waitevent we;
-	int dev_no = DEVICE_NO(d);
-	struct gendisk *g;
-	struct request_queue *q;
-	u16 flags = 0;
-
-retry:
-	init_completion(&we.com);
-
-	/* Send the open event to OS/400 */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockopen,
-			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			(u64)(unsigned long)&we, VIOVERSION << 16,
-			((u64)dev_no << 48) | ((u64)flags<< 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		pr_warning("bad rc on HV open %d\n", (int)hvrc);
-		return 0;
-	}
-
-	wait_for_completion(&we.com);
-
-	if (we.rc != 0) {
-		if (flags != 0)
-			return 0;
-		/* try again with read only flag set */
-		flags = vioblockflags_ro;
-		goto retry;
-	}
-	if (we.max_disk > (MAX_DISKNO - 1)) {
-		printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"),
-			    MAX_DISKNO, we.max_disk + 1);
-	}
-
-	/* Send the close event to OS/400.  We DON'T expect a response */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockclose,
-			HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			0, VIOVERSION << 16,
-			((u64)dev_no << 48) | ((u64)flags << 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc);
-		return 0;
-	}
-
-	if (d->dev == NULL) {
-		/* this is when we reprobe for new disks */
-		if (vio_create_viodasd(dev_no) == NULL) {
-			pr_warning("cannot allocate virtual device for disk %d\n",
-				   dev_no);
-			return 0;
-		}
-		/*
-		 * The vio_create_viodasd will have recursed into this
-		 * routine with d->dev set to the new vio device and
-		 * will finish the setup of the disk below.
-		 */
-		return 1;
-	}
-
-	/* create the request queue for the disk */
-	spin_lock_init(&d->q_lock);
-	q = blk_init_queue(do_viodasd_request, &d->q_lock);
-	if (q == NULL) {
-		pr_warning("cannot allocate queue for disk %d\n", dev_no);
-		return 0;
-	}
-	g = alloc_disk(1 << PARTITION_SHIFT);
-	if (g == NULL) {
-		pr_warning("cannot allocate disk structure for disk %d\n",
-			   dev_no);
-		blk_cleanup_queue(q);
-		return 0;
-	}
-
-	d->disk = g;
-	blk_queue_max_segments(q, VIOMAXBLOCKDMA);
-	blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS);
-	g->major = VIODASD_MAJOR;
-	g->first_minor = dev_no << PARTITION_SHIFT;
-	if (dev_no >= 26)
-		snprintf(g->disk_name, sizeof(g->disk_name),
-				VIOD_GENHD_NAME "%c%c",
-				'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26));
-	else
-		snprintf(g->disk_name, sizeof(g->disk_name),
-				VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26));
-	g->fops = &viodasd_fops;
-	g->queue = q;
-	g->private_data = d;
-	g->driverfs_dev = d->dev;
-	set_capacity(g, d->size >> 9);
-
-	pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n",
-		dev_no, (unsigned long)(d->size >> 9),
-		(unsigned long)(d->size >> 20),
-		(int)d->cylinders, (int)d->tracks,
-		(int)d->sectors, (int)d->bytes_per_sector,
-		d->read_only ? " (RO)" : "");
-
-	/* register us in the global list */
-	add_disk(g);
-	return 1;
-}
-
-/* returns the total number of scatterlist elements converted */
-static int block_event_to_scatterlist(const struct vioblocklpevent *bevent,
-		struct scatterlist *sg, int *total_len)
-{
-	int i, numsg;
-	const struct rw_data *rw_data = &bevent->u.rw_data;
-	static const int offset =
-		offsetof(struct vioblocklpevent, u.rw_data.dma_info);
-	static const int element_size = sizeof(rw_data->dma_info[0]);
-
-	numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size;
-	if (numsg > VIOMAXBLOCKDMA)
-		numsg = VIOMAXBLOCKDMA;
-
-	*total_len = 0;
-	sg_init_table(sg, VIOMAXBLOCKDMA);
-	for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) {
-		sg_dma_address(&sg[i]) = rw_data->dma_info[i].token;
-		sg_dma_len(&sg[i]) = rw_data->dma_info[i].len;
-		*total_len += rw_data->dma_info[i].len;
-	}
-	return i;
-}
-
-/*
- * Restart all queues, starting with the one _after_ the disk given,
- * thus reducing the chance of starvation of higher numbered disks.
- */
-static void viodasd_restart_all_queues_starting_from(int first_index)
-{
-	int i;
-
-	for (i = first_index + 1; i < MAX_DISKNO; ++i)
-		if (viodasd_devices[i].disk)
-			blk_run_queue(viodasd_devices[i].disk->queue);
-	for (i = 0; i <= first_index; ++i)
-		if (viodasd_devices[i].disk)
-			blk_run_queue(viodasd_devices[i].disk->queue);
-}
-
-/*
- * For read and write requests, decrement the number of outstanding requests,
- * Free the DMA buffers we allocated.
- */
-static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
-{
-	int num_sg, num_sect, pci_direction, total_len;
-	struct request *req;
-	struct scatterlist sg[VIOMAXBLOCKDMA];
-	struct HvLpEvent *event = &bevent->event;
-	unsigned long irq_flags;
-	struct viodasd_device *d;
-	int error;
-	spinlock_t *qlock;
-
-	num_sg = block_event_to_scatterlist(bevent, sg, &total_len);
-	num_sect = total_len >> 9;
-	if (event->xSubtype == (viomajorsubtype_blockio | vioblockread))
-		pci_direction = DMA_FROM_DEVICE;
-	else
-		pci_direction = DMA_TO_DEVICE;
-	req = (struct request *)bevent->event.xCorrelationToken;
-	d = req->rq_disk->private_data;
-
-	dma_unmap_sg(d->dev, sg, num_sg, pci_direction);
-
-	/*
-	 * Since this is running in interrupt mode, we need to make sure
-	 * we're not stepping on any global I/O operations
-	 */
-	spin_lock_irqsave(&viodasd_spinlock, irq_flags);
-	num_req_outstanding--;
-	spin_unlock_irqrestore(&viodasd_spinlock, irq_flags);
-
-	error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO;
-	if (error) {
-		const struct vio_error_entry *err;
-		err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
-		pr_warning("read/write error %d:0x%04x (%s)\n",
-			   event->xRc, bevent->sub_result, err->msg);
-		num_sect = blk_rq_sectors(req);
-	}
-	qlock = req->q->queue_lock;
-	spin_lock_irqsave(qlock, irq_flags);
-	viodasd_end_request(req, error, num_sect);
-	spin_unlock_irqrestore(qlock, irq_flags);
-
-	/* Finally, try to get more requests off of this device's queue */
-	viodasd_restart_all_queues_starting_from(DEVICE_NO(d));
-
-	return 0;
-}
-
-/* This routine handles incoming block LP events */
-static void handle_block_event(struct HvLpEvent *event)
-{
-	struct vioblocklpevent *bevent = (struct vioblocklpevent *)event;
-	struct viodasd_waitevent *pwe;
-
-	if (event == NULL)
-		/* Notification that a partition went away! */
-		return;
-	/* First, we should NEVER get an int here...only acks */
-	if (hvlpevent_is_int(event)) {
-		pr_warning("Yikes! got an int in viodasd event handler!\n");
-		if (hvlpevent_need_ack(event)) {
-			event->xRc = HvLpEvent_Rc_InvalidSubtype;
-			HvCallEvent_ackLpEvent(event);
-		}
-	}
-
-	switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) {
-	case vioblockopen:
-		/*
-		 * Handle a response to an open request.  We get all the
-		 * disk information in the response, so update it.  The
-		 * correlation token contains a pointer to a waitevent
-		 * structure that has a completion in it.  update the
-		 * return code in the waitevent structure and post the
-		 * completion to wake up the guy who sent the request
-		 */
-		pwe = (struct viodasd_waitevent *)event->xCorrelationToken;
-		pwe->rc = event->xRc;
-		pwe->sub_result = bevent->sub_result;
-		if (event->xRc == HvLpEvent_Rc_Good) {
-			const struct open_data *data = &bevent->u.open_data;
-			struct viodasd_device *device =
-				&viodasd_devices[bevent->disk];
-			device->read_only =
-				bevent->flags & vioblockflags_ro;
-			device->size = data->disk_size;
-			device->cylinders = data->cylinders;
-			device->tracks = data->tracks;
-			device->sectors = data->sectors;
-			device->bytes_per_sector = data->bytes_per_sector;
-			pwe->max_disk = data->max_disk;
-		}
-		complete(&pwe->com);
-		break;
-	case vioblockclose:
-		break;
-	case vioblockread:
-	case vioblockwrite:
-		viodasd_handle_read_write(bevent);
-		break;
-
-	default:
-		pr_warning("invalid subtype!");
-		if (hvlpevent_need_ack(event)) {
-			event->xRc = HvLpEvent_Rc_InvalidSubtype;
-			HvCallEvent_ackLpEvent(event);
-		}
-	}
-}
-
-/*
- * Get the driver to reprobe for more disks.
- */
-static ssize_t probe_disks(struct device_driver *drv, const char *buf,
-		size_t count)
-{
-	struct viodasd_device *d;
-
-	for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) {
-		if (d->disk == NULL)
-			probe_disk(d);
-	}
-	return count;
-}
-static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks);
-
-static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
-{
-	struct viodasd_device *d = &viodasd_devices[vdev->unit_address];
-
-	d->dev = &vdev->dev;
-	if (!probe_disk(d))
-		return -ENODEV;
-	return 0;
-}
-
-static int viodasd_remove(struct vio_dev *vdev)
-{
-	struct viodasd_device *d;
-
-	d = &viodasd_devices[vdev->unit_address];
-	if (d->disk) {
-		del_gendisk(d->disk);
-		blk_cleanup_queue(d->disk->queue);
-		put_disk(d->disk);
-		d->disk = NULL;
-	}
-	d->dev = NULL;
-	return 0;
-}
-
-/**
- * viodasd_device_table: Used by vio.c to match devices that we
- * support.
- */
-static struct vio_device_id viodasd_device_table[] __devinitdata = {
-	{ "block", "IBM,iSeries-viodasd" },
-	{ "", "" }
-};
-MODULE_DEVICE_TABLE(vio, viodasd_device_table);
-
-static struct vio_driver viodasd_driver = {
-	.id_table = viodasd_device_table,
-	.probe = viodasd_probe,
-	.remove = viodasd_remove,
-	.driver = {
-		.name = "viodasd",
-		.owner = THIS_MODULE,
-	}
-};
-
-static int need_delete_probe;
-
-/*
- * Initialize the whole device driver.  Handle module and non-module
- * versions
- */
-static int __init viodasd_init(void)
-{
-	int rc;
-
-	if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
-		rc = -ENODEV;
-		goto early_fail;
-	}
-
-	/* Try to open to our host lp */
-	if (viopath_hostLp == HvLpIndexInvalid)
-		vio_set_hostlp();
-
-	if (viopath_hostLp == HvLpIndexInvalid) {
-		pr_warning("invalid hosting partition\n");
-		rc = -EIO;
-		goto early_fail;
-	}
-
-	pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp);
-
-        /* register the block device */
-	rc =  register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-	if (rc) {
-		pr_warning("Unable to get major number %d for %s\n",
-			   VIODASD_MAJOR, VIOD_GENHD_NAME);
-		goto early_fail;
-	}
-	/* Actually open the path to the hosting partition */
-	rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio,
-				VIOMAXREQ + 2);
-	if (rc) {
-		pr_warning("error opening path to host partition %d\n",
-			   viopath_hostLp);
-		goto unregister_blk;
-	}
-
-	/* Initialize our request handler */
-	vio_setHandler(viomajorsubtype_blockio, handle_block_event);
-
-	rc = vio_register_driver(&viodasd_driver);
-	if (rc) {
-		pr_warning("vio_register_driver failed\n");
-		goto unset_handler;
-	}
-
-	/*
-	 * If this call fails, it just means that we cannot dynamically
-	 * add virtual disks, but the driver will still work fine for
-	 * all existing disk, so ignore the failure.
-	 */
-	if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe))
-		need_delete_probe = 1;
-
-	return 0;
-
-unset_handler:
-	vio_clearHandler(viomajorsubtype_blockio);
-	viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-unregister_blk:
-	unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-early_fail:
-	return rc;
-}
-module_init(viodasd_init);
-
-void __exit viodasd_exit(void)
-{
-	if (need_delete_probe)
-		driver_remove_file(&viodasd_driver.driver, &driver_attr_probe);
-	vio_unregister_driver(&viodasd_driver);
-	vio_clearHandler(viomajorsubtype_blockio);
-	viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-	unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-}
-module_exit(viodasd_exit);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 079c08808d8a..c0bbeb470754 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -3,45 +3,53 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
 #include <linux/scatterlist.h>
 #include <linux/string_helpers.h>
 #include <scsi/scsi_cmnd.h>
+#include <linux/idr.h>
 
 #define PART_BITS 4
 
-static int major, index;
+static int major;
+static DEFINE_IDA(vd_index_ida);
+
 struct workqueue_struct *virtblk_wq;
 
 struct virtio_blk
 {
-	spinlock_t lock;
-
 	struct virtio_device *vdev;
 	struct virtqueue *vq;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
 
-	/* Request tracking. */
-	struct list_head reqs;
-
 	mempool_t *pool;
 
 	/* Process context for config space updates */
 	struct work_struct config_work;
 
+	/* Lock for config space updates */
+	struct mutex config_lock;
+
+	/* enable config space updates */
+	bool config_enable;
+
 	/* What host tells us, plus 2 for header & tailer. */
 	unsigned int sg_elems;
 
+	/* Ida index - used to track minor number allocations. */
+	int index;
+
 	/* Scatterlist: can be too big for stack. */
 	struct scatterlist sg[/*sg_elems*/];
 };
 
 struct virtblk_req
 {
-	struct list_head list;
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
@@ -55,7 +63,7 @@ static void blk_done(struct virtqueue *vq)
 	unsigned int len;
 	unsigned long flags;
 
-	spin_lock_irqsave(&vblk->lock, flags);
+	spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
 	while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
 		int error;
 
@@ -85,12 +93,11 @@ static void blk_done(struct virtqueue *vq)
 		}
 
 		__blk_end_request_all(vbr->req, error);
-		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
 	}
 	/* In case queue is stopped waiting for more buffers. */
 	blk_start_queue(vblk->disk->queue);
-	spin_unlock_irqrestore(&vblk->lock, flags);
+	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
 }
 
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -165,12 +172,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
+	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
 
-	list_add_tail(&vbr->list, &vblk->reqs);
 	return true;
 }
 
@@ -236,8 +242,8 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
 		return -ENOTTY;
 
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
-			      (void __user *)data);
+	return scsi_cmd_blk_ioctl(bdev, mode, cmd,
+				  (void __user *)data);
 }
 
 /* We provide getgeo only to please some old bootloader/partitioning tools */
@@ -276,6 +282,11 @@ static int index_to_minor(int index)
 	return index << PART_BITS;
 }
 
+static int minor_to_index(int minor)
+{
+	return minor >> PART_BITS;
+}
+
 static ssize_t virtblk_serial_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -306,6 +317,10 @@ static void virtblk_config_changed_work(struct work_struct *work)
 	char cap_str_2[10], cap_str_10[10];
 	u64 capacity, size;
 
+	mutex_lock(&vblk->config_lock);
+	if (!vblk->config_enable)
+		goto done;
+
 	/* Host must always specify the capacity. */
 	vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
 			  &capacity, sizeof(capacity));
@@ -328,6 +343,9 @@ static void virtblk_config_changed_work(struct work_struct *work)
 		  cap_str_10, cap_str_2);
 
 	set_capacity(vblk->disk, capacity);
+	revalidate_disk(vblk->disk);
+done:
+	mutex_unlock(&vblk->config_lock);
 }
 
 static void virtblk_config_changed(struct virtio_device *vdev)
@@ -337,18 +355,138 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 	queue_work(virtblk_wq, &vblk->config_work);
 }
 
+static int init_vq(struct virtio_blk *vblk)
+{
+	int err = 0;
+
+	/* We expect one virtqueue, for output. */
+	vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
+	if (IS_ERR(vblk->vq))
+		err = PTR_ERR(vblk->vq);
+
+	return err;
+}
+
+/*
+ * Legacy naming scheme used for virtio devices.  We are stuck with it for
+ * virtio blk but don't ever use it for any new driver.
+ */
+static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
+static int virtblk_get_cache_mode(struct virtio_device *vdev)
+{
+	u8 writeback;
+	int err;
+
+	err = virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE,
+				offsetof(struct virtio_blk_config, wce),
+				&writeback);
+	if (err)
+		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
+
+	return writeback;
+}
+
+static void virtblk_update_cache_mode(struct virtio_device *vdev)
+{
+	u8 writeback = virtblk_get_cache_mode(vdev);
+	struct virtio_blk *vblk = vdev->priv;
+
+	if (writeback)
+		blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
+	else
+		blk_queue_flush(vblk->disk->queue, 0);
+
+	revalidate_disk(vblk->disk);
+}
+
+static const char *const virtblk_cache_types[] = {
+	"write through", "write back"
+};
+
+static ssize_t
+virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	struct virtio_device *vdev = vblk->vdev;
+	int i;
+	u8 writeback;
+
+	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
+	for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; )
+		if (sysfs_streq(buf, virtblk_cache_types[i]))
+			break;
+
+	if (i < 0)
+		return -EINVAL;
+
+	writeback = i;
+	vdev->config->set(vdev,
+			  offsetof(struct virtio_blk_config, wce),
+			  &writeback, sizeof(writeback));
+
+	virtblk_update_cache_mode(vdev);
+	return count;
+}
+
+static ssize_t
+virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	u8 writeback = virtblk_get_cache_mode(vblk->vdev);
+
+	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
+	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
+}
+
+static const struct device_attribute dev_attr_cache_type_ro =
+	__ATTR(cache_type, S_IRUGO,
+	       virtblk_cache_type_show, NULL);
+static const struct device_attribute dev_attr_cache_type_rw =
+	__ATTR(cache_type, S_IRUGO|S_IWUSR,
+	       virtblk_cache_type_show, virtblk_cache_type_store);
+
 static int __devinit virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
 	struct request_queue *q;
-	int err;
+	int err, index;
 	u64 cap;
 	u32 v, blk_size, sg_elems, opt_io_size;
 	u16 min_io_size;
 	u8 physical_block_exp, alignment_offset;
 
-	if (index_to_minor(index) >= 1 << MINORBITS)
-		return -ENOSPC;
+	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
+			     GFP_KERNEL);
+	if (err < 0)
+		goto out;
+	index = err;
 
 	/* We need to know how many segments before we allocate. */
 	err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
@@ -365,22 +503,19 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 				    sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
 	if (!vblk) {
 		err = -ENOMEM;
-		goto out;
+		goto out_free_index;
 	}
 
-	INIT_LIST_HEAD(&vblk->reqs);
-	spin_lock_init(&vblk->lock);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
 	sg_init_table(vblk->sg, vblk->sg_elems);
+	mutex_init(&vblk->config_lock);
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+	vblk->config_enable = true;
 
-	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
-	if (IS_ERR(vblk->vq)) {
-		err = PTR_ERR(vblk->vq);
+	err = init_vq(vblk);
+	if (err)
 		goto out_free_vblk;
-	}
 
 	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
 	if (!vblk->pool) {
@@ -395,7 +530,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		goto out_mempool;
 	}
 
-	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
+	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
 	if (!q) {
 		err = -ENOMEM;
 		goto out_put_disk;
@@ -403,29 +538,17 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 
 	q->queuedata = vblk;
 
-	if (index < 26) {
-		sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
-	} else if (index < (26 + 1) * 26) {
-		sprintf(vblk->disk->disk_name, "vd%c%c",
-			'a' + index / 26 - 1, 'a' + index % 26);
-	} else {
-		const unsigned int m1 = (index / 26 - 1) / 26 - 1;
-		const unsigned int m2 = (index / 26 - 1) % 26;
-		const unsigned int m3 =  index % 26;
-		sprintf(vblk->disk->disk_name, "vd%c%c%c",
-			'a' + m1, 'a' + m2, 'a' + m3);
-	}
+	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
 
 	vblk->disk->major = major;
 	vblk->disk->first_minor = index_to_minor(index);
 	vblk->disk->private_data = vblk;
 	vblk->disk->fops = &virtblk_fops;
 	vblk->disk->driverfs_dev = &vdev->dev;
-	index++;
+	vblk->index = index;
 
 	/* configure queue flush support */
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
-		blk_queue_flush(q, REQ_FLUSH);
+	virtblk_update_cache_mode(vdev);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -503,6 +626,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	if (err)
 		goto out_del_disk;
 
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
+		err = device_create_file(disk_to_dev(vblk->disk),
+					 &dev_attr_cache_type_rw);
+	else
+		err = device_create_file(disk_to_dev(vblk->disk),
+					 &dev_attr_cache_type_ro);
+	if (err)
+		goto out_del_disk;
 	return 0;
 
 out_del_disk:
@@ -516,6 +647,8 @@ out_free_vq:
 	vdev->config->del_vqs(vdev);
 out_free_vblk:
 	kfree(vblk);
+out_free_index:
+	ida_simple_remove(&vd_index_ida, index);
 out:
 	return err;
 }
@@ -523,22 +656,67 @@ out:
 static void __devexit virtblk_remove(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk = vdev->priv;
+	int index = vblk->index;
 
-	flush_work(&vblk->config_work);
+	/* Prevent config work handler from accessing the device. */
+	mutex_lock(&vblk->config_lock);
+	vblk->config_enable = false;
+	mutex_unlock(&vblk->config_lock);
 
-	/* Nothing should be pending. */
-	BUG_ON(!list_empty(&vblk->reqs));
+	del_gendisk(vblk->disk);
+	blk_cleanup_queue(vblk->disk->queue);
 
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
-	del_gendisk(vblk->disk);
-	blk_cleanup_queue(vblk->disk->queue);
+	flush_work(&vblk->config_work);
+
 	put_disk(vblk->disk);
 	mempool_destroy(vblk->pool);
 	vdev->config->del_vqs(vdev);
 	kfree(vblk);
+	ida_simple_remove(&vd_index_ida, index);
+}
+
+#ifdef CONFIG_PM
+static int virtblk_freeze(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	/* Ensure we don't receive any more interrupts */
+	vdev->config->reset(vdev);
+
+	/* Prevent config work handler from accessing the device. */
+	mutex_lock(&vblk->config_lock);
+	vblk->config_enable = false;
+	mutex_unlock(&vblk->config_lock);
+
+	flush_work(&vblk->config_work);
+
+	spin_lock_irq(vblk->disk->queue->queue_lock);
+	blk_stop_queue(vblk->disk->queue);
+	spin_unlock_irq(vblk->disk->queue->queue_lock);
+	blk_sync_queue(vblk->disk->queue);
+
+	vdev->config->del_vqs(vdev);
+	return 0;
+}
+
+static int virtblk_restore(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+	int ret;
+
+	vblk->config_enable = true;
+	ret = init_vq(vdev->priv);
+	if (!ret) {
+		spin_lock_irq(vblk->disk->queue->queue_lock);
+		blk_start_queue(vblk->disk->queue);
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+	}
+	return ret;
 }
+#endif
 
 static const struct virtio_device_id id_table[] = {
 	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
@@ -548,7 +726,7 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
 };
 
 /*
@@ -565,6 +743,10 @@ static struct virtio_driver __refdata virtio_blk = {
 	.probe			= virtblk_probe,
 	.remove			= __devexit_p(virtblk_remove),
 	.config_changed		= virtblk_config_changed,
+#ifdef CONFIG_PM
+	.freeze			= virtblk_freeze,
+	.restore		= virtblk_restore,
+#endif
 };
 
 static int __init init(void)
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 4abd2bcd20fb..ff540520bada 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -52,7 +52,6 @@
 #include <linux/io.h>
 #include <linux/gfp.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/dma.h>
 
@@ -148,7 +147,7 @@ static volatile int xdc_busy;
 static struct timer_list xd_watchdog_int;
 
 static volatile u_char xd_error;
-static int nodma = XD_DONT_USE_DMA;
+static bool nodma = XD_DONT_USE_DMA;
 
 static struct request_queue *xd_queue;
 
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 2330a9ad5e95..c6decb901e5e 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -258,13 +258,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 
 static void print_stats(struct xen_blkif *blkif)
 {
-	pr_info("xen-blkback (%s): oo %3d  |  rd %4d  |  wr %4d  |  f %4d\n",
+	pr_info("xen-blkback (%s): oo %3d  |  rd %4d  |  wr %4d  |  f %4d"
+		 "  |  ds %4d\n",
 		 current->comm, blkif->st_oo_req,
-		 blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req);
+		 blkif->st_rd_req, blkif->st_wr_req,
+		 blkif->st_f_req, blkif->st_ds_req);
 	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
 	blkif->st_rd_req = 0;
 	blkif->st_wr_req = 0;
 	blkif->st_oo_req = 0;
+	blkif->st_ds_req = 0;
 }
 
 int xen_blkif_schedule(void *arg)
@@ -318,6 +321,7 @@ struct seg_buf {
 static void xen_blkbk_unmap(struct pending_req *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	grant_handle_t handle;
 	int ret;
@@ -329,25 +333,12 @@ static void xen_blkbk_unmap(struct pending_req *req)
 		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
 				    GNTMAP_host_map, handle);
 		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		pages[invcount] = virt_to_page(vaddr(req, i));
 		invcount++;
 	}
 
-	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, unmap, invcount);
+	ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
 	BUG_ON(ret);
-	/*
-	 * Note, we use invcount, so nr->pages, so we can't index
-	 * using vaddr(req, i).
-	 */
-	for (i = 0; i < invcount; i++) {
-		ret = m2p_remove_override(
-			virt_to_page(unmap[i].host_addr), false);
-		if (ret) {
-			pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
-				 (unsigned long)unmap[i].host_addr);
-			continue;
-		}
-	}
 }
 
 static int xen_blkbk_map(struct blkif_request *req,
@@ -356,7 +347,7 @@ static int xen_blkbk_map(struct blkif_request *req,
 {
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	int i;
-	int nseg = req->nr_segments;
+	int nseg = req->u.rw.nr_segments;
 	int ret = 0;
 
 	/*
@@ -375,7 +366,7 @@ static int xen_blkbk_map(struct blkif_request *req,
 				  pending_req->blkif->domid);
 	}
 
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+	ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg);
 	BUG_ON(ret);
 
 	/*
@@ -395,21 +386,59 @@ static int xen_blkbk_map(struct blkif_request *req,
 		if (ret)
 			continue;
 
-		ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
-			blkbk->pending_page(pending_req, i), false);
-		if (ret) {
-			pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
-				 (unsigned long)map[i].dev_bus_addr, ret);
-			/* We could switch over to GNTTABOP_copy */
-			continue;
-		}
-
 		seg[i].buf  = map[i].dev_bus_addr |
 			(req->u.rw.seg[i].first_sect << 9);
 	}
 	return ret;
 }
 
+static int dispatch_discard_io(struct xen_blkif *blkif,
+				struct blkif_request *req)
+{
+	int err = 0;
+	int status = BLKIF_RSP_OKAY;
+	struct block_device *bdev = blkif->vbd.bdev;
+	unsigned long secure;
+
+	blkif->st_ds_req++;
+
+	xen_blkif_get(blkif);
+	secure = (blkif->vbd.discard_secure &&
+		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+		 BLKDEV_DISCARD_SECURE : 0;
+
+	err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+				   req->u.discard.nr_sectors,
+				   GFP_KERNEL, secure);
+
+	if (err == -EOPNOTSUPP) {
+		pr_debug(DRV_PFX "discard op failed, not supported\n");
+		status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (err)
+		status = BLKIF_RSP_ERROR;
+
+	make_response(blkif, req->u.discard.id, req->operation, status);
+	xen_blkif_put(blkif);
+	return err;
+}
+
+static void xen_blk_drain_io(struct xen_blkif *blkif)
+{
+	atomic_set(&blkif->drain, 1);
+	do {
+		/* The initial value is one, and one refcnt taken at the
+		 * start of the xen_blkif_schedule thread. */
+		if (atomic_read(&blkif->refcnt) <= 2)
+			break;
+		wait_for_completion_interruptible_timeout(
+				&blkif->drain_complete, HZ);
+
+		if (!atomic_read(&blkif->drain))
+			break;
+	} while (!kthread_should_stop());
+	atomic_set(&blkif->drain, 0);
+}
+
 /*
  * Completion callback on the bio's. Called as bh->b_end_io()
  */
@@ -422,6 +451,11 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
 		pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
 		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
 		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+		    (error == -EOPNOTSUPP)) {
+		pr_debug(DRV_PFX "write barrier op failed, not supported\n");
+		xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
 	} else if (error) {
 		pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
 			 " error=%d\n", error);
@@ -438,6 +472,10 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
 		make_response(pending_req->blkif, pending_req->id,
 			      pending_req->operation, pending_req->status);
 		xen_blkif_put(pending_req->blkif);
+		if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
+			if (atomic_read(&pending_req->blkif->drain))
+				complete(&pending_req->blkif->drain_complete);
+		}
 		free_req(pending_req);
 	}
 }
@@ -505,8 +543,11 @@ __do_block_io_op(struct xen_blkif *blkif)
 
 		/* Apply all sanity checks to /private copy/ of request. */
 		barrier();
-
-		if (dispatch_rw_block_io(blkif, &req, pending_req))
+		if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
+			free_req(pending_req);
+			if (dispatch_discard_io(blkif, &req))
+				break;
+		} else if (dispatch_rw_block_io(blkif, &req, pending_req))
 			break;
 
 		/* Yield point for this unbounded loop. */
@@ -532,7 +573,6 @@ do_block_io_op(struct xen_blkif *blkif)
 
 	return more_to_do;
 }
-
 /*
  * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
  * and call the 'submit_bio' to pass it to the underlying storage.
@@ -549,6 +589,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	int i, nbio = 0;
 	int operation;
 	struct blk_plug plug;
+	bool drain = false;
 
 	switch (req->operation) {
 	case BLKIF_OP_READ:
@@ -559,11 +600,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		blkif->st_wr_req++;
 		operation = WRITE_ODIRECT;
 		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		drain = true;
 	case BLKIF_OP_FLUSH_DISKCACHE:
 		blkif->st_f_req++;
 		operation = WRITE_FLUSH;
 		break;
-	case BLKIF_OP_WRITE_BARRIER:
 	default:
 		operation = 0; /* make gcc happy */
 		goto fail_response;
@@ -571,7 +613,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	}
 
 	/* Check that the number of segments is sane. */
-	nseg = req->nr_segments;
+	nseg = req->u.rw.nr_segments;
+
 	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
 	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
@@ -580,12 +623,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		goto fail_response;
 	}
 
-	preq.dev           = req->handle;
+	preq.dev           = req->u.rw.handle;
 	preq.sector_number = req->u.rw.sector_number;
 	preq.nr_sects      = 0;
 
 	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
+	pending_req->id        = req->u.rw.id;
 	pending_req->operation = req->operation;
 	pending_req->status    = BLKIF_RSP_OKAY;
 	pending_req->nr_pages  = nseg;
@@ -621,6 +664,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		}
 	}
 
+	/* Wait on all outstanding I/O's and once that has been completed
+	 * issue the WRITE_FLUSH.
+	 */
+	if (drain)
+		xen_blk_drain_io(pending_req->blkif);
+
 	/*
 	 * If we have failed at this point, we need to undo the M2P override,
 	 * set gnttab_set_unmap_op on all of the grant references and perform
@@ -630,7 +679,10 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	if (xen_blkbk_map(req, pending_req, seg))
 		goto fail_flush;
 
-	/* This corresponding xen_blkif_put is done in __end_block_io_op */
+	/*
+	 * This corresponding xen_blkif_put is done in __end_block_io_op, or
+	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
+	 */
 	xen_blkif_get(blkif);
 
 	for (i = 0; i < nseg; i++) {
@@ -654,7 +706,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 		preq.sector_number += seg[i].nsec;
 	}
 
-	/* This will be hit if the operation was a flush. */
+	/* This will be hit if the operation was a flush or discard. */
 	if (!bio) {
 		BUG_ON(operation != WRITE_FLUSH);
 
@@ -685,7 +737,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
 	if (operation == READ)
 		blkif->st_rd_sect += preq.nr_sects;
-	else if (operation == WRITE || operation == WRITE_FLUSH)
+	else if (operation & WRITE)
 		blkif->st_wr_sect += preq.nr_sects;
 
 	return 0;
@@ -694,7 +746,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	xen_blkbk_unmap(pending_req);
  fail_response:
 	/* Haven't submitted any bio's yet. */
-	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
 	free_req(pending_req);
 	msleep(1); /* back off a bit */
 	return -EIO;
@@ -754,7 +806,7 @@ static int __init xen_blkif_init(void)
 	int i, mmap_pages;
 	int rc = 0;
 
-	if (!xen_pv_domain())
+	if (!xen_domain())
 		return -ENODEV;
 
 	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
@@ -765,9 +817,9 @@ static int __init xen_blkif_init(void)
 
 	mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
-	blkbk->pending_reqs          = kmalloc(sizeof(blkbk->pending_reqs[0]) *
+	blkbk->pending_reqs          = kzalloc(sizeof(blkbk->pending_reqs[0]) *
 					xen_blkif_reqs, GFP_KERNEL);
-	blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) *
+	blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
 					mmap_pages, GFP_KERNEL);
 	blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
 					mmap_pages, GFP_KERNEL);
@@ -790,8 +842,6 @@ static int __init xen_blkif_init(void)
 	if (rc)
 		goto failed_init;
 
-	memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
-
 	INIT_LIST_HEAD(&blkbk->pending_free);
 	spin_lock_init(&blkbk->pending_free_lock);
 	init_waitqueue_head(&blkbk->pending_free_wq);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 00c57c90e2d6..9ad3b5ec1dc1 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -27,7 +27,6 @@
 #ifndef __XEN_BLKIF__BACKEND__COMMON_H__
 #define __XEN_BLKIF__BACKEND__COMMON_H__
 
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
@@ -61,32 +60,66 @@ struct blkif_common_response {
 	char dummy;
 };
 
-/* i386 protocol version */
-#pragma pack(push, 4)
-struct blkif_x86_32_request {
-	uint8_t        operation;    /* BLKIF_OP_???                         */
+struct blkif_x86_32_request_rw {
 	uint8_t        nr_segments;  /* number of segments                   */
 	blkif_vdev_t   handle;       /* only for read/write requests         */
 	uint64_t       id;           /* private guest value, echoed in resp  */
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
 	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	uint64_t       nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	union {
+		struct blkif_x86_32_request_rw rw;
+		struct blkif_x86_32_request_discard discard;
+	} u;
+} __attribute__((__packed__));
+
+/* i386 protocol version */
+#pragma pack(push, 4)
 struct blkif_x86_32_response {
 	uint64_t        id;              /* copied from request */
 	uint8_t         operation;       /* copied from request */
 	int16_t         status;          /* BLKIF_RSP_???       */
 };
 #pragma pack(pop)
-
 /* x86_64 protocol version */
-struct blkif_x86_64_request {
-	uint8_t        operation;    /* BLKIF_OP_???                         */
+
+struct blkif_x86_64_request_rw {
 	uint8_t        nr_segments;  /* number of segments                   */
 	blkif_vdev_t   handle;       /* only for read/write requests         */
-	uint64_t       __attribute__((__aligned__(8))) id;
+	uint32_t       _pad1;        /* offsetof(blkif_reqest..,u.rw.id)==8  */
+	uint64_t       id;
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
 	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+        uint32_t       _pad2;        /* offsetof(blkif_..,u.discard.id)==8   */
+	uint64_t       id;
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	uint64_t       nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	union {
+		struct blkif_x86_64_request_rw rw;
+		struct blkif_x86_64_request_discard discard;
+	} u;
+} __attribute__((__packed__));
+
 struct blkif_x86_64_response {
 	uint64_t       __attribute__((__aligned__(8))) id;
 	uint8_t         operation;       /* copied from request */
@@ -126,6 +159,7 @@ struct xen_vbd {
 	/* Cached size parameter. */
 	sector_t		size;
 	bool			flush_support;
+	bool			discard_secure;
 };
 
 struct backend_info;
@@ -139,7 +173,7 @@ struct xen_blkif {
 	/* Comms information. */
 	enum blkif_protocol	blk_protocol;
 	union blkif_back_rings	blk_rings;
-	struct vm_struct	*blk_ring_area;
+	void			*blk_ring;
 	/* The VBD attached to this interface. */
 	struct xen_vbd		vbd;
 	/* Back pointer to the backend_info. */
@@ -149,6 +183,9 @@ struct xen_blkif {
 	atomic_t		refcnt;
 
 	wait_queue_head_t	wq;
+	/* for barrier (drain) requests */
+	struct completion	drain_complete;
+	atomic_t		drain;
 	/* One thread per one blkif. */
 	struct task_struct	*xenblkd;
 	unsigned int		waiting_reqs;
@@ -159,13 +196,11 @@ struct xen_blkif {
 	int			st_wr_req;
 	int			st_oo_req;
 	int			st_f_req;
+	int			st_ds_req;
 	int			st_rd_sect;
 	int			st_wr_sect;
 
 	wait_queue_head_t	waiting_to_free;
-
-	grant_handle_t		shmem_handle;
-	grant_ref_t		shmem_ref;
 };
 
 
@@ -182,7 +217,7 @@ struct xen_blkif {
 
 struct phys_req {
 	unsigned short		dev;
-	unsigned short		nr_sects;
+	blkif_sector_t		nr_sects;
 	struct block_device	*bdev;
 	blkif_sector_t		sector_number;
 };
@@ -196,6 +231,8 @@ int xen_blkif_schedule(void *arg);
 int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
 			      struct backend_info *be, int state);
 
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+		      struct backend_info *be, int state);
 struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
 
 static inline void blkif_get_x86_32_req(struct blkif_request *dst,
@@ -203,15 +240,30 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
 {
 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	dst->operation = src->operation;
-	dst->nr_segments = src->nr_segments;
-	dst->handle = src->handle;
-	dst->id = src->id;
-	dst->u.rw.sector_number = src->sector_number;
-	barrier();
-	if (n > dst->nr_segments)
-		n = dst->nr_segments;
-	for (i = 0; i < n; i++)
-		dst->u.rw.seg[i] = src->seg[i];
+	switch (src->operation) {
+	case BLKIF_OP_READ:
+	case BLKIF_OP_WRITE:
+	case BLKIF_OP_WRITE_BARRIER:
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
+		dst->u.rw.sector_number = src->u.rw.sector_number;
+		barrier();
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
+		for (i = 0; i < n; i++)
+			dst->u.rw.seg[i] = src->u.rw.seg[i];
+		break;
+	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
+		dst->u.discard.id = src->u.discard.id;
+		dst->u.discard.sector_number = src->u.discard.sector_number;
+		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+		break;
+	default:
+		break;
+	}
 }
 
 static inline void blkif_get_x86_64_req(struct blkif_request *dst,
@@ -219,15 +271,30 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
 {
 	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	dst->operation = src->operation;
-	dst->nr_segments = src->nr_segments;
-	dst->handle = src->handle;
-	dst->id = src->id;
-	dst->u.rw.sector_number = src->sector_number;
-	barrier();
-	if (n > dst->nr_segments)
-		n = dst->nr_segments;
-	for (i = 0; i < n; i++)
-		dst->u.rw.seg[i] = src->seg[i];
+	switch (src->operation) {
+	case BLKIF_OP_READ:
+	case BLKIF_OP_WRITE:
+	case BLKIF_OP_WRITE_BARRIER:
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
+		dst->u.rw.sector_number = src->u.rw.sector_number;
+		barrier();
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
+		for (i = 0; i < n; i++)
+			dst->u.rw.seg[i] = src->u.rw.seg[i];
+		break;
+	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
+		dst->u.discard.id = src->u.discard.id;
+		dst->u.discard.sector_number = src->u.discard.sector_number;
+		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+		break;
+	default:
+		break;
+	}
 }
 
 #endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 5fd2010f7d2b..4f66171c6683 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -114,44 +114,14 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
 	init_waitqueue_head(&blkif->wq);
+	init_completion(&blkif->drain_complete);
+	atomic_set(&blkif->drain, 0);
 	blkif->st_print = jiffies;
 	init_waitqueue_head(&blkif->waiting_to_free);
 
 	return blkif;
 }
 
-static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page)
-{
-	struct gnttab_map_grant_ref op;
-
-	gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
-			  GNTMAP_host_map, shared_page, blkif->domid);
-
-	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-		BUG();
-
-	if (op.status) {
-		DPRINTK("Grant table operation failure !\n");
-		return op.status;
-	}
-
-	blkif->shmem_ref = shared_page;
-	blkif->shmem_handle = op.handle;
-
-	return 0;
-}
-
-static void unmap_frontend_page(struct xen_blkif *blkif)
-{
-	struct gnttab_unmap_grant_ref op;
-
-	gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
-			    GNTMAP_host_map, blkif->shmem_handle);
-
-	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-		BUG();
-}
-
 static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
 			 unsigned int evtchn)
 {
@@ -161,35 +131,29 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
 	if (blkif->irq)
 		return 0;
 
-	blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE);
-	if (!blkif->blk_ring_area)
-		return -ENOMEM;
-
-	err = map_frontend_page(blkif, shared_page);
-	if (err) {
-		free_vm_area(blkif->blk_ring_area);
+	err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring);
+	if (err < 0)
 		return err;
-	}
 
 	switch (blkif->blk_protocol) {
 	case BLKIF_PROTOCOL_NATIVE:
 	{
 		struct blkif_sring *sring;
-		sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
+		sring = (struct blkif_sring *)blkif->blk_ring;
 		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_32:
 	{
 		struct blkif_x86_32_sring *sring_x86_32;
-		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
+		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
 		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_64:
 	{
 		struct blkif_x86_64_sring *sring_x86_64;
-		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
+		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
 		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
 		break;
 	}
@@ -201,8 +165,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
 						    xen_blkif_be_int, 0,
 						    "blkif-backend", blkif);
 	if (err < 0) {
-		unmap_frontend_page(blkif);
-		free_vm_area(blkif->blk_ring_area);
+		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
 		blkif->blk_rings.common.sring = NULL;
 		return err;
 	}
@@ -228,8 +191,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 	}
 
 	if (blkif->blk_rings.common.sring) {
-		unmap_frontend_page(blkif);
-		free_vm_area(blkif->blk_ring_area);
+		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
 		blkif->blk_rings.common.sring = NULL;
 	}
 }
@@ -272,6 +234,7 @@ VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
 VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
 VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
 VBD_SHOW(f_req,  "%d\n", be->blkif->st_f_req);
+VBD_SHOW(ds_req,  "%d\n", be->blkif->st_ds_req);
 VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
 VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
 
@@ -280,6 +243,7 @@ static struct attribute *xen_vbdstat_attrs[] = {
 	&dev_attr_rd_req.attr,
 	&dev_attr_wr_req.attr,
 	&dev_attr_f_req.attr,
+	&dev_attr_ds_req.attr,
 	&dev_attr_rd_sect.attr,
 	&dev_attr_wr_sect.attr,
 	NULL
@@ -374,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 	if (q && q->flush_flags)
 		vbd->flush_support = true;
 
+	if (q && blk_queue_secdiscard(q))
+		vbd->discard_secure = true;
+
 	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
 		handle, blkif->domid);
 	return 0;
@@ -414,7 +381,60 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
 	err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
 			    "%d", state);
 	if (err)
-		xenbus_dev_fatal(dev, err, "writing feature-flush-cache");
+		dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err);
+
+	return err;
+}
+
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	struct xen_blkif *blkif = be->blkif;
+	int err;
+	int state = 0;
+	struct block_device *bdev = be->blkif->vbd.bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (blk_queue_discard(q)) {
+		err = xenbus_printf(xbt, dev->nodename,
+			"discard-granularity", "%u",
+			q->limits.discard_granularity);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+			return;
+		}
+		err = xenbus_printf(xbt, dev->nodename,
+			"discard-alignment", "%u",
+			q->limits.discard_alignment);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+			return;
+		}
+		state = 1;
+		/* Optional. */
+		err = xenbus_printf(xbt, dev->nodename,
+				    "discard-secure", "%d",
+				    blkif->vbd.discard_secure);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-secure (%d)", err);
+			return;
+		}
+	}
+	err = xenbus_printf(xbt, dev->nodename, "feature-discard",
+			    "%d", state);
+	if (err)
+		dev_warn(&dev->dev, "writing feature-discard (%d)", err);
+}
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+		      struct backend_info *be, int state)
+{
+	struct xenbus_device *dev = be->dev;
+	int err;
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+			    "%d", state);
+	if (err)
+		dev_warn(&dev->dev, "writing feature-barrier (%d)", err);
 
 	return err;
 }
@@ -582,7 +602,7 @@ static void frontend_changed(struct xenbus_device *dev,
 	case XenbusStateConnected:
 		/*
 		 * Ensure we connect even when two watches fire in
-		 * close successsion and we miss the intermediate value
+		 * close succession and we miss the intermediate value
 		 * of frontend_state.
 		 */
 		if (dev->state == XenbusStateConnected)
@@ -646,9 +666,12 @@ again:
 		return;
 	}
 
-	err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
-	if (err)
-		goto abort;
+	/* If we can't advertise it is OK. */
+	xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
+
+	xen_blkbk_discard(xbt, be);
+
+	xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
 
 	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 			    (unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -751,17 +774,14 @@ static const struct xenbus_device_id xen_blkbk_ids[] = {
 };
 
 
-static struct xenbus_driver xen_blkbk = {
-	.name = "vbd",
-	.owner = THIS_MODULE,
-	.ids = xen_blkbk_ids,
+static DEFINE_XENBUS_DRIVER(xen_blkbk, ,
 	.probe = xen_blkbk_probe,
 	.remove = xen_blkbk_remove,
 	.otherend_changed = frontend_changed
-};
+);
 
 
 int xen_blkif_xenbus_init(void)
 {
-	return xenbus_register_backend(&xen_blkbk);
+	return xenbus_register_backend(&xen_blkbk_driver);
 }
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9ea8c2576c70..2c2d2e5c1597 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -43,6 +43,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
+#include <linux/bitmap.h>
 
 #include <xen/xen.h>
 #include <xen/xenbus.h>
@@ -81,6 +82,7 @@ static const struct block_device_operations xlvbd_block_fops;
  */
 struct blkfront_info
 {
+	spinlock_t io_lock;
 	struct mutex mutex;
 	struct xenbus_device *xbdev;
 	struct gendisk *gd;
@@ -98,11 +100,13 @@ struct blkfront_info
 	unsigned long shadow_free;
 	unsigned int feature_flush;
 	unsigned int flush_op;
+	unsigned int feature_discard:1;
+	unsigned int feature_secdiscard:1;
+	unsigned int discard_granularity;
+	unsigned int discard_alignment;
 	int is_ready;
 };
 
-static DEFINE_SPINLOCK(blkif_io_lock);
-
 static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);
@@ -132,19 +136,41 @@ static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
 	BUG_ON(free >= BLK_RING_SIZE);
-	info->shadow_free = info->shadow[free].req.id;
-	info->shadow[free].req.id = 0x0fffffee; /* debug */
+	info->shadow_free = info->shadow[free].req.u.rw.id;
+	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
 	return free;
 }
 
-static void add_id_to_freelist(struct blkfront_info *info,
+static int add_id_to_freelist(struct blkfront_info *info,
 			       unsigned long id)
 {
-	info->shadow[id].req.id  = info->shadow_free;
+	if (info->shadow[id].req.u.rw.id != id)
+		return -EINVAL;
+	if (info->shadow[id].request == NULL)
+		return -EINVAL;
+	info->shadow[id].req.u.rw.id  = info->shadow_free;
 	info->shadow[id].request = NULL;
 	info->shadow_free = id;
+	return 0;
 }
 
+static const char *op_name(int op)
+{
+	static const char *const names[] = {
+		[BLKIF_OP_READ] = "read",
+		[BLKIF_OP_WRITE] = "write",
+		[BLKIF_OP_WRITE_BARRIER] = "barrier",
+		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
+		[BLKIF_OP_DISCARD] = "discard" };
+
+	if (op < 0 || op >= ARRAY_SIZE(names))
+		return "unknown";
+
+	if (!names[op])
+		return "reserved";
+
+	return names[op];
+}
 static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
 {
 	unsigned int end = minor + nr;
@@ -153,7 +179,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
 	if (end > nr_minors) {
 		unsigned long *bitmap, *old;
 
-		bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+		bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
 				 GFP_KERNEL);
 		if (bitmap == NULL)
 			return -ENOMEM;
@@ -173,8 +199,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
 
 	spin_lock(&minor_lock);
 	if (find_next_bit(minors, end, minor) >= end) {
-		for (; minor < end; ++minor)
-			__set_bit(minor, minors);
+		bitmap_set(minors, minor, nr);
 		rc = 0;
 	} else
 		rc = -EBUSY;
@@ -189,8 +214,7 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
 
 	BUG_ON(end > nr_minors);
 	spin_lock(&minor_lock);
-	for (; minor < end; ++minor)
-		__clear_bit(minor, minors);
+	bitmap_clear(minors,  minor, nr);
 	spin_unlock(&minor_lock);
 }
 
@@ -284,9 +308,9 @@ static int blkif_queue_request(struct request *req)
 	id = get_id_from_freelist(info);
 	info->shadow[id].request = req;
 
-	ring_req->id = id;
+	ring_req->u.rw.id = id;
 	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
-	ring_req->handle = info->handle;
+	ring_req->u.rw.handle = info->handle;
 
 	ring_req->operation = rq_data_dir(req) ?
 		BLKIF_OP_WRITE : BLKIF_OP_READ;
@@ -302,29 +326,41 @@ static int blkif_queue_request(struct request *req)
 		ring_req->operation = info->flush_op;
 	}
 
-	ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
-	BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-	for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
-		buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
-		fsect = sg->offset >> 9;
-		lsect = fsect + (sg->length >> 9) - 1;
-		/* install a grant reference. */
-		ref = gnttab_claim_grant_reference(&gref_head);
-		BUG_ON(ref == -ENOSPC);
-
-		gnttab_grant_foreign_access_ref(
-				ref,
-				info->xbdev->otherend_id,
-				buffer_mfn,
-				rq_data_dir(req) );
-
-		info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
-		ring_req->u.rw.seg[i] =
-				(struct blkif_request_segment) {
-					.gref       = ref,
-					.first_sect = fsect,
-					.last_sect  = lsect };
+	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
+		/* id, sector_number and handle are set above. */
+		ring_req->operation = BLKIF_OP_DISCARD;
+		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+		else
+			ring_req->u.discard.flag = 0;
+	} else {
+		ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
+							   info->sg);
+		BUG_ON(ring_req->u.rw.nr_segments >
+		       BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+		for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
+			buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
+			fsect = sg->offset >> 9;
+			lsect = fsect + (sg->length >> 9) - 1;
+			/* install a grant reference. */
+			ref = gnttab_claim_grant_reference(&gref_head);
+			BUG_ON(ref == -ENOSPC);
+
+			gnttab_grant_foreign_access_ref(
+					ref,
+					info->xbdev->otherend_id,
+					buffer_mfn,
+					rq_data_dir(req));
+
+			info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
+			ring_req->u.rw.seg[i] =
+					(struct blkif_request_segment) {
+						.gref       = ref,
+						.first_sect = fsect,
+						.last_sect  = lsect };
+		}
 	}
 
 	info->ring.req_prod_pvt++;
@@ -370,7 +406,9 @@ static void do_blkif_request(struct request_queue *rq)
 
 		blk_start_request(req);
 
-		if (req->cmd_type != REQ_TYPE_FS) {
+		if ((req->cmd_type != REQ_TYPE_FS) ||
+		    ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
+		    !info->flush_op)) {
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
@@ -399,13 +437,23 @@ wait:
 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 {
 	struct request_queue *rq;
+	struct blkfront_info *info = gd->private_data;
 
-	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+	rq = blk_init_queue(do_blkif_request, &info->io_lock);
 	if (rq == NULL)
 		return -1;
 
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
+	if (info->feature_discard) {
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
+		blk_queue_max_discard_sectors(rq, get_capacity(gd));
+		rq->limits.discard_granularity = info->discard_granularity;
+		rq->limits.discard_alignment = info->discard_alignment;
+		if (info->feature_secdiscard)
+			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+	}
+
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_logical_block_size(rq, sector_size);
 	blk_queue_max_hw_sectors(rq, 512);
@@ -500,6 +548,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
 	return 0;
 }
 
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+	if (n >= 26)
+		ptr = encode_disk_name(ptr, n / 26 - 1);
+	*ptr = 'a' + n % 26;
+	return ptr + 1;
+}
+
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
 			       u16 vdisk_info, u16 sector_size)
@@ -510,6 +566,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	unsigned int offset;
 	int minor;
 	int nr_parts;
+	char *ptr;
 
 	BUG_ON(info->gd != NULL);
 	BUG_ON(info->rq != NULL);
@@ -534,7 +591,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 					"emulated IDE disks,\n\t choose an xvd device name"
 					"from xvde on\n", info->vdevice);
 	}
-	err = -ENODEV;
+	if (minor >> MINORBITS) {
+		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+			info->vdevice, minor);
+		return -ENODEV;
+	}
 
 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
@@ -548,23 +609,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	if (gd == NULL)
 		goto release;
 
-	if (nr_minors > 1) {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
-		else
-			sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
-				'a' + ((offset / 26)-1), 'a' + (offset % 26));
-	} else {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
-				'a' + offset,
-				minor & (nr_parts - 1));
-		else
-			sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
-				'a' + ((offset / 26) - 1),
-				'a' + (offset % 26),
-				minor & (nr_parts - 1));
-	}
+	strcpy(gd->disk_name, DEV_NAME);
+	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+	if (nr_minors > 1)
+		*ptr = 0;
+	else
+		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+			 "%d", minor & (nr_parts - 1));
 
 	gd->major = XENVBD_MAJOR;
 	gd->first_minor = minor;
@@ -608,14 +660,14 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 	if (info->rq == NULL)
 		return;
 
-	spin_lock_irqsave(&blkif_io_lock, flags);
+	spin_lock_irqsave(&info->io_lock, flags);
 
 	/* No more blkif_request(). */
 	blk_stop_queue(info->rq);
 
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
+	spin_unlock_irqrestore(&info->io_lock, flags);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work_sync(&info->work);
@@ -647,16 +699,16 @@ static void blkif_restart_queue(struct work_struct *work)
 {
 	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	if (info->connected == BLKIF_STATE_CONNECTED)
 		kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 }
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
 	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 	/* No more blkif_request(). */
@@ -664,7 +716,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		blk_stop_queue(info->rq);
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work_sync(&info->work);
@@ -685,7 +737,9 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 static void blkif_completion(struct blk_shadow *s)
 {
 	int i;
-	for (i = 0; i < s->req.nr_segments; i++)
+	/* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
+	 * flag. */
+	for (i = 0; i < s->req.u.rw.nr_segments; i++)
 		gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
 }
 
@@ -698,10 +752,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	struct blkfront_info *info = (struct blkfront_info *)dev_id;
 	int error;
 
-	spin_lock_irqsave(&blkif_io_lock, flags);
+	spin_lock_irqsave(&info->io_lock, flags);
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-		spin_unlock_irqrestore(&blkif_io_lock, flags);
+		spin_unlock_irqrestore(&info->io_lock, flags);
 		return IRQ_HANDLED;
 	}
 
@@ -714,29 +768,55 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 		bret = RING_GET_RESPONSE(&info->ring, i);
 		id   = bret->id;
+		/*
+		 * The backend has messed up and given us an id that we would
+		 * never have given to it (we stamp it up to BLK_RING_SIZE -
+		 * look in get_id_from_freelist.
+		 */
+		if (id >= BLK_RING_SIZE) {
+			WARN(1, "%s: response to %s has incorrect id (%ld)\n",
+			     info->gd->disk_name, op_name(bret->operation), id);
+			/* We can't safely get the 'struct request' as
+			 * the id is busted. */
+			continue;
+		}
 		req  = info->shadow[id].request;
 
-		blkif_completion(&info->shadow[id]);
+		if (bret->operation != BLKIF_OP_DISCARD)
+			blkif_completion(&info->shadow[id]);
 
-		add_id_to_freelist(info, id);
+		if (add_id_to_freelist(info, id)) {
+			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
+			     info->gd->disk_name, op_name(bret->operation), id);
+			continue;
+		}
 
 		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
 		switch (bret->operation) {
+		case BLKIF_OP_DISCARD:
+			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+				struct request_queue *rq = info->rq;
+				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+					   info->gd->disk_name, op_name(bret->operation));
+				error = -EOPNOTSUPP;
+				info->feature_discard = 0;
+				info->feature_secdiscard = 0;
+				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+			}
+			__blk_end_request_all(req, error);
+			break;
 		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_WRITE_BARRIER:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
-				printk(KERN_WARNING "blkfront: %s: write %s op failed\n",
-				       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
-				       "barrier" :  "flush disk cache",
-				       info->gd->disk_name);
+				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+				       info->gd->disk_name, op_name(bret->operation));
 				error = -EOPNOTSUPP;
 			}
 			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
-				     info->shadow[id].req.nr_segments == 0)) {
-				printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
-				       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
-				       "barrier" :  "flush disk cache",
-				       info->gd->disk_name);
+				     info->shadow[id].req.u.rw.nr_segments == 0)) {
+				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
+				       info->gd->disk_name, op_name(bret->operation));
 				error = -EOPNOTSUPP;
 			}
 			if (unlikely(error)) {
@@ -772,7 +852,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 	kick_pending_request_queues(info);
 
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
+	spin_unlock_irqrestore(&info->io_lock, flags);
 
 	return IRQ_HANDLED;
 }
@@ -808,9 +888,8 @@ static int setup_blkring(struct xenbus_device *dev,
 	if (err)
 		goto fail;
 
-	err = bind_evtchn_to_irqhandler(info->evtchn,
-					blkif_interrupt,
-					IRQF_SAMPLE_RANDOM, "blkif", info);
+	err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
+					"blkif", info);
 	if (err <= 0) {
 		xenbus_dev_fatal(dev, err,
 				 "bind_evtchn_to_irqhandler failed");
@@ -947,14 +1026,15 @@ static int blkfront_probe(struct xenbus_device *dev,
 	}
 
 	mutex_init(&info->mutex);
+	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
 	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.id = i+1;
-	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+		info->shadow[i].req.u.rw.id = i+1;
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -988,9 +1068,9 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Stage 2: Set up free list. */
 	memset(&info->shadow, 0, sizeof(info->shadow));
 	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.id = i+1;
+		info->shadow[i].req.u.rw.id = i+1;
 	info->shadow_free = info->ring.req_prod_pvt;
-	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
 	/* Stage 3: Find pending requests and requeue them. */
 	for (i = 0; i < BLK_RING_SIZE; i++) {
@@ -1003,17 +1083,19 @@ static int blkif_recover(struct blkfront_info *info)
 		*req = copy[i].req;
 
 		/* We get a new request id, and must reset the shadow state. */
-		req->id = get_id_from_freelist(info);
-		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+		req->u.rw.id = get_id_from_freelist(info);
+		memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
 
+		if (req->operation != BLKIF_OP_DISCARD) {
 		/* Rewrite any grant references invalidated by susp/resume. */
-		for (j = 0; j < req->nr_segments; j++)
-			gnttab_grant_foreign_access_ref(
-				req->u.rw.seg[j].gref,
-				info->xbdev->otherend_id,
-				pfn_to_mfn(info->shadow[req->id].frame[j]),
-				rq_data_dir(info->shadow[req->id].request));
-		info->shadow[req->id].req = *req;
+			for (j = 0; j < req->u.rw.nr_segments; j++)
+				gnttab_grant_foreign_access_ref(
+					req->u.rw.seg[j].gref,
+					info->xbdev->otherend_id,
+					pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
+					rq_data_dir(info->shadow[req->u.rw.id].request));
+		}
+		info->shadow[req->u.rw.id].req = *req;
 
 		info->ring.req_prod_pvt++;
 	}
@@ -1022,7 +1104,7 @@ static int blkif_recover(struct blkfront_info *info)
 
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
@@ -1033,7 +1115,7 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	return 0;
 }
@@ -1098,6 +1180,41 @@ blkfront_closing(struct blkfront_info *info)
 	bdput(bdev);
 }
 
+static void blkfront_setup_discard(struct blkfront_info *info)
+{
+	int err;
+	char *type;
+	unsigned int discard_granularity;
+	unsigned int discard_alignment;
+	unsigned int discard_secure;
+
+	type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
+	if (IS_ERR(type))
+		return;
+
+	info->feature_secdiscard = 0;
+	if (strncmp(type, "phy", 3) == 0) {
+		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			"discard-granularity", "%u", &discard_granularity,
+			"discard-alignment", "%u", &discard_alignment,
+			NULL);
+		if (!err) {
+			info->feature_discard = 1;
+			info->discard_granularity = discard_granularity;
+			info->discard_alignment = discard_alignment;
+		}
+		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "discard-secure", "%d", &discard_secure,
+			    NULL);
+		if (!err)
+			info->feature_secdiscard = discard_secure;
+
+	} else if (strncmp(type, "file", 4) == 0)
+		info->feature_discard = 1;
+
+	kfree(type);
+}
+
 /*
  * Invoked when the backend is finally 'ready' (and has told produced
  * the details about the physical device - #sectors, size, etc).
@@ -1108,7 +1225,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	unsigned long sector_size;
 	unsigned int binfo;
 	int err;
-	int barrier, flush;
+	int barrier, flush, discard;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -1178,7 +1295,14 @@ static void blkfront_connect(struct blkfront_info *info)
 		info->feature_flush = REQ_FLUSH;
 		info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
 	}
-		
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-discard", "%d", &discard,
+			    NULL);
+
+	if (!err && discard)
+		blkfront_setup_discard(info);
+
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
@@ -1189,10 +1313,10 @@ static void blkfront_connect(struct blkfront_info *info)
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	/* Kick pending requests. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
 	kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
 
@@ -1322,7 +1446,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
 	mutex_lock(&blkfront_mutex);
 
 	bdev = bdget_disk(disk, 0);
-	bdput(bdev);
 
 	if (bdev->bd_openers)
 		goto out;
@@ -1353,6 +1476,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
 	}
 
 out:
+	bdput(bdev);
 	mutex_unlock(&blkfront_mutex);
 	return 0;
 }
@@ -1372,36 +1496,46 @@ static const struct xenbus_device_id blkfront_ids[] = {
 	{ "" }
 };
 
-static struct xenbus_driver blkfront = {
-	.name = "vbd",
-	.owner = THIS_MODULE,
-	.ids = blkfront_ids,
+static DEFINE_XENBUS_DRIVER(blkfront, ,
 	.probe = blkfront_probe,
 	.remove = blkfront_remove,
 	.resume = blkfront_resume,
 	.otherend_changed = blkback_changed,
 	.is_ready = blkfront_is_ready,
-};
+);
 
 static int __init xlblk_init(void)
 {
+	int ret;
+
 	if (!xen_domain())
 		return -ENODEV;
 
+	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+		return -ENODEV;
+
 	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
 		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
 		       XENVBD_MAJOR, DEV_NAME);
 		return -ENODEV;
 	}
 
-	return xenbus_register_frontend(&blkfront);
+	ret = xenbus_register_frontend(&blkfront_driver);
+	if (ret) {
+		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+		return ret;
+	}
+
+	return 0;
 }
 module_init(xlblk_init);
 
 
 static void __exit xlblk_exit(void)
 {
-	return xenbus_unregister_driver(&blkfront);
+	xenbus_unregister_driver(&blkfront_driver);
+	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+	kfree(minors);
 }
 module_exit(xlblk_exit);
 
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index fb1975d82a73..1a17e338735e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -456,7 +456,7 @@ static inline void ace_fsm_yieldirq(struct ace_device *ace)
 {
 	dev_dbg(ace->dev, "ace_fsm_yieldirq()\n");
 
-	if (ace->irq == NO_IRQ)
+	if (!ace->irq)
 		/* No IRQ assigned, so need to poll */
 		tasklet_schedule(&ace->fsm_tasklet);
 	ace->fsm_continue_flag = 0;
@@ -1034,12 +1034,12 @@ static int __devinit ace_setup(struct ace_device *ace)
 		ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ);
 
 	/* Now we can hook up the irq handler */
-	if (ace->irq != NO_IRQ) {
+	if (ace->irq) {
 		rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace);
 		if (rc) {
 			/* Failure - fall back to polled mode */
 			dev_err(ace->dev, "request_irq failed\n");
-			ace->irq = NO_IRQ;
+			ace->irq = 0;
 		}
 	}
 
@@ -1086,7 +1086,7 @@ static void __devexit ace_teardown(struct ace_device *ace)
 
 	tasklet_kill(&ace->fsm_tasklet);
 
-	if (ace->irq != NO_IRQ)
+	if (ace->irq)
 		free_irq(ace->irq, ace);
 
 	iounmap(ace->baseaddr);
@@ -1156,7 +1156,7 @@ static int __devinit ace_probe(struct platform_device *dev)
 	resource_size_t physaddr = 0;
 	int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */
 	u32 id = dev->id;
-	int irq = NO_IRQ;
+	int irq = 0;
 	int i;
 
 	dev_dbg(&dev->dev, "ace_probe(%p)\n", dev);