aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/DAC960.c41
-rw-r--r--drivers/block/Kconfig4
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/amiflop.c2
-rw-r--r--drivers/block/aoe/aoeblk.c15
-rw-r--r--drivers/block/aoe/aoechr.c3
-rw-r--r--drivers/block/aoe/aoecmd.c1
-rw-r--r--drivers/block/brd.c33
-rw-r--r--drivers/block/cciss.c94
-rw-r--r--drivers/block/cciss.h1
-rw-r--r--drivers/block/cciss_scsi.c16
-rw-r--r--drivers/block/cpqarray.c2
-rw-r--r--drivers/block/drbd/drbd_actlog.c112
-rw-r--r--drivers/block/drbd/drbd_bitmap.c231
-rw-r--r--drivers/block/drbd/drbd_int.h144
-rw-r--r--drivers/block/drbd/drbd_main.c424
-rw-r--r--drivers/block/drbd/drbd_nl.c98
-rw-r--r--drivers/block/drbd/drbd_proc.c5
-rw-r--r--drivers/block/drbd/drbd_receiver.c139
-rw-r--r--drivers/block/drbd/drbd_req.c247
-rw-r--r--drivers/block/drbd/drbd_req.h19
-rw-r--r--drivers/block/drbd/drbd_worker.c43
-rw-r--r--drivers/block/floppy.c243
-rw-r--r--drivers/block/hd.c1
-rw-r--r--drivers/block/loop.c343
-rw-r--r--drivers/block/mg_disk.c13
-rw-r--r--drivers/block/mtip32xx/Kconfig9
-rw-r--r--drivers/block/mtip32xx/Makefile5
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c4266
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h463
-rw-r--r--drivers/block/nbd.c340
-rw-r--r--drivers/block/nvme.c11
-rw-r--r--drivers/block/paride/bpck6.c5
-rw-r--r--drivers/block/paride/pcd.c2
-rw-r--r--drivers/block/paride/pd.c3
-rw-r--r--drivers/block/paride/pf.c4
-rw-r--r--drivers/block/paride/pg.c4
-rw-r--r--drivers/block/paride/pt.c4
-rw-r--r--drivers/block/pktcdvd.c21
-rw-r--r--drivers/block/ps3disk.c1
-rw-r--r--drivers/block/ps3vram.c7
-rw-r--r--drivers/block/rbd.c1545
-rw-r--r--drivers/block/rbd_types.h5
-rw-r--r--drivers/block/sunvdc.c5
-rw-r--r--drivers/block/swim.c1
-rw-r--r--drivers/block/swim3.c362
-rw-r--r--drivers/block/sx8.c14
-rw-r--r--drivers/block/ub.c42
-rw-r--r--drivers/block/umem.c21
-rw-r--r--drivers/block/viodasd.c809
-rw-r--r--drivers/block/virtio_blk.c276
-rw-r--r--drivers/block/xd.c3
-rw-r--r--drivers/block/xen-blkback/blkback.c138
-rw-r--r--drivers/block/xen-blkback/common.h135
-rw-r--r--drivers/block/xen-blkback/xenbus.c136
-rw-r--r--drivers/block/xen-blkfront.c348
-rw-r--r--drivers/block/xsysace.c10
57 files changed, 8284 insertions, 2986 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index e086fbbbe853..9a13e889837e 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -1177,7 +1177,8 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
int TimeoutCounter;
int i;
-
+ memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
+
if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
return DAC960_Failure(Controller, "DMA mask out of range");
Controller->BounceBufferLimit = DMA_BIT_MASK(32);
@@ -4627,7 +4628,8 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
DAC960_Controller_T *Controller = Command->Controller;
DAC960_CommandType_T CommandType = Command->CommandType;
DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
- DAC960_V2_IOCTL_Opcode_T CommandOpcode = CommandMailbox->Common.IOCTL_Opcode;
+ DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode;
+ DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode;
DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus;
if (CommandType == DAC960_ReadCommand ||
@@ -4699,7 +4701,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
{
if (Controller->ShutdownMonitoringTimer)
return;
- if (CommandOpcode == DAC960_V2_GetControllerInfo)
+ if (IOCTLOpcode == DAC960_V2_GetControllerInfo)
{
DAC960_V2_ControllerInfo_T *NewControllerInfo =
Controller->V2.NewControllerInformation;
@@ -4719,14 +4721,14 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
memcpy(ControllerInfo, NewControllerInfo,
sizeof(DAC960_V2_ControllerInfo_T));
}
- else if (CommandOpcode == DAC960_V2_GetEvent)
+ else if (IOCTLOpcode == DAC960_V2_GetEvent)
{
if (CommandStatus == DAC960_V2_NormalCompletion) {
DAC960_V2_ReportEvent(Controller, Controller->V2.Event);
}
Controller->V2.NextEventSequenceNumber++;
}
- else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
+ else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
CommandStatus == DAC960_V2_NormalCompletion)
{
DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo =
@@ -4915,7 +4917,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
NewPhysicalDeviceInfo->LogicalUnit++;
Controller->V2.PhysicalDeviceIndex++;
}
- else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
+ else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
{
unsigned int DeviceIndex;
for (DeviceIndex = Controller->V2.PhysicalDeviceIndex;
@@ -4938,7 +4940,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
}
Controller->V2.NeedPhysicalDeviceInformation = false;
}
- else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
+ else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
CommandStatus == DAC960_V2_NormalCompletion)
{
DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo =
@@ -5065,7 +5067,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
[LogicalDeviceNumber] = true;
NewLogicalDeviceInfo->LogicalDeviceNumber++;
}
- else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
+ else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
{
int LogicalDriveNumber;
for (LogicalDriveNumber = 0;
@@ -6578,24 +6580,21 @@ static const struct file_operations dac960_user_command_proc_fops = {
static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
{
- struct proc_dir_entry *StatusProcEntry;
struct proc_dir_entry *ControllerProcEntry;
- struct proc_dir_entry *UserCommandProcEntry;
if (DAC960_ProcDirectoryEntry == NULL) {
- DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
- StatusProcEntry = proc_create("status", 0,
- DAC960_ProcDirectoryEntry,
- &dac960_proc_fops);
+ DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
+ proc_create("status", 0, DAC960_ProcDirectoryEntry,
+ &dac960_proc_fops);
}
- sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
- ControllerProcEntry = proc_mkdir(Controller->ControllerName,
- DAC960_ProcDirectoryEntry);
- proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
- proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
- UserCommandProcEntry = proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
- Controller->ControllerProcEntry = ControllerProcEntry;
+ sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
+ ControllerProcEntry = proc_mkdir(Controller->ControllerName,
+ DAC960_ProcDirectoryEntry);
+ proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
+ proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
+ proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+ Controller->ControllerProcEntry = ControllerProcEntry;
}
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 35e56e1c948f..a796407123c7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -116,6 +116,8 @@ config PARIDE
source "drivers/block/paride/Kconfig"
+source "drivers/block/mtip32xx/Kconfig"
+
config BLK_CPQ_DA
tristate "Compaq SMART2 support"
depends on PCI && VIRT_TO_BUS
@@ -352,7 +354,7 @@ config BLK_DEV_SX8
Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
config BLK_DEV_UB
- tristate "Low Performance USB Block driver"
+ tristate "Low Performance USB Block driver (deprecated)"
depends on USB
help
This driver supports certain USB attached storage devices
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 349539ad3ad9..5b795059f8fb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -40,5 +40,6 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8eba86bba599..386146d792d1 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -63,7 +63,7 @@
#include <linux/mutex.h>
#include <linux/amifdreg.h>
#include <linux/amifd.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/interrupt.h>
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 528f6318ded1..321de7b6c442 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -15,6 +15,7 @@
#include <linux/genhd.h>
#include <linux/netdevice.h>
#include <linux/mutex.h>
+#include <linux/export.h>
#include "aoe.h"
static DEFINE_MUTEX(aoeblk_mutex);
@@ -159,7 +160,7 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
return 0;
}
-static int
+static void
aoeblk_make_request(struct request_queue *q, struct bio *bio)
{
struct sk_buff_head queue;
@@ -172,25 +173,25 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
if (bio == NULL) {
printk(KERN_ERR "aoe: bio is NULL\n");
BUG();
- return 0;
+ return;
}
d = bio->bi_bdev->bd_disk->private_data;
if (d == NULL) {
printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
BUG();
bio_endio(bio, -ENXIO);
- return 0;
+ return;
} else if (bio->bi_io_vec == NULL) {
printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
BUG();
bio_endio(bio, -ENXIO);
- return 0;
+ return;
}
buf = mempool_alloc(d->bufpool, GFP_NOIO);
if (buf == NULL) {
printk(KERN_INFO "aoe: buf allocation failure\n");
bio_endio(bio, -ENOMEM);
- return 0;
+ return;
}
memset(buf, 0, sizeof(*buf));
INIT_LIST_HEAD(&buf->bufs);
@@ -211,7 +212,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
spin_unlock_irqrestore(&d->lock, flags);
mempool_free(buf, d->bufpool);
bio_endio(bio, -ENXIO);
- return 0;
+ return;
}
list_add_tail(&buf->bufs, &d->bufq);
@@ -222,8 +223,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
spin_unlock_irqrestore(&d->lock, flags);
aoenet_xmit(&queue);
-
- return 0;
}
static int
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 146296ca4965..e86d2062a164 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/skbuff.h>
+#include <linux/export.h>
#include "aoe.h"
enum {
@@ -269,7 +270,7 @@ static const struct file_operations aoe_fops = {
.llseek = noop_llseek,
};
-static char *aoe_devnode(struct device *dev, mode_t *mode)
+static char *aoe_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
}
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index de0435e63b02..887f68f6d79a 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -35,6 +35,7 @@ new_skb(ulong len)
skb_reset_mac_header(skb);
skb_reset_network_header(skb);
skb->protocol = __constant_htons(ETH_P_AOE);
+ skb_checksum_none_assert(skb);
}
return skb;
}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index dba1c32e1ddf..531ceb31d0ff 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -17,7 +17,7 @@
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
-#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
@@ -242,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
- dst = kmap_atomic(page, KM_USER1);
+ dst = kmap_atomic(page);
memcpy(dst + offset, src, copy);
- kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(dst);
if (copy < n) {
src += copy;
@@ -253,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
- dst = kmap_atomic(page, KM_USER1);
+ dst = kmap_atomic(page);
memcpy(dst, src, copy);
- kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(dst);
}
}
@@ -273,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
copy = min_t(size_t, n, PAGE_SIZE - offset);
page = brd_lookup_page(brd, sector);
if (page) {
- src = kmap_atomic(page, KM_USER1);
+ src = kmap_atomic(page);
memcpy(dst, src + offset, copy);
- kunmap_atomic(src, KM_USER1);
+ kunmap_atomic(src);
} else
memset(dst, 0, copy);
@@ -285,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
copy = n - copy;
page = brd_lookup_page(brd, sector);
if (page) {
- src = kmap_atomic(page, KM_USER1);
+ src = kmap_atomic(page);
memcpy(dst, src, copy);
- kunmap_atomic(src, KM_USER1);
+ kunmap_atomic(src);
} else
memset(dst, 0, copy);
}
@@ -309,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
goto out;
}
- mem = kmap_atomic(page, KM_USER0);
+ mem = kmap_atomic(page);
if (rw == READ) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
@@ -317,13 +317,13 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
flush_dcache_page(page);
copy_to_brd(brd, mem + off, sector, len);
}
- kunmap_atomic(mem, KM_USER0);
+ kunmap_atomic(mem);
out:
return err;
}
-static int brd_make_request(struct request_queue *q, struct bio *bio)
+static void brd_make_request(struct request_queue *q, struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
struct brd_device *brd = bdev->bd_disk->private_data;
@@ -359,8 +359,6 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
out:
bio_endio(bio, err);
-
- return 0;
}
#ifdef CONFIG_BLK_DEV_XIP
@@ -404,14 +402,13 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
error = -EBUSY;
if (bdev->bd_openers <= 1) {
/*
- * Invalidate the cache first, so it isn't written
- * back to the device.
+ * Kill the cache first, so it isn't written back to the
+ * device.
*
* Another thread might instantiate more buffercache here,
* but there is not much we can do to close that race.
*/
- invalidate_bh_lrus();
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+ kill_bdev(bdev);
brd_free_pages(brd);
error = 0;
}
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8f4ef656a1af..b0f553b26d0f 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -24,6 +24,7 @@
#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/pci.h>
+#include <linux/pci-aspm.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/delay.h>
@@ -68,6 +69,10 @@ static int cciss_tape_cmds = 6;
module_param(cciss_tape_cmds, int, 0644);
MODULE_PARM_DESC(cciss_tape_cmds,
"number of commands to allocate for tape devices (default: 6)");
+static int cciss_simple_mode;
+module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_simple_mode,
+ "Use 'simple mode' rather than 'performant mode'");
static DEFINE_MUTEX(cciss_mutex);
static struct proc_dir_entry *proc_cciss;
@@ -176,6 +181,7 @@ static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol,
unsigned int block_size, InquiryData_struct *inq_buff,
drive_info_struct *drv);
static void __devinit cciss_interrupt_mode(ctlr_info_t *);
+static int __devinit cciss_enter_simple_mode(struct ctlr_info *h);
static void start_io(ctlr_info_t *h);
static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size,
__u8 page_code, unsigned char scsi3addr[],
@@ -388,7 +394,7 @@ static void cciss_seq_show_header(struct seq_file *seq)
h->product_name,
(unsigned long)h->board_id,
h->firm_ver[0], h->firm_ver[1], h->firm_ver[2],
- h->firm_ver[3], (unsigned int)h->intr[PERF_MODE_INT],
+ h->firm_ver[3], (unsigned int)h->intr[h->intr_mode],
h->num_luns,
h->Qdepth, h->commands_outstanding,
h->maxQsinceinit, h->max_outstanding, h->maxSG);
@@ -636,6 +642,18 @@ static ssize_t host_store_rescan(struct device *dev,
}
static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan);
+static ssize_t host_show_transport_mode(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ctlr_info *h = to_hba(dev);
+
+ return snprintf(buf, 20, "%s\n",
+ h->transMethod & CFGTBL_Trans_Performant ?
+ "performant" : "simple");
+}
+static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL);
+
static ssize_t dev_show_unique_id(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -808,6 +826,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
static struct attribute *cciss_host_attrs[] = {
&dev_attr_rescan.attr,
&dev_attr_resettable.attr,
+ &dev_attr_transport_mode.attr,
NULL
};
@@ -1716,7 +1735,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
case CCISS_BIG_PASSTHRU:
return cciss_bigpassthru(h, argp);
- /* scsi_cmd_ioctl handles these, below, though some are not */
+ /* scsi_cmd_blk_ioctl handles these, below, though some are not */
/* very meaningful for cciss. SG_IO is the main one people want. */
case SG_GET_VERSION_NUM:
@@ -1727,9 +1746,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
case SG_EMULATED_HOST:
case SG_IO:
case SCSI_IOCTL_SEND_COMMAND:
- return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
+ return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
- /* scsi_cmd_ioctl would normally handle these, below, but */
+ /* scsi_cmd_blk_ioctl would normally handle these, below, but */
/* they aren't a good fit for cciss, as CD-ROMs are */
/* not supported, and we don't have any bus/target/lun */
/* which we present to the kernel. */
@@ -2582,6 +2601,8 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
c->Request.Timeout = 0;
c->Request.CDB[0] = BMIC_WRITE;
c->Request.CDB[6] = BMIC_CACHE_FLUSH;
+ c->Request.CDB[7] = (size >> 8) & 0xFF;
+ c->Request.CDB[8] = size & 0xFF;
break;
case TEST_UNIT_READY:
c->Request.CDBLen = 6;
@@ -3984,6 +4005,9 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h)
{
__u32 trans_support;
+ if (cciss_simple_mode)
+ return;
+
dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n");
/* Attempt to put controller into performant mode if supported */
/* Does board support performant mode? */
@@ -4081,7 +4105,7 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *h)
default_int_mode:
#endif /* CONFIG_PCI_MSI */
/* if we get here we're going to use the default interrupt mode */
- h->intr[PERF_MODE_INT] = h->pdev->irq;
+ h->intr[h->intr_mode] = h->pdev->irq;
return;
}
@@ -4298,6 +4322,10 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
dev_warn(&h->pdev->dev, "controller appears to be disabled\n");
return -ENODEV;
}
+
+ pci_disable_link_state(h->pdev, PCIE_LINK_STATE_L0S |
+ PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM);
+
err = pci_enable_device(h->pdev);
if (err) {
dev_warn(&h->pdev->dev, "Unable to Enable PCI device\n");
@@ -4341,6 +4369,9 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
}
cciss_enable_scsi_prefetch(h);
cciss_p600_dma_prefetch_quirk(h);
+ err = cciss_enter_simple_mode(h);
+ if (err)
+ goto err_out_free_res;
cciss_put_controller_into_performant_mode(h);
return 0;
@@ -4533,6 +4564,13 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
pmcsr |= PCI_D0;
pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
+
+ /*
+ * The P600 requires a small delay when changing states.
+ * Otherwise we may think the board did not reset and we bail.
+ * This for kdump only and is particular to the P600.
+ */
+ msleep(500);
}
return 0;
}
@@ -4843,20 +4881,20 @@ static int cciss_request_irq(ctlr_info_t *h,
irqreturn_t (*intxhandler)(int, void *))
{
if (h->msix_vector || h->msi_vector) {
- if (!request_irq(h->intr[PERF_MODE_INT], msixhandler,
- IRQF_DISABLED, h->devname, h))
+ if (!request_irq(h->intr[h->intr_mode], msixhandler,
+ 0, h->devname, h))
return 0;
dev_err(&h->pdev->dev, "Unable to get msi irq %d"
- " for %s\n", h->intr[PERF_MODE_INT],
+ " for %s\n", h->intr[h->intr_mode],
h->devname);
return -1;
}
- if (!request_irq(h->intr[PERF_MODE_INT], intxhandler,
- IRQF_DISABLED, h->devname, h))
+ if (!request_irq(h->intr[h->intr_mode], intxhandler,
+ IRQF_SHARED, h->devname, h))
return 0;
dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
- h->intr[PERF_MODE_INT], h->devname);
+ h->intr[h->intr_mode], h->devname);
return -1;
}
@@ -4887,7 +4925,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
{
int ctlr = h->ctlr;
- free_irq(h->intr[PERF_MODE_INT], h);
+ free_irq(h->intr[h->intr_mode], h);
#ifdef CONFIG_PCI_MSI
if (h->msix_vector)
pci_disable_msix(h->pdev);
@@ -4953,6 +4991,7 @@ reinit_after_soft_reset:
h = hba[i];
h->pdev = pdev;
h->busy_initializing = 1;
+ h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT;
INIT_LIST_HEAD(&h->cmpQ);
INIT_LIST_HEAD(&h->reqQ);
mutex_init(&h->busy_shutting_down);
@@ -5009,7 +5048,7 @@ reinit_after_soft_reset:
dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
h->devname, pdev->device, pci_name(pdev),
- h->intr[PERF_MODE_INT], dac ? "" : " not");
+ h->intr[h->intr_mode], dac ? "" : " not");
if (cciss_allocate_cmd_pool(h))
goto clean4;
@@ -5056,7 +5095,7 @@ reinit_after_soft_reset:
spin_lock_irqsave(&h->lock, flags);
h->access.set_intr_mask(h, CCISS_INTR_OFF);
spin_unlock_irqrestore(&h->lock, flags);
- free_irq(h->intr[PERF_MODE_INT], h);
+ free_irq(h->intr[h->intr_mode], h);
rc = cciss_request_irq(h, cciss_msix_discard_completions,
cciss_intx_discard_completions);
if (rc) {
@@ -5126,6 +5165,7 @@ reinit_after_soft_reset:
h->cciss_max_sectors = 8192;
rebuild_lun_table(h, 1, 0);
+ cciss_engage_scsi(h);
h->busy_initializing = 0;
return 1;
@@ -5133,7 +5173,7 @@ clean4:
cciss_free_cmd_pool(h);
cciss_free_scatterlists(h);
cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
- free_irq(h->intr[PERF_MODE_INT], h);
+ free_irq(h->intr[h->intr_mode], h);
clean2:
unregister_blkdev(h->major, h->devname);
clean1:
@@ -5172,9 +5212,31 @@ static void cciss_shutdown(struct pci_dev *pdev)
if (return_code != IO_OK)
dev_warn(&h->pdev->dev, "Error flushing cache\n");
h->access.set_intr_mask(h, CCISS_INTR_OFF);
- free_irq(h->intr[PERF_MODE_INT], h);
+ free_irq(h->intr[h->intr_mode], h);
}
+static int __devinit cciss_enter_simple_mode(struct ctlr_info *h)
+{
+ u32 trans_support;
+
+ trans_support = readl(&(h->cfgtable->TransportSupport));
+ if (!(trans_support & SIMPLE_MODE))
+ return -ENOTSUPP;
+
+ h->max_commands = readl(&(h->cfgtable->CmdsOutMax));
+ writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest));
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ cciss_wait_for_mode_change_ack(h);
+ print_cfg_table(h);
+ if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) {
+ dev_warn(&h->pdev->dev, "unable to get board into simple mode\n");
+ return -ENODEV;
+ }
+ h->transMethod = CFGTBL_Trans_Simple;
+ return 0;
+}
+
+
static void __devexit cciss_remove_one(struct pci_dev *pdev)
{
ctlr_info_t *h;
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index c049548e68b7..7fda30e4a241 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -92,6 +92,7 @@ struct ctlr_info
unsigned int intr[4];
unsigned int msix_vector;
unsigned int msi_vector;
+ int intr_mode;
int cciss_max_sectors;
BYTE cciss_read;
BYTE cciss_write;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 951a4e33b92b..da3311129a0c 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -763,16 +763,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
{
case CMD_TARGET_STATUS:
/* Pass it up to the upper layers... */
- if( ei->ScsiStatus)
- {
-#if 0
- printk(KERN_WARNING "cciss: cmd %p "
- "has SCSI Status = %x\n",
- c, ei->ScsiStatus);
-#endif
- cmd->result |= (ei->ScsiStatus << 1);
- }
- else { /* scsi status is zero??? How??? */
+ if (!ei->ScsiStatus) {
/* Ordinarily, this case should never happen, but there is a bug
in some released firmware revisions that allows it to happen
@@ -804,6 +795,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
}
break;
case CMD_PROTOCOL_ERR:
+ cmd->result = DID_ERROR << 16;
dev_warn(&h->pdev->dev,
"%p has protocol error\n", c);
break;
@@ -866,6 +858,7 @@ cciss_scsi_detect(ctlr_info_t *h)
sh->can_queue = cciss_tape_cmds;
sh->sg_tablesize = h->maxsgentries;
sh->max_cmd_len = MAX_COMMAND_SIZE;
+ sh->max_sectors = h->cciss_max_sectors;
((struct cciss_scsi_adapter_data_t *)
h->scsi_ctlr)->scsi_host = sh;
@@ -1410,7 +1403,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
/* track how many SG entries we are using */
if (request_nsgs > h->maxSG)
h->maxSG = request_nsgs;
- c->Header.SGTotal = (__u8) request_nsgs + chained;
+ c->Header.SGTotal = (u16) request_nsgs + chained;
if (request_nsgs > h->max_cmd_sgentries)
c->Header.SGList = h->max_cmd_sgentries;
else
@@ -1720,5 +1713,6 @@ static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
/* If no tape support, then these become defined out of existence */
#define cciss_scsi_setup(cntl_num)
+#define cciss_engage_scsi(h)
#endif /* CONFIG_CISS_SCSI_TAPE */
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index b2fceb53e809..9125bbeacd4d 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -620,6 +620,7 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
}
vendor_id = pdev->vendor;
device_id = pdev->device;
+ revision = pdev->revision;
irq = pdev->irq;
for(i=0; i<6; i++)
@@ -632,7 +633,6 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
}
pci_read_config_word(pdev, PCI_COMMAND, &command);
- pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision);
pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size);
pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer);
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index cf0e63dd97da..3fbef018ce55 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -65,39 +65,80 @@ struct drbd_atodb_wait {
int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+void *drbd_md_get_buffer(struct drbd_conf *mdev)
+{
+ int r;
+
+ wait_event(mdev->misc_wait,
+ (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
+ mdev->state.disk <= D_FAILED);
+
+ return r ? NULL : page_address(mdev->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_conf *mdev)
+{
+ if (atomic_dec_and_test(&mdev->md_io_in_use))
+ wake_up(&mdev->misc_wait);
+}
+
+static bool md_io_allowed(struct drbd_conf *mdev)
+{
+ enum drbd_disk_state ds = mdev->state.disk;
+ return ds >= D_NEGOTIATING || ds == D_ATTACHING;
+}
+
+void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+ unsigned int *done)
+{
+ long dt = bdev->dc.disk_timeout * HZ / 10;
+ if (dt == 0)
+ dt = MAX_SCHEDULE_TIMEOUT;
+
+ dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
+ if (dt == 0)
+ dev_err(DEV, "meta-data IO operation timed out\n");
+}
+
static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev,
struct page *page, sector_t sector,
int rw, int size)
{
struct bio *bio;
- struct drbd_md_io md_io;
int ok;
- md_io.mdev = mdev;
- init_completion(&md_io.event);
- md_io.error = 0;
+ mdev->md_io.done = 0;
+ mdev->md_io.error = -ENODEV;
if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
rw |= REQ_FUA | REQ_FLUSH;
rw |= REQ_SYNC;
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc_drbd(GFP_NOIO);
bio->bi_bdev = bdev->md_bdev;
bio->bi_sector = sector;
ok = (bio_add_page(bio, page, size, 0) == size);
if (!ok)
goto out;
- bio->bi_private = &md_io;
+ bio->bi_private = &mdev->md_io;
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
+ if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
+ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+ ok = 0;
+ goto out;
+ }
+
+ bio_get(bio); /* one bio_put() is in the completion handler */
+ atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_endio(bio, -EIO);
else
submit_bio(rw, bio);
- wait_for_completion(&md_io.event);
- ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+ wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
+ ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
out:
bio_put(bio);
@@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
int offset = 0;
struct page *iop = mdev->md_io_page;
- D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+ D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
BUG_ON(!bdev->md_bdev);
@@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
return 1;
}
- mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
- buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+ buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
+ if (!buffer) {
+ dev_err(DEV, "disk failed while waiting for md_io buffer\n");
+ complete(&((struct update_al_work *)w)->event);
+ put_ldev(mdev);
+ return 1;
+ }
buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
@@ -365,7 +411,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
- drbd_chk_io_error(mdev, 1, true);
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
if (++mdev->al_tr_pos >
div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
@@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
mdev->al_tr_number++;
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
@@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
/* lock out all other meta data io for now,
* and make sure the page is mapped.
*/
- mutex_lock(&mdev->md_io_mutex);
- buffer = page_address(mdev->md_io_page);
+ buffer = drbd_md_get_buffer(mdev);
+ if (!buffer)
+ return 0;
/* Find the valid transaction in the log */
for (i = 0; i <= mx; i++) {
@@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
if (rv == 0)
continue;
if (rv == -1) {
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
return 0;
}
cnr = be32_to_cpu(buffer->tr_number);
@@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
if (!found_valid) {
dev_warn(DEV, "No usable activity log found.\n");
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
return 1;
}
@@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
rv = drbd_al_read_tr(mdev, bdev, buffer, i);
ERR_IF(rv == 0) goto cancel;
if (rv == -1) {
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
return 0;
}
@@ -534,7 +581,7 @@ cancel:
mdev->al_tr_pos = 0;
/* ok, we are done with it */
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
transactions, active_extents);
@@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
else
ext->rs_failed += count;
if (ext->rs_left < ext->rs_failed) {
- dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
- "rs_failed=%d count=%d\n",
+ dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+ "rs_failed=%d count=%d cstate=%s\n",
(unsigned long long)sector,
ext->lce.lc_number, ext->rs_left,
- ext->rs_failed, count);
- dump_stack();
-
- lc_put(mdev->resync, &ext->lce);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return;
+ ext->rs_failed, count,
+ drbd_conn_str(mdev->state.conn));
+
+ /* We don't expect to be able to clear more bits
+ * than have been set when we originally counted
+ * the set bits to cache that value in ext->rs_left.
+ * Whatever the reason (disconnect during resync,
+ * delayed local completion of an application write),
+ * try to fix it up by recounting here. */
+ ext->rs_left = drbd_bm_e_weight(mdev, enr);
}
} else {
/* Normally this element should be in the cache,
@@ -825,7 +876,11 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
unsigned int enr, count = 0;
struct lc_element *e;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ /* this should be an empty REQ_FLUSH */
+ if (size == 0)
+ return 0;
+
+ if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
return 0;
@@ -1192,6 +1247,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
put_ldev(mdev);
}
spin_unlock_irq(&mdev->al_lock);
+ wake_up(&mdev->al_wait);
return 0;
}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 7b976296b564..d84566496746 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
static void bm_store_page_idx(struct page *page, unsigned long idx)
{
BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
- page_private(page) |= idx;
+ set_page_private(page, idx);
}
static unsigned long bm_page_to_idx(struct page *page)
@@ -289,25 +289,25 @@ static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
return page_nr;
}
-static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
struct page *page = b->bm_pages[idx];
- return (unsigned long *) kmap_atomic(page, km);
+ return (unsigned long *) kmap_atomic(page);
}
static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
- return __bm_map_pidx(b, idx, KM_IRQ1);
+ return __bm_map_pidx(b, idx);
}
-static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+static void __bm_unmap(unsigned long *p_addr)
{
- kunmap_atomic(p_addr, km);
+ kunmap_atomic(p_addr);
};
static void bm_unmap(unsigned long *p_addr)
{
- return __bm_unmap(p_addr, KM_IRQ1);
+ return __bm_unmap(p_addr);
}
/* long word offset of _bitmap_ sector */
@@ -378,15 +378,14 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
* thread. As we have no disk yet, we are not in the IO path,
* not even the IO path of the peer. */
bytes = sizeof(struct page *)*want;
- new_pages = kmalloc(bytes, GFP_KERNEL);
+ new_pages = kzalloc(bytes, GFP_KERNEL);
if (!new_pages) {
- new_pages = vmalloc(bytes);
+ new_pages = vzalloc(bytes);
if (!new_pages)
return NULL;
vmalloced = 1;
}
- memset(new_pages, 0, bytes);
if (want >= have) {
for (i = 0; i < have; i++)
new_pages[i] = old_pages[i];
@@ -544,15 +543,15 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
/* all but last page */
for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
- p_addr = __bm_map_pidx(b, idx, KM_USER0);
+ p_addr = __bm_map_pidx(b, idx);
for (i = 0; i < LWPP; i++)
bits += hweight_long(p_addr[i]);
- __bm_unmap(p_addr, KM_USER0);
+ __bm_unmap(p_addr);
cond_resched();
}
/* last (or only) page */
last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
- p_addr = __bm_map_pidx(b, idx, KM_USER0);
+ p_addr = __bm_map_pidx(b, idx);
for (i = 0; i < last_word; i++)
bits += hweight_long(p_addr[i]);
p_addr[last_word] &= cpu_to_lel(mask);
@@ -560,7 +559,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
/* 32bit arch, may have an unused padding long */
if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
p_addr[last_word+1] = 0;
- __bm_unmap(p_addr, KM_USER0);
+ __bm_unmap(p_addr);
return bits;
}
@@ -887,12 +886,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
struct bm_aio_ctx {
struct drbd_conf *mdev;
atomic_t in_flight;
- struct completion done;
+ unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
+#define BM_WRITE_ALL_PAGES 2
int error;
+ struct kref kref;
};
+static void bm_aio_ctx_destroy(struct kref *kref)
+{
+ struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+ put_ldev(ctx->mdev);
+ kfree(ctx);
+}
+
/* bv_page may be a copy, or may be the original */
static void bm_async_io_complete(struct bio *bio, int error)
{
@@ -931,20 +940,21 @@ static void bm_async_io_complete(struct bio *bio, int error)
bm_page_unlock_io(mdev, idx);
- /* FIXME give back to page pool */
if (ctx->flags & BM_AIO_COPY_PAGES)
- put_page(bio->bi_io_vec[0].bv_page);
+ mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
bio_put(bio);
- if (atomic_dec_and_test(&ctx->in_flight))
- complete(&ctx->done);
+ if (atomic_dec_and_test(&ctx->in_flight)) {
+ ctx->done = 1;
+ wake_up(&mdev->misc_wait);
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+ }
}
static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
{
- /* we are process context. we always get a bio */
- struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+ struct bio *bio = bio_alloc_drbd(GFP_NOIO);
struct drbd_conf *mdev = ctx->mdev;
struct drbd_bitmap *b = mdev->bitmap;
struct page *page;
@@ -967,21 +977,21 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
bm_set_page_unchanged(b->bm_pages[page_nr]);
if (ctx->flags & BM_AIO_COPY_PAGES) {
- /* FIXME alloc_page is good enough for now, but actually needs
- * to use pre-allocated page pool */
void *src, *dest;
- page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
- dest = kmap_atomic(page, KM_USER0);
- src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+ page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+ dest = kmap_atomic(page);
+ src = kmap_atomic(b->bm_pages[page_nr]);
memcpy(dest, src, PAGE_SIZE);
- kunmap_atomic(src, KM_USER1);
- kunmap_atomic(dest, KM_USER0);
+ kunmap_atomic(src);
+ kunmap_atomic(dest);
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
bio->bi_bdev = mdev->ldev->md_bdev;
bio->bi_sector = on_disk_sector;
+ /* bio_add_page of a single page to an empty bio will always succeed,
+ * according to api. Do we want to assert that? */
bio_add_page(bio, page, len, 0);
bio->bi_private = ctx;
bio->bi_end_io = bm_async_io_complete;
@@ -1000,14 +1010,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
/*
* bm_rw: read/write the whole bitmap from/to its on disk location.
*/
-static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
{
- struct bm_aio_ctx ctx = {
- .mdev = mdev,
- .in_flight = ATOMIC_INIT(1),
- .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
- .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
- };
+ struct bm_aio_ctx *ctx;
struct drbd_bitmap *b = mdev->bitmap;
int num_pages, i, count = 0;
unsigned long now;
@@ -1022,7 +1027,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
* For lazy writeout, we don't care for ongoing changes to the bitmap,
* as we submit copies of pages anyways.
*/
- if (!ctx.flags)
+
+ ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
+
+ *ctx = (struct bm_aio_ctx) {
+ .mdev = mdev,
+ .in_flight = ATOMIC_INIT(1),
+ .done = 0,
+ .flags = flags,
+ .error = 0,
+ .kref = { ATOMIC_INIT(2) },
+ };
+
+ if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
+ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+ kfree(ctx);
+ return -ENODEV;
+ }
+
+ if (!ctx->flags)
WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
num_pages = b->bm_number_of_pages;
@@ -1035,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (rw & WRITE) {
- if (bm_test_page_unchanged(b->bm_pages[i])) {
+ if (!(flags & BM_WRITE_ALL_PAGES) &&
+ bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
continue;
}
@@ -1047,29 +1073,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
continue;
}
}
- atomic_inc(&ctx.in_flight);
- bm_page_io_async(&ctx, i, rw);
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i, rw);
++count;
cond_resched();
}
/*
- * We initialize ctx.in_flight to one to make sure bm_async_io_complete
- * will not complete() early, and decrement / test it here. If there
+ * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+ * will not set ctx->done early, and decrement / test it here. If there
* are still some bios in flight, we need to wait for them here.
+ * If all IO is done already (or nothing had been submitted), there is
+ * no need to wait. Still, we need to put the kref associated with the
+ * "in_flight reached zero, all done" event.
*/
- if (!atomic_dec_and_test(&ctx.in_flight))
- wait_for_completion(&ctx.done);
+ if (!atomic_dec_and_test(&ctx->in_flight))
+ wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
+ else
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
- if (ctx.error) {
+ if (ctx->error) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
- drbd_chk_io_error(mdev, 1, true);
- err = -EIO; /* ctx.error ? */
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+ err = -EIO; /* ctx->error ? */
}
+ if (atomic_read(&ctx->in_flight))
+ err = -EIO; /* Disk failed during IO... */
+
now = jiffies;
if (rw == WRITE) {
drbd_md_flush(mdev);
@@ -1083,6 +1118,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
}
@@ -1092,7 +1128,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
*/
int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
{
- return bm_rw(mdev, READ, 0);
+ return bm_rw(mdev, READ, 0, 0);
}
/**
@@ -1103,7 +1139,18 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
*/
int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
{
- return bm_rw(mdev, WRITE, 0);
+ return bm_rw(mdev, WRITE, 0, 0);
+}
+
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev: DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+ return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
}
/**
@@ -1113,7 +1160,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
*/
int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
{
- return bm_rw(mdev, WRITE, upper_idx);
+ return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @mdev: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
+{
+ return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}
@@ -1131,28 +1194,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l
*/
int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
{
- struct bm_aio_ctx ctx = {
+ struct bm_aio_ctx *ctx;
+ int err;
+
+ if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+ dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
+ return 0;
+ }
+
+ ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
+
+ *ctx = (struct bm_aio_ctx) {
.mdev = mdev,
.in_flight = ATOMIC_INIT(1),
- .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
+ .done = 0,
.flags = BM_AIO_COPY_PAGES,
+ .error = 0,
+ .kref = { ATOMIC_INIT(2) },
};
- if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
- dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
- return 0;
+ if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
+ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+ kfree(ctx);
+ return -ENODEV;
}
- bm_page_io_async(&ctx, idx, WRITE_SYNC);
- wait_for_completion(&ctx.done);
+ bm_page_io_async(ctx, idx, WRITE_SYNC);
+ wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
- if (ctx.error)
- drbd_chk_io_error(mdev, 1, true);
+ if (ctx->error)
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
/* that should force detach, so the in memory bitmap will be
* gone in a moment as well. */
mdev->bm_writ_cnt++;
- return ctx.error;
+ err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+ return err;
}
/* NOTE
@@ -1164,7 +1244,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc
* this returns a bit number, NOT a sector!
*/
static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
- const int find_zero_bit, const enum km_type km)
+ const int find_zero_bit)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr;
@@ -1179,7 +1259,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
while (bm_fo < b->bm_bits) {
/* bit offset of the first bit in the page */
bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
- p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
+ p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
if (find_zero_bit)
i = find_next_zero_bit_le(p_addr,
@@ -1188,7 +1268,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
i = find_next_bit_le(p_addr,
PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
- __bm_unmap(p_addr, km);
+ __bm_unmap(p_addr);
if (i < PAGE_SIZE*8) {
bm_fo = bit_offset + i;
if (bm_fo >= b->bm_bits)
@@ -1216,7 +1296,7 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
if (BM_DONT_TEST & b->bm_flags)
bm_print_lock_info(mdev);
- i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+ i = __bm_find_next(mdev, bm_fo, find_zero_bit);
spin_unlock_irq(&b->bm_lock);
return i;
@@ -1240,13 +1320,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo
unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
- return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+ return __bm_find_next(mdev, bm_fo, 0);
}
unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
- return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+ return __bm_find_next(mdev, bm_fo, 1);
}
/* returns number of bits actually changed.
@@ -1274,14 +1354,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
if (page_nr != last_page_nr) {
if (p_addr)
- __bm_unmap(p_addr, KM_IRQ1);
+ __bm_unmap(p_addr);
if (c < 0)
bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
else if (c > 0)
bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
changed_total += c;
c = 0;
- p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
+ p_addr = __bm_map_pidx(b, page_nr);
last_page_nr = page_nr;
}
if (val)
@@ -1290,7 +1370,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
}
if (p_addr)
- __bm_unmap(p_addr, KM_IRQ1);
+ __bm_unmap(p_addr);
if (c < 0)
bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
else if (c > 0)
@@ -1343,13 +1423,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
- unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
b->bm_set += BITS_PER_LONG - bits;
}
- kunmap_atomic(paddr, KM_IRQ1);
+ kunmap_atomic(paddr);
}
/* Same thing as drbd_bm_set_bits,
@@ -1408,10 +1488,17 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
first_word = 0;
spin_lock_irq(&b->bm_lock);
}
-
/* last page (respectively only page, for first page == last page) */
last_word = MLPP(el >> LN2_BPL);
- bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+ /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+ * ==> e = 32767, el = 32768, last_page = 2,
+ * and now last_word = 0.
+ * We do not want to touch last_page in this case,
+ * as we did not allocate it, it is not present in bitmap->bm_pages.
+ */
+ if (last_word)
+ bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
/* possibly trailing bits.
* example: (e & 63) == 63, el will be e+1.
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index ef2ceed3be4b..b953cc7c9c00 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -28,7 +28,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/bitops.h>
@@ -60,8 +59,8 @@
/* module parameter, defined in drbd_main.c */
extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
+extern bool disable_sendpage;
+extern bool allow_oos;
extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
@@ -713,7 +712,6 @@ struct drbd_request {
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
unsigned long rq_state; /* see comments above _req_mod() */
- int seq_num;
unsigned long start_time;
};
@@ -815,7 +813,6 @@ enum {
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
- UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
@@ -826,7 +823,6 @@ enum {
CRASHED_PRIMARY, /* This node was a crashed primary.
* Gets cleared when the state.conn
* goes into C_CONNECTED state. */
- NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
CONSIDER_RESYNC,
MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
@@ -836,6 +832,7 @@ enum {
BITMAP_IO_QUEUED, /* Started bitmap IO */
GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
WAS_IO_ERROR, /* Local disk failed returned IO error */
+ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
@@ -852,6 +849,14 @@ enum {
NEW_CUR_UUID, /* Create new current UUID when thawing IO */
AL_SUSPENDED, /* Activity logging is currently suspended. */
AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
+ STATE_SENT, /* Do not change state/UUIDs while this is set */
+
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
};
struct drbd_bitmap; /* opaque for drbd_conf */
@@ -863,31 +868,30 @@ enum bm_flag {
BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
/* currently locked for bulk operation */
- BM_LOCKED_MASK = 0x7,
+ BM_LOCKED_MASK = 0xf,
/* in detail, that is: */
BM_DONT_CLEAR = 0x1,
BM_DONT_SET = 0x2,
BM_DONT_TEST = 0x4,
+ /* so we can mark it locked for bulk operation,
+ * and still allow all non-bulk operations */
+ BM_IS_LOCKED = 0x8,
+
/* (test bit, count bit) allowed (common case) */
- BM_LOCKED_TEST_ALLOWED = 0x3,
+ BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
/* testing bits, as well as setting new bits allowed, but clearing bits
* would be unexpected. Used during bitmap receive. Setting new bits
* requires sending of "out-of-sync" information, though. */
- BM_LOCKED_SET_ALLOWED = 0x1,
+ BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
- /* clear is not expected while bitmap is locked for bulk operation */
+ /* for drbd_bm_write_copy_pages, everything is allowed,
+ * only concurrent bulk operations are locked out. */
+ BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
};
-
-/* TODO sort members for performance
- * MAYBE group them further */
-
-/* THINK maybe we actually want to use the default "event/%s" worker threads
- * or similar in linux 2.6, which uses per cpu data and threads.
- */
struct drbd_work_queue {
struct list_head q;
struct semaphore s; /* producers up it, worker down()s it */
@@ -928,7 +932,7 @@ struct drbd_md {
#define NL_INT64(pn,pr,member) __u64 member;
#define NL_BIT(pn,pr,member) unsigned member:1;
#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
-#include "linux/drbd_nl.h"
+#include <linux/drbd_nl.h>
struct drbd_backing_dev {
struct block_device *backing_bdev;
@@ -939,8 +943,7 @@ struct drbd_backing_dev {
};
struct drbd_md_io {
- struct drbd_conf *mdev;
- struct completion event;
+ unsigned int done;
int error;
};
@@ -1023,6 +1026,7 @@ struct drbd_conf {
struct drbd_tl_epoch *newest_tle;
struct drbd_tl_epoch *oldest_tle;
struct list_head out_of_sequence_requests;
+ struct list_head barrier_acked_requests;
struct hlist_head *tl_hash;
unsigned int tl_hash_s;
@@ -1057,6 +1061,8 @@ struct drbd_conf {
struct crypto_hash *csums_tfm;
struct crypto_hash *verify_tfm;
+ unsigned long last_reattach_jif;
+ unsigned long last_reconnect_jif;
struct drbd_thread receiver;
struct drbd_thread worker;
struct drbd_thread asender;
@@ -1095,7 +1101,8 @@ struct drbd_conf {
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
- struct mutex md_io_mutex; /* protects the md_io_buffer */
+ struct drbd_md_io md_io;
+ atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
spinlock_t al_lock;
wait_queue_head_t al_wait;
struct lru_cache *act_log; /* activity log */
@@ -1129,8 +1136,8 @@ struct drbd_conf {
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
int rs_planed; /* resync sectors already planned */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
- int peer_max_bio_size;
- int local_max_bio_size;
+ unsigned int peer_max_bio_size;
+ unsigned int local_max_bio_size;
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1229,8 +1236,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
-extern int _drbd_send_state(struct drbd_conf *mdev);
-extern int drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags);
@@ -1434,9 +1441,9 @@ struct bm_extent {
* hash table. */
#define HT_SHIFT 8
#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
-#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
-#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
@@ -1462,6 +1469,8 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
unsigned long al_enr);
extern size_t drbd_bm_words(struct drbd_conf *mdev);
@@ -1494,11 +1503,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
extern mempool_t *drbd_ee_mempool;
-extern struct page *drbd_pp_pool; /* drbd's page pool */
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
extern spinlock_t drbd_pp_lock;
extern int drbd_pp_vacant;
extern wait_queue_head_t drbd_pp_wait;
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES 128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
extern rwlock_t global_state_lock;
extern struct drbd_conf *drbd_new_device(unsigned int minor);
@@ -1507,7 +1543,7 @@ extern void drbd_free_mdev(struct drbd_conf *mdev);
extern int proc_details;
/* drbd_req */
-extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
@@ -1537,8 +1573,12 @@ extern void resume_next_sg(struct drbd_conf *mdev);
extern void suspend_other_sg(struct drbd_conf *mdev);
extern int drbd_resync_finished(struct drbd_conf *mdev);
/* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
+extern void drbd_md_put_buffer(struct drbd_conf *mdev);
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev, sector_t sector, int rw);
+ struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+ unsigned int *done);
extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
@@ -1755,19 +1795,6 @@ static inline struct page *page_chain_next(struct page *page)
#define page_chain_for_each_safe(page, n) \
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (page_count(bvec->bv_page) > 1)
- return 1;
- }
-
- return 0;
-}
-
static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
{
struct page *page = e->pages;
@@ -1778,7 +1805,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
return 0;
}
-
static inline void drbd_state_lock(struct drbd_conf *mdev)
{
wait_event(mdev->misc_wait,
@@ -1821,12 +1847,20 @@ static inline int drbd_request_state(struct drbd_conf *mdev,
return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
}
+enum drbd_force_detach_flags {
+ DRBD_IO_ERROR,
+ DRBD_META_IO_ERROR,
+ DRBD_FORCE_DETACH,
+};
+
#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
-static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
+ enum drbd_force_detach_flags forcedetach,
+ const char *where)
{
switch (mdev->ldev->dc.on_io_error) {
case EP_PASS_ON:
- if (!forcedetach) {
+ if (forcedetach == DRBD_IO_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where);
if (mdev->state.disk > D_INCONSISTENT)
@@ -1837,6 +1871,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
case EP_DETACH:
case EP_CALL_HELPER:
set_bit(WAS_IO_ERROR, &mdev->flags);
+ if (forcedetach == DRBD_FORCE_DETACH)
+ set_bit(FORCE_DETACH, &mdev->flags);
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
dev_err(DEV,
@@ -1856,7 +1892,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
*/
#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
- int error, int forcedetach, const char *where)
+ int error, enum drbd_force_detach_flags forcedetach, const char *where)
{
if (error) {
unsigned long flags;
@@ -2231,7 +2267,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
* Note: currently we don't support such large bitmaps on 32bit
* arch anyways, but no harm done to be prepared for it here.
*/
- unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
+ unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
unsigned long left = *bits_left >> shift;
unsigned long total = 1UL + (mdev->rs_total >> shift);
unsigned long tmp = 1000UL - left * 1000UL/total;
@@ -2307,12 +2343,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
case D_OUTDATED:
case D_CONSISTENT:
case D_UP_TO_DATE:
+ case D_FAILED:
/* disk state is stable as well. */
break;
/* no new io accepted during tansitional states */
case D_ATTACHING:
- case D_FAILED:
case D_NEGOTIATING:
case D_UNKNOWN:
case D_MASK:
@@ -2386,15 +2422,17 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
D_ASSERT(ap_bio >= 0);
+
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ }
+
/* this currently does wake_up for every dec_ap_bio!
* maybe rather introduce some type of hysteresis?
* e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
if (ap_bio < mxb)
wake_up(&mdev->misc_wait);
- if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
- if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
- }
}
static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0358e55356c8..f93a0320e952 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
@@ -117,8 +118,8 @@ module_param(fault_devs, int, 0644);
/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
-int disable_sendpage;
-int allow_oos;
+bool disable_sendpage;
+bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/
@@ -139,6 +140,8 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
/* I do not use a standard mempool, because:
1) I want to hand out the pre-allocated objects first.
@@ -159,7 +162,24 @@ static const struct block_device_operations drbd_ops = {
.release = drbd_release,
};
-#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+static void bio_destructor_drbd(struct bio *bio)
+{
+ bio_free(bio, drbd_md_io_bio_set);
+}
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+ struct bio *bio;
+
+ if (!drbd_md_io_bio_set)
+ return bio_alloc(gfp_mask, 1);
+
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ bio->bi_destructor = bio_destructor_drbd;
+ return bio;
+}
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
@@ -208,6 +228,7 @@ static int tl_init(struct drbd_conf *mdev)
mdev->oldest_tle = b;
mdev->newest_tle = b;
INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+ INIT_LIST_HEAD(&mdev->barrier_acked_requests);
mdev->tl_hash = NULL;
mdev->tl_hash_s = 0;
@@ -246,9 +267,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
new->n_writes = 0;
newest_before = mdev->newest_tle;
- /* never send a barrier number == 0, because that is special-cased
- * when using TCQ for our write ordering code */
- new->br_number = (newest_before->br_number+1) ?: 1;
+ new->br_number = newest_before->br_number+1;
if (mdev->newest_tle != new) {
mdev->newest_tle->next = new;
mdev->newest_tle = new;
@@ -311,7 +330,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
These have been list_move'd to the out_of_sequence_requests list in
_req_mod(, barrier_acked) above.
*/
- list_del_init(&b->requests);
+ list_splice_init(&b->requests, &mdev->barrier_acked_requests);
nob = b->next;
if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
@@ -411,6 +430,14 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
b = tmp;
list_splice(&carry_reads, &b->requests);
}
+
+ /* Actions operating on the disk state, also want to work on
+ requests that got barrier acked. */
+
+ list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ _req_mod(req, what);
+ }
}
@@ -424,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
*/
void tl_clear(struct drbd_conf *mdev)
{
+ spin_lock_irq(&mdev->req_lock);
+ _tl_clear(mdev);
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
+{
struct list_head *le, *tle;
struct drbd_request *r;
- spin_lock_irq(&mdev->req_lock);
-
_tl_restart(mdev, connection_lost_while_pending);
/* we expect this list to be empty. */
@@ -447,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
- spin_unlock_irq(&mdev->req_lock);
}
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -458,6 +489,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
}
/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev: DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_conf *mdev)
+{
+ struct drbd_tl_epoch *b;
+ struct list_head *le, *tle;
+ struct drbd_request *req;
+
+ spin_lock_irq(&mdev->req_lock);
+ b = mdev->oldest_tle;
+ while (b) {
+ list_for_each_safe(le, tle, &b->requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ _req_mod(req, abort_disk_io);
+ }
+ b = b->next;
+ }
+
+ list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ _req_mod(req, abort_disk_io);
+ }
+
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
* cl_wide_st_chg() - true if the state change is a cluster wide one
* @mdev: DRBD device.
* @os: old (current) state.
@@ -470,7 +533,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
(os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
(os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
- (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+ (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}
@@ -509,8 +572,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
union drbd_state,
union drbd_state);
+enum sanitize_state_warnings {
+ NO_WARNING,
+ ABORTED_ONLINE_VERIFY,
+ ABORTED_RESYNC,
+ CONNECTION_LOST_NEGOTIATING,
+ IMPLICITLY_UPGRADED_DISK,
+ IMPLICITLY_UPGRADED_PDSK,
+};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, const char **warn_sync_abort);
+ union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
union drbd_state, union drbd_state);
@@ -785,6 +856,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
rv = SS_IN_TRANSIENT_STATE;
+ /* While establishing a connection only allow cstate to change.
+ Delay/refuse role changes, detach attach etc... */
+ if (test_bit(STATE_SENT, &mdev->flags) &&
+ !(os.conn == C_WF_REPORT_PARAMS ||
+ (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+ rv = SS_IN_TRANSIENT_STATE;
+
if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
rv = SS_NEED_CONNECTION;
@@ -803,6 +881,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
return rv;
}
+static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
+{
+ static const char *msg_table[] = {
+ [NO_WARNING] = "",
+ [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+ [ABORTED_RESYNC] = "Resync aborted.",
+ [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+ [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+ [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+ };
+
+ if (warn != NO_WARNING)
+ dev_warn(DEV, "%s\n", msg_table[warn]);
+}
+
/**
* sanitize_state() - Resolves implicitly necessary additional changes to a state transition
* @mdev: DRBD device.
@@ -814,11 +907,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
* to D_UNKNOWN. This rule and many more along those lines are in this function.
*/
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, const char **warn_sync_abort)
+ union drbd_state ns, enum sanitize_state_warnings *warn)
{
enum drbd_fencing_p fp;
enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+ if (warn)
+ *warn = NO_WARNING;
+
fp = FP_DONT_CARE;
if (get_ldev(mdev)) {
fp = mdev->ldev->dc.fencing;
@@ -833,18 +929,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
* If you try to go into some Sync* state, that shall fail (elsewhere). */
if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
+ ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
ns.conn = os.conn;
/* we cannot fail (again) if we already detached */
if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
ns.disk = D_DISKLESS;
- /* if we are only D_ATTACHING yet,
- * we can (and should) go directly to D_DISKLESS. */
- if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
- ns.disk = D_DISKLESS;
-
/* After C_DISCONNECTING only C_STANDALONE may follow */
if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
ns.conn = os.conn;
@@ -863,10 +954,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
/* Abort resync if a disk fails/detaches */
if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
(ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
- if (warn_sync_abort)
- *warn_sync_abort =
- os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
- "Online-verify" : "Resync";
+ if (warn)
+ *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
ns.conn = C_CONNECTED;
}
@@ -877,7 +967,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
ns.disk = mdev->new_state_tmp.disk;
ns.pdsk = mdev->new_state_tmp.pdsk;
} else {
- dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+ if (warn)
+ *warn = CONNECTION_LOST_NEGOTIATING;
ns.disk = D_DISKLESS;
ns.pdsk = D_UNKNOWN;
}
@@ -959,16 +1050,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
ns.disk = disk_max;
if (ns.disk < disk_min) {
- dev_warn(DEV, "Implicitly set disk from %s to %s\n",
- drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_DISK;
ns.disk = disk_min;
}
if (ns.pdsk > pdsk_max)
ns.pdsk = pdsk_max;
if (ns.pdsk < pdsk_min) {
- dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
- drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_PDSK;
ns.pdsk = pdsk_min;
}
@@ -1045,12 +1136,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
{
union drbd_state os;
enum drbd_state_rv rv = SS_SUCCESS;
- const char *warn_sync_abort = NULL;
+ enum sanitize_state_warnings ssw;
struct after_state_chg_work *ascw;
os = mdev->state;
- ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+ ns = sanitize_state(mdev, os, ns, &ssw);
if (ns.i == os.i)
return SS_NOTHING_TO_DO;
@@ -1076,8 +1167,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
return rv;
}
- if (warn_sync_abort)
- dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
+ print_sanitize_warnings(mdev, ssw);
{
char *pbp, pb[300];
@@ -1243,7 +1333,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
drbd_thread_stop_nowait(&mdev->receiver);
/* Upon network failure, we need to restart the receiver. */
- if (os.conn > C_TEAR_DOWN &&
+ if (os.conn > C_WF_CONNECTION &&
ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
drbd_thread_restart_nowait(&mdev->receiver);
@@ -1251,6 +1341,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
drbd_resume_al(mdev);
+ /* remember last connect and attach times so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
+ mdev->last_reconnect_jif = jiffies;
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
+ mdev->last_reattach_jif = jiffies;
+
ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
if (ascw) {
ascw->os = os;
@@ -1354,12 +1453,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* Here we have the actions that are performed after a
state change. This function might sleep */
+ if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
+ mod_timer(&mdev->request_timer, jiffies + HZ);
+
nsm.i = -1;
if (ns.susp_nod) {
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
what = resend;
- if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
what = restart_frozen_disk_io;
if (what != nothing)
@@ -1369,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
if (ns.susp_fen) {
/* case1: The outdate peer handler is successful: */
if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
- tl_clear(mdev);
if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
drbd_uuid_new_current(mdev);
clear_bit(NEW_CUR_UUID, &mdev->flags);
}
spin_lock_irq(&mdev->req_lock);
+ _tl_clear(mdev);
_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
}
@@ -1407,8 +1510,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* Do not change the order of the if above and the two below... */
if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
+ /* we probably will start a resync soon.
+ * make sure those things are properly reset. */
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
+ drbd_rs_cancel_all(mdev);
+
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
}
/* No point in queuing send_bitmap if we don't have a connection
* anymore, so check also the _current_ state, not only the new state
@@ -1441,11 +1551,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+ if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+ mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
drbd_uuid_new_current(mdev);
drbd_send_uuids(mdev);
}
-
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
/* We may still be Primary ourselves.
@@ -1473,14 +1583,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
drbd_send_sizes(mdev, 0, 0); /* to start sync... */
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
}
/* We want to pause/continue resync, tell peer. */
if (ns.conn >= C_CONNECTED &&
((os.aftr_isp != ns.aftr_isp) ||
(os.user_isp != ns.user_isp)))
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
/* In case one of the isp bits got set, suspend other devices. */
if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1490,10 +1600,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* Make sure the peer gets informed about eventual state
changes (ISP bits) while we were in WFReportParams. */
if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
/* We are in the progress to start a full sync... */
if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1513,37 +1623,54 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* first half of local IO error, failure to attach,
* or administrative detach */
if (os.disk != D_FAILED && ns.disk == D_FAILED) {
- enum drbd_io_error_p eh;
- int was_io_error;
+ enum drbd_io_error_p eh = EP_PASS_ON;
+ int was_io_error = 0;
/* corresponding get_ldev was in __drbd_set_state, to serialize
- * our cleanup here with the transition to D_DISKLESS,
- * so it is safe to dreference ldev here. */
- eh = mdev->ldev->dc.on_io_error;
- was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
-
- /* current state still has to be D_FAILED,
- * there is only one way out: to D_DISKLESS,
- * and that may only happen after our put_ldev below. */
- if (mdev->state.disk != D_FAILED)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s during detach\n",
- drbd_disk_str(mdev->state.disk));
-
- if (drbd_send_state(mdev))
- dev_warn(DEV, "Notified peer that I am detaching my disk\n");
- else
- dev_err(DEV, "Sending state for detaching disk failed\n");
-
- drbd_rs_cancel_all(mdev);
-
- /* In case we want to get something to stable storage still,
- * this may be the last chance.
- * Following put_ldev may transition to D_DISKLESS. */
- drbd_md_sync(mdev);
+ * our cleanup here with the transition to D_DISKLESS.
+ * But is is still not save to dreference ldev here, since
+ * we might come from an failed Attach before ldev was set. */
+ if (mdev->ldev) {
+ eh = mdev->ldev->dc.on_io_error;
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+ if (was_io_error && eh == EP_CALL_HELPER)
+ drbd_khelper(mdev, "local-io-error");
+
+ /* Immediately allow completion of all application IO,
+ * that waits for completion from the local disk,
+ * if this was a force-detach due to disk_timeout
+ * or administrator request (drbdsetup detach --force).
+ * Do NOT abort otherwise.
+ * Aborting local requests may cause serious problems,
+ * if requests are completed to upper layers already,
+ * and then later the already submitted local bio completes.
+ * This can cause DMA into former bio pages that meanwhile
+ * have been re-used for other things.
+ * So aborting local requests may cause crashes,
+ * or even worse, silent data corruption.
+ */
+ if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
+ tl_abort_disk_io(mdev);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (mdev->state.disk != D_FAILED)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(mdev->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
+
+ drbd_rs_cancel_all(mdev);
+
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(mdev);
+ }
put_ldev(mdev);
-
- if (was_io_error && eh == EP_CALL_HELPER)
- drbd_khelper(mdev, "local-io-error");
}
/* second half of local IO error, failure to attach,
@@ -1557,20 +1684,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
"ASSERT FAILED: disk is %s while going diskless\n",
drbd_disk_str(mdev->state.disk));
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
- if (drbd_send_state(mdev))
- dev_warn(DEV, "Notified peer that I'm now diskless.\n");
/* corresponding get_ldev in __drbd_set_state
* this may finally trigger drbd_ldev_destroy. */
put_ldev(mdev);
}
/* Notify peer that I had a local IO error, and did not detached.. */
- if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
- drbd_send_state(mdev);
+ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
/* Disks got bigger while they were detached */
if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1588,7 +1712,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* sync target done with resync. Explicitly notify peer, even though
* it should (at least for non-empty resyncs) already know itself. */
if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
- drbd_send_state(mdev);
+ drbd_send_state(mdev, ns);
+
+ /* Wake up role changes, that were delayed because of connection establishing */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
+ clear_bit(STATE_SENT, &mdev->flags);
+ wake_up(&mdev->state_wait);
+ }
/* This triggers bitmap writeout of potentially still unwritten pages
* if the resync finished cleanly, or aborted because of peer disk
@@ -1598,8 +1728,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
* No harm done if some bits change during this phase.
*/
if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
- drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
- "write from resync_finished", BM_LOCKED_SET_ALLOWED);
+ drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+ "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
put_ldev(mdev);
}
@@ -2057,7 +2187,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
- uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
+ uuid = mdev->ldev->md.uuid[UI_BITMAP];
+ if (uuid && uuid != UUID_JUST_CREATED)
+ uuid = uuid + UUID_NEW_BM_OFFSET;
+ else
+ get_random_bytes(&uuid, sizeof(u64));
drbd_uuid_set(mdev, UI_BITMAP, uuid);
drbd_print_uuids(mdev, "updated sync UUID");
drbd_md_sync(mdev);
@@ -2071,7 +2205,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
{
struct p_sizes p;
sector_t d_size, u_size;
- int q_order_type, max_bio_size;
+ int q_order_type;
+ unsigned int max_bio_size;
int ok;
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -2080,7 +2215,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
u_size = mdev->ldev->dc.disk_size;
q_order_type = drbd_queue_order_type(mdev);
max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
- max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
put_ldev(mdev);
} else {
d_size = 0;
@@ -2089,6 +2224,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
}
+ /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
+ if (mdev->agreed_pro_version <= 94)
+ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+
p.d_size = cpu_to_be64(d_size);
p.u_size = cpu_to_be64(u_size);
p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
@@ -2102,10 +2241,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
}
/**
- * drbd_send_state() - Sends the drbd state to the peer
+ * drbd_send_current_state() - Sends the drbd state to the peer
* @mdev: DRBD device.
*/
-int drbd_send_state(struct drbd_conf *mdev)
+int drbd_send_current_state(struct drbd_conf *mdev)
{
struct socket *sock;
struct p_state p;
@@ -2131,6 +2270,37 @@ int drbd_send_state(struct drbd_conf *mdev)
return ok;
}
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @mdev: DRBD device.
+ * @state: the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
+{
+ struct socket *sock;
+ struct p_state p;
+ int ok = 0;
+
+ mutex_lock(&mdev->data.mutex);
+
+ p.state = cpu_to_be32(state.i);
+ sock = mdev->data.socket;
+
+ if (likely(sock != NULL)) {
+ ok = _drbd_send_cmd(mdev, sock, P_STATE,
+ (struct p_header80 *)&p, sizeof(p), 0);
+ }
+
+ mutex_unlock(&mdev->data.mutex);
+
+ return ok;
+}
+
int drbd_send_state_req(struct drbd_conf *mdev,
union drbd_state mask, union drbd_state val)
{
@@ -2615,7 +2785,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
struct bio_vec *bvec;
int i;
/* hint all but last page with MSG_MORE */
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment(bvec, bio, i) {
if (!_drbd_no_send_page(mdev, bvec->bv_page,
bvec->bv_offset, bvec->bv_len,
i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2629,7 +2799,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
struct bio_vec *bvec;
int i;
/* hint all but last page with MSG_MORE */
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment(bvec, bio, i) {
if (!_drbd_send_page(mdev, bvec->bv_page,
bvec->bv_offset, bvec->bv_len,
i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2695,8 +2865,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
p.sector = cpu_to_be64(req->sector);
p.block_id = (unsigned long)req;
- p.seq_num = cpu_to_be32(req->seq_num =
- atomic_add_return(1, &mdev->packet_seq));
+ p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
@@ -2987,8 +3156,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
atomic_set(&mdev->ap_in_flight, 0);
+ atomic_set(&mdev->md_io_in_use, 0);
- mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
mutex_init(&mdev->meta.mutex);
sema_init(&mdev->data.work.s, 0);
@@ -3126,6 +3295,10 @@ static void drbd_destroy_mempools(void)
/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+ if (drbd_md_io_bio_set)
+ bioset_free(drbd_md_io_bio_set);
+ if (drbd_md_io_page_pool)
+ mempool_destroy(drbd_md_io_page_pool);
if (drbd_ee_mempool)
mempool_destroy(drbd_ee_mempool);
if (drbd_request_mempool)
@@ -3139,6 +3312,8 @@ static void drbd_destroy_mempools(void)
if (drbd_al_ext_cache)
kmem_cache_destroy(drbd_al_ext_cache);
+ drbd_md_io_bio_set = NULL;
+ drbd_md_io_page_pool = NULL;
drbd_ee_mempool = NULL;
drbd_request_mempool = NULL;
drbd_ee_cache = NULL;
@@ -3162,6 +3337,8 @@ static int drbd_create_mempools(void)
drbd_bm_ext_cache = NULL;
drbd_al_ext_cache = NULL;
drbd_pp_pool = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_md_io_bio_set = NULL;
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -3185,6 +3362,16 @@ static int drbd_create_mempools(void)
goto Enomem;
/* mempools */
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+ drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_bio_set == NULL)
+ goto Enomem;
+#endif
+
+ drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_page_pool == NULL)
+ goto Enomem;
+
drbd_request_mempool = mempool_create(number,
mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
if (drbd_request_mempool == NULL)
@@ -3262,6 +3449,8 @@ static void drbd_delete_device(unsigned int minor)
if (!mdev)
return;
+ del_timer_sync(&mdev->request_timer);
+
/* paranoia asserts */
if (mdev->open_cnt != 0)
dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
@@ -3344,9 +3533,9 @@ static void drbd_cleanup(void)
}
/**
- * drbd_congested() - Callback for pdflush
+ * drbd_congested() - Callback for the flusher thread
* @congested_data: User data
- * @bdi_bits: Bits pdflush is currently interested in
+ * @bdi_bits: Bits the BDI flusher thread is currently interested in
*
* Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
*/
@@ -3364,6 +3553,22 @@ static int drbd_congested(void *congested_data, int bdi_bits)
goto out;
}
+ if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
+ r |= (1 << BDI_async_congested);
+ /* Without good local data, we would need to read from remote,
+ * and that would need the worker thread as well, which is
+ * currently blocked waiting for that usermode helper to
+ * finish.
+ */
+ if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
+ r |= (1 << BDI_sync_congested);
+ else
+ put_ldev(mdev);
+ r &= bdi_bits;
+ reason = 'c';
+ goto out;
+ }
+
if (get_ldev(mdev)) {
q = bdev_get_queue(mdev->ldev->backing_bdev);
r = bdi_congested(&q->backing_dev_info, bdi_bits);
@@ -3427,6 +3632,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
q->backing_dev_info.congested_data = mdev;
blk_queue_make_request(q, drbd_make_request);
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
/* Setting the max_hw_sectors to an odd value of 8kibyte here
This triggers a max_bio_size message upon first attach or connect */
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
@@ -3666,8 +3872,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
if (!get_ldev_if_state(mdev, D_FAILED))
return;
- mutex_lock(&mdev->md_io_mutex);
- buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+ buffer = drbd_md_get_buffer(mdev);
+ if (!buffer)
+ goto out;
+
memset(buffer, 0, 512);
buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -3691,14 +3899,15 @@ void drbd_md_sync(struct drbd_conf *mdev)
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
- drbd_chk_io_error(mdev, 1, true);
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
}
/* Update mdev->ldev->md.la_size_sect,
* since we updated it on metadata. */
mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
+out:
put_ldev(mdev);
}
@@ -3718,8 +3927,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
if (!get_ldev_if_state(mdev, D_ATTACHING))
return ERR_IO_MD_DISK;
- mutex_lock(&mdev->md_io_mutex);
- buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+ buffer = drbd_md_get_buffer(mdev);
+ if (!buffer)
+ goto out;
if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
/* NOTE: can't do normal error processing here as this is
@@ -3769,9 +3979,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
spin_lock_irq(&mdev->req_lock);
if (mdev->state.conn < C_CONNECTED) {
- int peer;
+ unsigned int peer;
peer = be32_to_cpu(buffer->la_peer_max_bio_size);
- peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
+ peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
mdev->peer_max_bio_size = peer;
}
spin_unlock_irq(&mdev->req_lock);
@@ -3780,7 +3990,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
mdev->sync_conf.al_extents = 127;
err:
- mutex_unlock(&mdev->md_io_mutex);
+ drbd_md_put_buffer(mdev);
+ out:
put_ldev(mdev);
return rv;
@@ -4183,12 +4394,11 @@ const char *drbd_buildtag(void)
static char buildtag[38] = "\0uilt-in";
if (buildtag[0] == 0) {
-#ifdef CONFIG_MODULES
- if (THIS_MODULE != NULL)
- sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
- else
+#ifdef MODULE
+ sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+ buildtag[0] = 'b';
#endif
- buildtag[0] = 'b';
}
return buildtag;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0feab261e295..edb490aad8b4 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -94,7 +94,7 @@ static int name ## _from_tags(struct drbd_conf *mdev, \
arg->member ## _len = dlen; \
memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
break;
-#include "linux/drbd_nl.h"
+#include <linux/drbd_nl.h>
/* Generate the struct to tag_list functions */
#define NL_PACKET(name, number, fields) \
@@ -129,7 +129,7 @@ name ## _to_tags(struct drbd_conf *mdev, \
put_unaligned(arg->member ## _len, tags++); \
memcpy(tags, arg->member, arg->member ## _len); \
tags = (unsigned short *)((char *)tags + arg->member ## _len);
-#include "linux/drbd_nl.h"
+#include <linux/drbd_nl.h>
void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
void drbd_nl_send_reply(struct cn_msg *, int);
@@ -147,6 +147,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
char *argv[] = {usermode_helper, cmd, mb, NULL };
int ret;
+ if (current == mdev->worker.task)
+ set_bit(CALLBACK_PENDING, &mdev->flags);
+
snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
if (get_net_conf(mdev)) {
@@ -179,7 +182,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
drbd_bcast_ev_helper(mdev, cmd);
- ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret)
dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
@@ -189,6 +192,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
+ if (current == mdev->worker.task)
+ clear_bit(CALLBACK_PENDING, &mdev->flags);
+
if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0;
@@ -289,7 +295,7 @@ static int _try_outdate_peer_async(void *data)
*/
spin_lock_irq(&mdev->req_lock);
ns = mdev->state;
- if (ns.conn < C_WF_REPORT_PARAMS) {
+ if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
ns.pdsk = nps;
_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
}
@@ -432,7 +438,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
/* if this was forced, we should consider sync */
if (forced)
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_current_state(mdev);
}
drbd_md_sync(mdev);
@@ -668,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
la_size_changed && md_moved ? "size changed and md moved" :
la_size_changed ? "size changed" : "md moved");
/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
- err = drbd_bitmap_io(mdev, &drbd_bm_write,
- "size changed", BM_LOCKED_MASK);
+ err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+ "size changed", BM_LOCKED_MASK);
if (err) {
rv = dev_size_error;
goto out;
@@ -795,8 +801,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
{
struct request_queue * const q = mdev->rq_queue;
- int max_hw_sectors = max_bio_size >> 9;
- int max_segments = 0;
+ unsigned int max_hw_sectors = max_bio_size >> 9;
+ unsigned int max_segments = 0;
if (get_ldev_if_state(mdev, D_ATTACHING)) {
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
@@ -829,7 +835,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_
void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
{
- int now, new, local, peer;
+ unsigned int now, new, local, peer;
now = queue_max_hw_sectors(mdev->rq_queue) << 9;
local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */
@@ -840,23 +846,25 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
mdev->local_max_bio_size = local;
put_ldev(mdev);
}
+ local = min(local, DRBD_MAX_BIO_SIZE);
/* We may ignore peer limits if the peer is modern enough.
Because new from 8.3.8 onwards the peer can use multiple
BIOs for a single peer_request */
if (mdev->state.conn >= C_CONNECTED) {
- if (mdev->agreed_pro_version < 94)
- peer = mdev->peer_max_bio_size;
- else if (mdev->agreed_pro_version == 94)
+ if (mdev->agreed_pro_version < 94) {
+ peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+ } else if (mdev->agreed_pro_version == 94)
peer = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
peer = DRBD_MAX_BIO_SIZE;
}
- new = min_t(int, local, peer);
+ new = min(local, peer);
if (mdev->state.role == R_PRIMARY && new < now)
- dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
+ dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
if (new != now)
dev_info(DEV, "max BIO size = %u\n", new);
@@ -949,6 +957,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
* to realize a "hot spare" feature (not that I'd recommend that) */
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+ /* make sure there is no leftover from previous force-detach attempts */
+ clear_bit(FORCE_DETACH, &mdev->flags);
+
+ /* and no leftover from previously aborted resync or verify, either */
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
+
/* allocation not in the IO path, cqueue thread context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
if (!nbc) {
@@ -1032,7 +1048,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
(unsigned long long) drbd_get_max_capacity(nbc),
(unsigned long long) nbc->dc.disk_size);
- retcode = ERR_DISK_TO_SMALL;
+ retcode = ERR_DISK_TOO_SMALL;
goto fail;
}
@@ -1046,7 +1062,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
- retcode = ERR_MD_DISK_TO_SMALL;
+ retcode = ERR_MD_DISK_TOO_SMALL;
dev_warn(DEV, "refusing attach: md-device too small, "
"at least %llu sectors needed for this meta-disk type\n",
(unsigned long long) min_md_device_sectors);
@@ -1057,7 +1073,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
* (we may currently be R_PRIMARY with no local disk...) */
if (drbd_get_max_capacity(nbc) <
drbd_get_capacity(mdev->this_bdev)) {
- retcode = ERR_DISK_TO_SMALL;
+ retcode = ERR_DISK_TOO_SMALL;
goto fail;
}
@@ -1138,7 +1154,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
dev_warn(DEV, "refusing to truncate a consistent device\n");
- retcode = ERR_DISK_TO_SMALL;
+ retcode = ERR_DISK_TOO_SMALL;
goto force_diskless_dec;
}
@@ -1336,17 +1352,35 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
{
enum drbd_ret_code retcode;
int ret;
+ struct detach dt = {};
+
+ if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
+ reply->ret_code = ERR_MANDATORY_TAG;
+ goto out;
+ }
+
+ if (dt.detach_force) {
+ set_bit(FORCE_DETACH, &mdev->flags);
+ drbd_force_state(mdev, NS(disk, D_FAILED));
+ reply->ret_code = SS_SUCCESS;
+ goto out;
+ }
+
drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
+ drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
+ drbd_md_put_buffer(mdev);
/* D_FAILED will transition to DISKLESS. */
ret = wait_event_interruptible(mdev->misc_wait,
mdev->state.disk != D_FAILED);
drbd_resume_io(mdev);
+
if ((int)retcode == (int)SS_IS_DISKLESS)
retcode = SS_NOTHING_TO_DO;
if (ret)
retcode = ERR_INTR;
reply->ret_code = retcode;
+out:
return 0;
}
@@ -1711,7 +1745,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
if (rs.no_resync && mdev->agreed_pro_version < 93) {
retcode = ERR_NEED_APV_93;
- goto fail;
+ goto fail_ldev;
}
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
@@ -1738,6 +1772,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
fail:
reply->ret_code = retcode;
return 0;
+
+ fail_ldev:
+ put_ldev(mdev);
+ goto fail;
}
static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1940,8 +1978,11 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
int retcode;
/* If there is still bitmap IO pending, probably because of a previous
- * resync just being finished, wait for it before requesting a new resync. */
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(mdev);
wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+ drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1959,6 +2000,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
}
+ drbd_resume_io(mdev);
reply->ret_code = retcode;
return 0;
@@ -1979,8 +2021,11 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
int retcode;
/* If there is still bitmap IO pending, probably because of a previous
- * resync just being finished, wait for it before requesting a new resync. */
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(mdev);
wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+ drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
@@ -1998,6 +2043,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
} else
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
}
+ drbd_resume_io(mdev);
reply->ret_code = retcode;
return 0;
@@ -2170,11 +2216,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
/* If there is still bitmap IO pending, e.g. previous resync or verify
* just being finished, wait for it before requesting a new resync. */
+ drbd_suspend_io(mdev);
wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
/* w_make_ov_request expects position to be aligned */
mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+ drbd_resume_io(mdev);
return 0;
}
@@ -2297,7 +2345,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
return;
}
- if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) {
+ if (!capable(CAP_SYS_ADMIN)) {
retcode = ERR_PERM;
goto fail;
}
@@ -2526,10 +2574,10 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
page = e->pages;
page_chain_for_each(page) {
- void *d = kmap_atomic(page, KM_USER0);
+ void *d = kmap_atomic(page);
unsigned l = min_t(unsigned, len, PAGE_SIZE);
memcpy(tl, d, l);
- kunmap_atomic(d, KM_USER0);
+ kunmap_atomic(d);
tl = (unsigned short*)((char*)tl + l);
len -= l;
if (len == 0)
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2959cdfb77f5..5496104f90b9 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
if (unlikely(v >= 1000000)) {
/* cool: > GiByte/s */
seq_printf(seq, "%ld,", v / 1000000);
- v /= 1000000;
+ v %= 1000000;
seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
} else if (likely(v >= 1000))
seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
@@ -245,6 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
mdev->state.role == R_SECONDARY) {
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
+ /* reset mdev->congestion_reason */
+ bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
+
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 43beaca53179..c74ca2df7431 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -277,6 +277,9 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
+ if (page == NULL)
+ return;
+
if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
i = page_chain_free(page);
else {
@@ -316,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
gfp_t gfp_mask) __must_hold(local)
{
struct drbd_epoch_entry *e;
- struct page *page;
+ struct page *page = NULL;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
@@ -329,9 +332,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
return NULL;
}
- page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
- if (!page)
- goto fail;
+ if (data_size) {
+ page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+ if (!page)
+ goto fail;
+ }
INIT_HLIST_NODE(&e->collision);
e->epoch = NULL;
@@ -466,6 +471,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what,
goto out;
}
(*newsock)->ops = sock->ops;
+ __module_get((*newsock)->ops->owner);
out:
return err;
@@ -664,7 +670,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
timeo = mdev->net_conf->try_connect_int * HZ;
timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
- s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
+ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
s_listen->sk->sk_rcvtimeo = timeo;
s_listen->sk->sk_sndtimeo = timeo;
drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
@@ -750,6 +756,7 @@ static int drbd_connect(struct drbd_conf *mdev)
{
struct socket *s, *sock, *msock;
int try, h, ok;
+ enum drbd_state_rv rv;
D_ASSERT(!mdev->data.socket);
@@ -841,8 +848,8 @@ retry:
}
} while (1);
- msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
- sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+ msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
sock->sk->sk_allocation = GFP_NOIO;
msock->sk->sk_allocation = GFP_NOIO;
@@ -888,25 +895,32 @@ retry:
}
}
- if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
- return 0;
-
sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
atomic_set(&mdev->packet_seq, 0);
mdev->peer_seq = 0;
- drbd_thread_start(&mdev->asender);
-
if (drbd_send_protocol(mdev) == -1)
return -1;
+ set_bit(STATE_SENT, &mdev->flags);
drbd_send_sync_param(mdev, &mdev->sync_conf);
drbd_send_sizes(mdev, 0, 0);
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_current_state(mdev);
clear_bit(USE_DEGR_WFC_T, &mdev->flags);
clear_bit(RESIZE_PENDING, &mdev->flags);
+
+ spin_lock_irq(&mdev->req_lock);
+ rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
+ if (mdev->state.conn != C_WF_REPORT_PARAMS)
+ clear_bit(STATE_SENT, &mdev->flags);
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (rv < SS_SUCCESS)
+ return 0;
+
+ drbd_thread_start(&mdev->asender);
mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
return 1;
@@ -957,7 +971,7 @@ static void drbd_flush(struct drbd_conf *mdev)
rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
NULL);
if (rv) {
- dev_err(DEV, "local disk flush failed with status %d\n", rv);
+ dev_info(DEV, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
* don't try again for ANY return value != 0
* if (rv == -EOPNOTSUPP) */
@@ -1001,13 +1015,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
if (epoch_size != 0 &&
atomic_read(&epoch->active) == 0 &&
- test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
+ (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
if (!(ev & EV_CLEANUP)) {
spin_unlock(&mdev->epoch_lock);
drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
spin_lock(&mdev->epoch_lock);
}
- dec_unacked(mdev);
+ if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+ dec_unacked(mdev);
if (mdev->current_epoch != epoch) {
next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
@@ -1096,7 +1111,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
- * request in more than one bio. */
+ * request in more than one bio.
+ *
+ * Plain bio_alloc is good enough here, this is no DRBD internally
+ * generated bio, but a bio allocated on behalf of the peer.
+ */
next_bio:
bio = bio_alloc(GFP_NOIO, nr_pages);
if (!bio) {
@@ -1256,7 +1275,6 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
data_size -= dgs;
- ERR_IF(data_size == 0) return NULL;
ERR_IF(data_size & 0x1ff) return NULL;
ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
@@ -1277,6 +1295,9 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
if (!e)
return NULL;
+ if (!data_size)
+ return e;
+
ds = data_size;
page = e->pages;
page_chain_for_each(page) {
@@ -1583,6 +1604,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
return ok;
}
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+{
+
+ struct drbd_epoch_entry *rs_e;
+ bool rv = 0;
+
+ spin_lock_irq(&mdev->req_lock);
+ list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
+ if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+ rv = 1;
+ break;
+ }
+ }
+ spin_unlock_irq(&mdev->req_lock);
+
+ return rv;
+}
+
/* Called from receive_Data.
* Synchronize packets on sock with packets on msock.
*
@@ -1683,6 +1722,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
dp_flags = be32_to_cpu(p->dp_flags);
rw |= wire_flags_to_bio(mdev, dp_flags);
+ if (e->pages == NULL) {
+ D_ASSERT(e->size == 0);
+ D_ASSERT(dp_flags & DP_FLUSH);
+ }
if (dp_flags & DP_MAY_SET_IN_SYNC)
e->flags |= EE_MAY_SET_IN_SYNC;
@@ -1826,6 +1869,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
list_add(&e->w.list, &mdev->active_ee);
spin_unlock_irq(&mdev->req_lock);
+ if (mdev->state.conn == C_SYNC_TARGET)
+ wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
+
switch (mdev->net_conf->wire_protocol) {
case DRBD_PROT_C:
inc_unacked(mdev);
@@ -2420,7 +2466,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
- dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
+ dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
return -1;
@@ -2806,10 +2852,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
if (apv >= 88) {
if (apv == 88) {
- if (data_size > SHARED_SECRET_MAX) {
- dev_err(DEV, "verify-alg too long, "
- "peer wants %u, accepting only %u byte\n",
- data_size, SHARED_SECRET_MAX);
+ if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+ dev_err(DEV, "verify-alg of wrong size, "
+ "peer wants %u, accepting only up to %u byte\n",
+ data_size, SHARED_SECRET_MAX);
return false;
}
@@ -3168,9 +3214,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
os = ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
- /* peer says his disk is uptodate, while we think it is inconsistent,
- * and this happens while we think we have a sync going on. */
- if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+ /* If some other part of the code (asender thread, timeout)
+ * already decided to close the connection again,
+ * we must not "re-establish" it here. */
+ if (os.conn <= C_TEAR_DOWN)
+ return false;
+
+ /* If this is the "end of sync" confirmation, usually the peer disk
+ * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+ * set) resync started in PausedSyncT, or if the timing of pause-/
+ * unpause-sync events has been "just right", the peer disk may
+ * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+ */
+ if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+ real_peer_disk == D_UP_TO_DATE &&
os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
/* If we are (becoming) SyncSource, but peer is still in sync
* preparation, ignore its uptodate-ness to avoid flapping, it
@@ -3288,7 +3345,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
/* Nowadays only used when forcing a node into primary role and
setting its disk to UpToDate with that */
drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_current_state(mdev);
}
}
@@ -3755,11 +3812,18 @@ void drbd_free_tl_hash(struct drbd_conf *mdev)
mdev->ee_hash = NULL;
mdev->ee_hash_s = 0;
- /* paranoia code */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
- (int)(h - mdev->tl_hash), h->first);
+ /* We may not have had the chance to wait for all locally pending
+ * application requests. The hlist_add_fake() prevents access after
+ * free on master bio completion. */
+ for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) {
+ struct drbd_request *req;
+ struct hlist_node *pos, *n;
+ hlist_for_each_entry_safe(req, pos, n, h, collision) {
+ hlist_del_init(&req->collision);
+ hlist_add_fake(&req->collision);
+ }
+ }
+
kfree(mdev->tl_hash);
mdev->tl_hash = NULL;
mdev->tl_hash_s = 0;
@@ -3776,6 +3840,13 @@ static void drbd_disconnect(struct drbd_conf *mdev)
if (mdev->state.conn == C_STANDALONE)
return;
+ /* We are about to start the cleanup after connection loss.
+ * Make sure drbd_make_request knows about that.
+ * Usually we should be in some network failure state already,
+ * but just in case we are not, we fix it up here.
+ */
+ drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+
/* asender does not clean up anything. it must not interfere, either */
drbd_thread_stop(&mdev->asender);
drbd_free_sock(mdev);
@@ -3803,8 +3874,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
atomic_set(&mdev->rs_pending_cnt, 0);
wake_up(&mdev->misc_wait);
- del_timer(&mdev->request_timer);
-
/* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
resync_timer_fn((unsigned long)mdev);
@@ -4433,7 +4502,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
if (mdev->state.conn == C_AHEAD &&
atomic_read(&mdev->ap_in_flight) == 0 &&
- !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
+ !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
mdev->start_resync_timer.expires = jiffies + HZ;
add_timer(&mdev->start_resync_timer);
}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3424d675b769..01b2ac641c7b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
const int rw = bio_data_dir(bio);
int cpu;
cpu = part_stat_lock();
+ part_round_stats(cpu, &mdev->vdisk->part0);
part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
{
const unsigned long s = req->rq_state;
struct drbd_conf *mdev = req->mdev;
- /* only WRITES may end up here without a master bio (on barrier ack) */
- int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+ int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
/* we must not complete the master bio, while it is
* still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
return;
if (s & RQ_NET_PENDING)
return;
- if (s & RQ_LOCAL_PENDING)
+ if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
return;
if (req->master_bio) {
@@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
req->master_bio = NULL;
}
+ if (s & RQ_LOCAL_PENDING)
+ return;
+
if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
/* this is disconnected (local only) operation,
* or protocol C P_WRITE_ACK,
@@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
break;
case completed_ok:
- if (bio_data_dir(req->master_bio) == WRITE)
+ if (req->rq_state & RQ_WRITE)
mdev->writ_cnt += req->size>>9;
else
mdev->read_cnt += req->size>>9;
@@ -438,16 +441,22 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done_not_susp(req, m);
- put_ldev(mdev);
+ break;
+
+ case abort_disk_io:
+ req->rq_state |= RQ_LOCAL_ABORTED;
+ if (req->rq_state & RQ_WRITE)
+ _req_may_be_done_not_susp(req, m);
+ else
+ goto goto_queue_for_net_read;
break;
case write_completed_with_error:
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
- __drbd_chk_io_error(mdev, false);
+ __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
_req_may_be_done_not_susp(req, m);
- put_ldev(mdev);
break;
case read_ahead_completed_with_error:
@@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done_not_susp(req, m);
- put_ldev(mdev);
break;
case read_completed_with_error:
@@ -464,10 +472,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
- D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ if (req->rq_state & RQ_LOCAL_ABORTED) {
+ _req_may_be_done(req, m);
+ break;
+ }
- __drbd_chk_io_error(mdev, false);
- put_ldev(mdev);
+ __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
+
+ goto_queue_for_net_read:
+
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
/* no point in retrying if there is no good remote data,
* or we have no connection. */
@@ -556,10 +570,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
drbd_queue_work(&mdev->data.work, &req->w);
break;
- case oos_handed_to_network:
- /* actually the same */
+ case read_retry_remote_canceled:
case send_canceled:
- /* treat it the same */
case send_failed:
/* real cleanup will be done from tl_clear. just update flags
* so it is no longer marked as on the worker queue */
@@ -589,17 +601,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
}
req->rq_state &= ~RQ_NET_QUEUED;
req->rq_state |= RQ_NET_SENT;
- /* because _drbd_send_zc_bio could sleep, and may want to
- * dereference the bio even after the "write_acked_by_peer" and
- * "completed_ok" events came in, once we return from
- * _drbd_send_zc_bio (drbd_send_dblock), we have to check
- * whether it is done already, and end it. */
_req_may_be_done_not_susp(req, m);
break;
- case read_retry_remote_canceled:
+ case oos_handed_to_network:
+ /* Was not set PENDING, no longer QUEUED, so is now DONE
+ * as far as this connection is concerned. */
req->rq_state &= ~RQ_NET_QUEUED;
- /* fall through, in case we raced with drbd_disconnect */
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done_not_susp(req, m);
+ break;
+
case connection_lost_while_pending:
/* transfer log cleanup after connection loss */
/* assert something? */
@@ -616,8 +628,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
_req_may_be_done(req, m); /* Allowed while state.susp */
break;
- case write_acked_by_peer_and_sis:
- req->rq_state |= RQ_NET_SIS;
case conflict_discarded_by_peer:
/* for discarded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl, potential
@@ -628,18 +638,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
(unsigned long long)req->sector, req->size);
req->rq_state |= RQ_NET_DONE;
/* fall through */
+ case write_acked_by_peer_and_sis:
case write_acked_by_peer:
+ if (what == write_acked_by_peer_and_sis)
+ req->rq_state |= RQ_NET_SIS;
/* protocol C; successfully written on peer.
- * Nothing to do here.
+ * Nothing more to do here.
* We want to keep the tl in place for all protocols, to cater
- * for volatile write-back caches on lower level devices.
- *
- * A barrier request is expected to have forced all prior
- * requests onto stable storage, so completion of a barrier
- * request could set NET_DONE right here, and not wait for the
- * P_BARRIER_ACK, but that is an unnecessary optimization. */
+ * for volatile write-back caches on lower level devices. */
- /* this makes it effectively the same as for: */
case recv_acked_by_peer:
/* protocol B; pretends to be successfully written on peer.
* see also notes above in handed_over_to_network about
@@ -688,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
break;
case resend:
+ /* Simply complete (local only) READs. */
+ if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+ _req_may_be_done(req, m);
+ break;
+ }
+
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
before the connection loss (B&C only); only P_BARRIER_ACK was missing.
Trowing them out of the TL here by pretending we got a BARRIER_ACK
@@ -763,6 +776,40 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
}
+static void maybe_pull_ahead(struct drbd_conf *mdev)
+{
+ int congested = 0;
+
+ /* If I don't even have good local storage, we can not reasonably try
+ * to pull ahead of the peer. We also need the local reference to make
+ * sure mdev->act_log is there.
+ * Note: caller has to make sure that net_conf is there.
+ */
+ if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
+ return;
+
+ if (mdev->net_conf->cong_fill &&
+ atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+ dev_info(DEV, "Congestion-fill threshold reached\n");
+ congested = 1;
+ }
+
+ if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+ dev_info(DEV, "Congestion-extents threshold reached\n");
+ congested = 1;
+ }
+
+ if (congested) {
+ queue_barrier(mdev); /* last barrier, after mirrored writes */
+
+ if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+ _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+ else /*mdev->net_conf->on_congestion == OC_DISCONNECT */
+ _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+ }
+ put_ldev(mdev);
+}
+
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
const int rw = bio_rw(bio);
@@ -773,6 +820,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
int local, remote, send_oos = 0;
int err = -EIO;
int ret = 0;
+ union drbd_state s;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
@@ -792,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
req->private_bio = NULL;
}
if (rw == WRITE) {
- remote = 1;
+ /* Need to replicate writes. Unless it is an empty flush,
+ * which is better mapped to a DRBD P_BARRIER packet,
+ * also for drbd wire protocol compatibility reasons. */
+ if (unlikely(size == 0)) {
+ /* The only size==0 bios we expect are empty flushes. */
+ D_ASSERT(bio->bi_rw & REQ_FLUSH);
+ remote = 0;
+ } else
+ remote = 1;
} else {
/* READ || READA */
if (local) {
@@ -828,14 +884,18 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
* extent. This waits for any resync activity in the corresponding
* resync extent to finish, and, if necessary, pulls in the target
* extent into the activity log, which involves further disk io because
- * of transactional on-disk meta data updates. */
- if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ * of transactional on-disk meta data updates.
+ * Empty flushes don't need to go into the activity log, they can only
+ * flush data for pending writes which are already in there. */
+ if (rw == WRITE && local && size
+ && !test_bit(AL_SUSPENDED, &mdev->flags)) {
req->rq_state |= RQ_IN_ACT_LOG;
drbd_al_begin_io(mdev, sector);
}
- remote = remote && drbd_should_do_remote(mdev->state);
- send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+ s = mdev->state;
+ remote = remote && drbd_should_do_remote(s);
+ send_oos = rw == WRITE && drbd_should_send_oos(s);
D_ASSERT(!(remote && send_oos));
if (!(local || remote) && !is_susp(mdev->state)) {
@@ -867,7 +927,7 @@ allocate_barrier:
if (is_susp(mdev->state)) {
/* If we got suspended, use the retry mechanism of
- generic_make_request() to restart processing of this
+ drbd_make_request() to restart processing of this
bio. In the next call to drbd_make_request
we sleep in inc_ap_bio() */
ret = 1;
@@ -951,7 +1011,10 @@ allocate_barrier:
if (rw == WRITE && _req_conflicts(req))
goto fail_conflicting;
- list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+ /* no point in adding empty flushes to the transfer log,
+ * they are mapped to drbd barriers already. */
+ if (likely(size!=0))
+ list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
@@ -968,29 +1031,16 @@ allocate_barrier:
_req_mod(req, queue_for_send_oos);
if (remote &&
- mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
- int congested = 0;
-
- if (mdev->net_conf->cong_fill &&
- atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
- dev_info(DEV, "Congestion-fill threshold reached\n");
- congested = 1;
- }
-
- if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
- dev_info(DEV, "Congestion-extents threshold reached\n");
- congested = 1;
- }
-
- if (congested) {
- queue_barrier(mdev); /* last barrier, after mirrored writes */
-
- if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
- _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
- else /*mdev->net_conf->on_congestion == OC_DISCONNECT */
- _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
- }
- }
+ mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
+ maybe_pull_ahead(mdev);
+
+ /* If this was a flush, queue a drbd barrier/start a new epoch.
+ * Unless the current epoch was empty anyways, or we are not currently
+ * replicating, in which case there is no point. */
+ if (unlikely(bio->bi_rw & REQ_FLUSH)
+ && mdev->newest_tle->n_writes
+ && drbd_should_do_remote(mdev->state))
+ queue_barrier(mdev);
spin_unlock_irq(&mdev->req_lock);
kfree(b); /* if someone else has beaten us to it... */
@@ -1073,7 +1123,7 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
return 0;
}
-int drbd_make_request(struct request_queue *q, struct bio *bio)
+void drbd_make_request(struct request_queue *q, struct bio *bio)
{
unsigned int s_enr, e_enr;
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
@@ -1081,7 +1131,7 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
bio_endio(bio, -EPERM);
- return 0;
+ return;
}
start_time = jiffies;
@@ -1089,18 +1139,18 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
/*
* what we "blindly" assume:
*/
- D_ASSERT(bio->bi_size > 0);
D_ASSERT((bio->bi_size & 0x1ff) == 0);
- D_ASSERT(bio->bi_idx == 0);
/* to make some things easier, force alignment of requests within the
* granularity of our hash tables */
s_enr = bio->bi_sector >> HT_SHIFT;
- e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+ e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
if (likely(s_enr == e_enr)) {
- inc_ap_bio(mdev, 1);
- return drbd_make_request_common(mdev, bio, start_time);
+ do {
+ inc_ap_bio(mdev, 1);
+ } while (drbd_make_request_common(mdev, bio, start_time));
+ return;
}
/* can this bio be split generically?
@@ -1148,7 +1198,6 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
bio_pair_release(bp);
}
- return 0;
}
/* This is called by bio_add_page(). With this function we reduce
@@ -1196,36 +1245,66 @@ void request_timer_fn(unsigned long data)
struct drbd_conf *mdev = (struct drbd_conf *) data;
struct drbd_request *req; /* oldest request */
struct list_head *le;
- unsigned long et = 0; /* effective timeout = ko_count * timeout */
+ unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+ unsigned long now;
if (get_net_conf(mdev)) {
- et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+ if (mdev->state.conn >= C_WF_REPORT_PARAMS)
+ ent = mdev->net_conf->timeout*HZ/10
+ * mdev->net_conf->ko_count;
put_net_conf(mdev);
}
- if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+ if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
+ dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+ put_ldev(mdev);
+ }
+ et = min_not_zero(dt, ent);
+
+ if (!et)
return; /* Recurring timer stopped */
+ now = jiffies;
+
spin_lock_irq(&mdev->req_lock);
le = &mdev->oldest_tle->requests;
if (list_empty(le)) {
spin_unlock_irq(&mdev->req_lock);
- mod_timer(&mdev->request_timer, jiffies + et);
+ mod_timer(&mdev->request_timer, now + et);
return;
}
le = le->prev;
req = list_entry(le, struct drbd_request, tl_requests);
- if (time_is_before_eq_jiffies(req->start_time + et)) {
- if (req->rq_state & RQ_NET_PENDING) {
- dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
- _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
- } else {
- dev_warn(DEV, "Local backing block device frozen?\n");
- mod_timer(&mdev->request_timer, jiffies + et);
- }
- } else {
- mod_timer(&mdev->request_timer, req->start_time + et);
- }
+ /* The request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ * with above state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ * resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ * for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ * !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+ if (ent && req->rq_state & RQ_NET_PENDING &&
+ time_after(now, req->start_time + ent) &&
+ !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+ dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+ _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+ }
+ if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+ time_after(now, req->start_time + dt) &&
+ !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
+ dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
+ __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH);
+ }
+ nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
spin_unlock_irq(&mdev->req_lock);
+ mod_timer(&mdev->request_timer, nt);
}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 68a234a5fdc5..3d2111919486 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -105,6 +105,7 @@ enum drbd_req_event {
read_completed_with_error,
read_ahead_completed_with_error,
write_completed_with_error,
+ abort_disk_io,
completed_ok,
resend,
fail_frozen_disk_io,
@@ -118,18 +119,21 @@ enum drbd_req_event {
* same time, so we should hold the request lock anyways.
*/
enum drbd_req_state_bits {
- /* 210
- * 000: no local possible
- * 001: to be submitted
+ /* 3210
+ * 0000: no local possible
+ * 0001: to be submitted
* UNUSED, we could map: 011: submitted, completion still pending
- * 110: completed ok
- * 010: completed with error
+ * 0110: completed ok
+ * 0010: completed with error
+ * 1001: Aborted (before completion)
+ * 1x10: Aborted and completed -> free
*/
__RQ_LOCAL_PENDING,
__RQ_LOCAL_COMPLETED,
__RQ_LOCAL_OK,
+ __RQ_LOCAL_ABORTED,
- /* 76543
+ /* 87654
* 00000: no network possible
* 00001: to be send
* 00011: to be send, on worker queue
@@ -199,8 +203,9 @@ enum drbd_req_state_bits {
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
-#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d3e6f6213ba..6bce2cc179d4 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -70,11 +70,29 @@ rwlock_t global_state_lock;
void drbd_md_io_complete(struct bio *bio, int error)
{
struct drbd_md_io *md_io;
+ struct drbd_conf *mdev;
md_io = (struct drbd_md_io *)bio->bi_private;
+ mdev = container_of(md_io, struct drbd_conf, md_io);
+
md_io->error = error;
- complete(&md_io->event);
+ /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+ * to timeout on the lower level device, and eventually detach from it.
+ * If this io completion runs after that timeout expired, this
+ * drbd_md_put_buffer() may allow us to finally try and re-attach.
+ * During normal operation, this only puts that extra reference
+ * down to 1 again.
+ * Make sure we first drop the reference, and only then signal
+ * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+ * next drbd_md_sync_page_io(), that we trigger the
+ * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
+ */
+ drbd_md_put_buffer(mdev);
+ md_io->done = 1;
+ wake_up(&mdev->misc_wait);
+ bio_put(bio);
+ put_ldev(mdev);
}
/* reads on behalf of the partner,
@@ -93,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait);
if (test_bit(__EE_WAS_ERROR, &e->flags))
- __drbd_chk_io_error(mdev, false);
+ __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
spin_unlock_irqrestore(&mdev->req_lock, flags);
drbd_queue_work(&mdev->data.work, &e->w);
@@ -136,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
: list_empty(&mdev->active_ee);
if (test_bit(__EE_WAS_ERROR, &e->flags))
- __drbd_chk_io_error(mdev, false);
+ __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
spin_unlock_irqrestore(&mdev->req_lock, flags);
if (is_syncer_req)
@@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error)
spin_lock_irqsave(&mdev->req_lock, flags);
__req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
+ put_ldev(mdev);
if (m.bio)
complete_master_bio(mdev, &m);
@@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
sg_init_table(&sg, 1);
crypto_hash_init(&desc);
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment(bvec, bio, i) {
sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
crypto_hash_update(&desc, &sg, sg.length);
}
@@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
drbd_start_resync(mdev, C_SYNC_SOURCE);
- clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
+ clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
return 1;
}
@@ -1482,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
return;
}
- if (mdev->state.conn < C_AHEAD) {
- /* In case a previous resync run was aborted by an IO error/detach on the peer. */
- drbd_rs_cancel_all(mdev);
- /* This should be done when we abort the resync. We definitely do not
- want to have this for connections going back and forth between
- Ahead/Behind and SyncSource/SyncTarget */
- }
-
if (side == C_SYNC_TARGET) {
/* Since application IO was locked out during C_WF_BITMAP_T and
C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
@@ -1519,14 +1530,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
}
drbd_state_lock(mdev);
-
+ write_lock_irq(&global_state_lock);
if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ write_unlock_irq(&global_state_lock);
drbd_state_unlock(mdev);
return;
}
- write_lock_irq(&global_state_lock);
- ns = mdev->state;
+ ns.i = mdev->state.i;
ns.aftr_isp = !_drbd_may_sync_now(mdev);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 9955a53733b2..a7d6347aaa79 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -188,10 +188,10 @@ static int print_unex = 1;
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/mod_devicetable.h>
-#include <linux/buffer_head.h> /* for invalidate_buffers() */
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/uaccess.h>
+#include <linux/async.h>
/*
* PS/2 floppies have much slower step rates than regular floppies.
@@ -203,7 +203,6 @@ static int slow_floppy;
#include <asm/dma.h>
#include <asm/irq.h>
-#include <asm/system.h>
static int FLOPPY_IRQ = 6;
static int FLOPPY_DMA = 2;
@@ -553,7 +552,7 @@ static void floppy_ready(void);
static void floppy_start(void);
static void process_fd_request(void);
static void recalibrate_floppy(void);
-static void floppy_shutdown(unsigned long);
+static void floppy_shutdown(struct work_struct *);
static int floppy_request_regions(int);
static void floppy_release_regions(int);
@@ -590,6 +589,8 @@ static int buffer_max = -1;
static struct floppy_fdc_state fdc_state[N_FDC];
static int fdc; /* current fdc */
+static struct workqueue_struct *floppy_wq;
+
static struct floppy_struct *_floppy = floppy_type;
static unsigned char current_drive;
static long current_count_sectors;
@@ -631,16 +632,15 @@ static inline void set_debugt(void) { }
static inline void debugt(const char *func, const char *msg) { }
#endif /* DEBUGT */
-typedef void (*timeout_fn)(unsigned long);
-static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0);
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
static const char *timeout_message;
static void is_alive(const char *func, const char *message)
{
/* this routine checks whether the floppy driver is "alive" */
if (test_bit(0, &fdc_busy) && command_status < 2 &&
- !timer_pending(&fd_timeout)) {
+ !delayed_work_pending(&fd_timeout)) {
DPRINT("%s: timeout handler died. %s\n", func, message);
}
}
@@ -668,15 +668,19 @@ static int output_log_pos;
static void __reschedule_timeout(int drive, const char *message)
{
+ unsigned long delay;
+
if (drive == current_reqD)
drive = current_drive;
- del_timer(&fd_timeout);
+ __cancel_delayed_work(&fd_timeout);
+
if (drive < 0 || drive >= N_DRIVE) {
- fd_timeout.expires = jiffies + 20UL * HZ;
+ delay = 20UL * HZ;
drive = 0;
} else
- fd_timeout.expires = jiffies + UDP->timeout;
- add_timer(&fd_timeout);
+ delay = UDP->timeout;
+
+ queue_delayed_work(floppy_wq, &fd_timeout, delay);
if (UDP->flags & FD_DEBUG)
DPRINT("reschedule timeout %s\n", message);
timeout_message = message;
@@ -874,7 +878,7 @@ static int lock_fdc(int drive, bool interruptible)
command_status = FD_COMMAND_NONE;
- __reschedule_timeout(drive, "lock fdc");
+ reschedule_timeout(drive, "lock fdc");
set_fdc(drive);
return 0;
}
@@ -882,23 +886,15 @@ static int lock_fdc(int drive, bool interruptible)
/* unlocks the driver */
static void unlock_fdc(void)
{
- unsigned long flags;
-
- raw_cmd = NULL;
if (!test_bit(0, &fdc_busy))
DPRINT("FDC access conflict!\n");
- if (do_floppy)
- DPRINT("device interrupt still active at FDC release: %pf!\n",
- do_floppy);
+ raw_cmd = NULL;
command_status = FD_COMMAND_NONE;
- spin_lock_irqsave(&floppy_lock, flags);
- del_timer(&fd_timeout);
+ __cancel_delayed_work(&fd_timeout);
+ do_floppy = NULL;
cont = NULL;
clear_bit(0, &fdc_busy);
- if (current_req || set_next_request())
- do_fd_request(current_req->q);
- spin_unlock_irqrestore(&floppy_lock, flags);
wake_up(&fdc_wait);
}
@@ -970,26 +966,24 @@ static DECLARE_WORK(floppy_work, NULL);
static void schedule_bh(void (*handler)(void))
{
+ WARN_ON(work_pending(&floppy_work));
+
PREPARE_WORK(&floppy_work, (work_func_t)handler);
- schedule_work(&floppy_work);
+ queue_work(floppy_wq, &floppy_work);
}
-static DEFINE_TIMER(fd_timer, NULL, 0, 0);
+static DECLARE_DELAYED_WORK(fd_timer, NULL);
static void cancel_activity(void)
{
- unsigned long flags;
-
- spin_lock_irqsave(&floppy_lock, flags);
do_floppy = NULL;
- PREPARE_WORK(&floppy_work, (work_func_t)empty);
- del_timer(&fd_timer);
- spin_unlock_irqrestore(&floppy_lock, flags);
+ cancel_delayed_work_sync(&fd_timer);
+ cancel_work_sync(&floppy_work);
}
/* this function makes sure that the disk stays in the drive during the
* transfer */
-static void fd_watchdog(void)
+static void fd_watchdog(struct work_struct *arg)
{
debug_dcl(DP->flags, "calling disk change from watchdog\n");
@@ -999,21 +993,20 @@ static void fd_watchdog(void)
cont->done(0);
reset_fdc();
} else {
- del_timer(&fd_timer);
- fd_timer.function = (timeout_fn)fd_watchdog;
- fd_timer.expires = jiffies + HZ / 10;
- add_timer(&fd_timer);
+ cancel_delayed_work(&fd_timer);
+ PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog);
+ queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
}
}
static void main_command_interrupt(void)
{
- del_timer(&fd_timer);
+ cancel_delayed_work(&fd_timer);
cont->interrupt();
}
/* waits for a delay (spinup or select) to pass */
-static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
+static int fd_wait_for_completion(unsigned long expires, work_func_t function)
{
if (FDCS->reset) {
reset_fdc(); /* do the reset during sleep to win time
@@ -1022,47 +1015,15 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
return 1;
}
- if (time_before(jiffies, delay)) {
- del_timer(&fd_timer);
- fd_timer.function = function;
- fd_timer.expires = delay;
- add_timer(&fd_timer);
+ if (time_before(jiffies, expires)) {
+ cancel_delayed_work(&fd_timer);
+ PREPARE_DELAYED_WORK(&fd_timer, function);
+ queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
return 1;
}
return 0;
}
-static DEFINE_SPINLOCK(floppy_hlt_lock);
-static int hlt_disabled;
-static void floppy_disable_hlt(void)
-{
- unsigned long flags;
-
- WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
- spin_lock_irqsave(&floppy_hlt_lock, flags);
- if (!hlt_disabled) {
- hlt_disabled = 1;
-#ifdef HAVE_DISABLE_HLT
- disable_hlt();
-#endif
- }
- spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
-static void floppy_enable_hlt(void)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&floppy_hlt_lock, flags);
- if (hlt_disabled) {
- hlt_disabled = 0;
-#ifdef HAVE_DISABLE_HLT
- enable_hlt();
-#endif
- }
- spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
static void setup_DMA(void)
{
unsigned long f;
@@ -1107,7 +1068,6 @@ static void setup_DMA(void)
fd_enable_dma();
release_dma_lock(f);
#endif
- floppy_disable_hlt();
}
static void show_floppy(void);
@@ -1376,7 +1336,7 @@ static int fdc_dtr(void)
*/
FDCS->dtr = raw_cmd->rate & 3;
return fd_wait_for_completion(jiffies + 2UL * HZ / 100,
- (timeout_fn)floppy_ready);
+ (work_func_t)floppy_ready);
} /* fdc_dtr */
static void tell_sector(void)
@@ -1481,7 +1441,7 @@ static void setup_rw_floppy(void)
int flags;
int dflags;
unsigned long ready_date;
- timeout_fn function;
+ work_func_t function;
flags = raw_cmd->flags;
if (flags & (FD_RAW_READ | FD_RAW_WRITE))
@@ -1495,9 +1455,9 @@ static void setup_rw_floppy(void)
*/
if (time_after(ready_date, jiffies + DP->select_delay)) {
ready_date -= DP->select_delay;
- function = (timeout_fn)floppy_start;
+ function = (work_func_t)floppy_start;
} else
- function = (timeout_fn)setup_rw_floppy;
+ function = (work_func_t)setup_rw_floppy;
/* wait until the floppy is spinning fast enough */
if (fd_wait_for_completion(ready_date, function))
@@ -1527,7 +1487,7 @@ static void setup_rw_floppy(void)
inr = result();
cont->interrupt();
} else if (flags & FD_RAW_NEED_DISK)
- fd_watchdog();
+ fd_watchdog(NULL);
}
static int blind_seek;
@@ -1709,7 +1669,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
fd_disable_dma();
release_dma_lock(f);
- floppy_enable_hlt();
do_floppy = NULL;
if (fdc >= N_FDC || FDCS->address == -1) {
/* we don't even know which FDC is the culprit */
@@ -1837,20 +1796,22 @@ static void show_floppy(void)
pr_info("do_floppy=%pf\n", do_floppy);
if (work_pending(&floppy_work))
pr_info("floppy_work.func=%pf\n", floppy_work.func);
- if (timer_pending(&fd_timer))
- pr_info("fd_timer.function=%pf\n", fd_timer.function);
- if (timer_pending(&fd_timeout)) {
- pr_info("timer_function=%pf\n", fd_timeout.function);
- pr_info("expires=%lu\n", fd_timeout.expires - jiffies);
- pr_info("now=%lu\n", jiffies);
- }
+ if (delayed_work_pending(&fd_timer))
+ pr_info("delayed work.function=%p expires=%ld\n",
+ fd_timer.work.func,
+ fd_timer.timer.expires - jiffies);
+ if (delayed_work_pending(&fd_timeout))
+ pr_info("timer_function=%p expires=%ld\n",
+ fd_timeout.work.func,
+ fd_timeout.timer.expires - jiffies);
+
pr_info("cont=%p\n", cont);
pr_info("current_req=%p\n", current_req);
pr_info("command_status=%d\n", command_status);
pr_info("\n");
}
-static void floppy_shutdown(unsigned long data)
+static void floppy_shutdown(struct work_struct *arg)
{
unsigned long flags;
@@ -1858,8 +1819,6 @@ static void floppy_shutdown(unsigned long data)
show_floppy();
cancel_activity();
- floppy_enable_hlt();
-
flags = claim_dma_lock();
fd_disable_dma();
release_dma_lock(flags);
@@ -1905,7 +1864,7 @@ static int start_motor(void (*function)(void))
/* wait_for_completion also schedules reset if needed. */
return fd_wait_for_completion(DRS->select_date + DP->select_delay,
- (timeout_fn)function);
+ (work_func_t)function);
}
static void floppy_ready(void)
@@ -2558,8 +2517,7 @@ static int make_raw_rw_request(void)
set_fdc((long)current_req->rq_disk->private_data);
raw_cmd = &default_raw_cmd;
- raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_DISK |
- FD_RAW_NEED_SEEK;
+ raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK;
raw_cmd->cmd_count = NR_RW;
if (rq_data_dir(current_req) == READ) {
raw_cmd->flags |= FD_RAW_READ;
@@ -2858,7 +2816,6 @@ do_request:
spin_lock_irq(&floppy_lock);
pending = set_next_request();
spin_unlock_irq(&floppy_lock);
-
if (!pending) {
do_floppy = NULL;
unlock_fdc();
@@ -2935,13 +2892,15 @@ static void do_fd_request(struct request_queue *q)
current_req->cmd_flags))
return;
- if (test_bit(0, &fdc_busy)) {
+ if (test_and_set_bit(0, &fdc_busy)) {
/* fdc busy, this new request will be treated when the
current one is done */
is_alive(__func__, "old request running");
return;
}
- lock_fdc(MAXTIMEOUT, false);
+ command_status = FD_COMMAND_NONE;
+ __reschedule_timeout(MAXTIMEOUT, "fd_request");
+ set_fdc(0);
process_fd_request();
is_alive(__func__, "");
}
@@ -3649,9 +3608,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
mutex_lock(&floppy_mutex);
mutex_lock(&open_lock);
- if (UDRS->fd_ref < 0)
- UDRS->fd_ref = 0;
- else if (!UDRS->fd_ref--) {
+ if (!UDRS->fd_ref--) {
DPRINT("floppy_release with fd_ref == 0");
UDRS->fd_ref = 0;
}
@@ -3687,13 +3644,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
set_bit(FD_VERIFY_BIT, &UDRS->flags);
}
- if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL)))
- goto out2;
-
- if (mode & FMODE_EXCL)
- UDRS->fd_ref = -1;
- else
- UDRS->fd_ref++;
+ UDRS->fd_ref++;
opened_bdev[drive] = bdev;
@@ -3756,10 +3707,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
mutex_unlock(&floppy_mutex);
return 0;
out:
- if (UDRS->fd_ref < 0)
- UDRS->fd_ref = 0;
- else
- UDRS->fd_ref--;
+ UDRS->fd_ref--;
+
if (!UDRS->fd_ref)
opened_bdev[drive] = NULL;
out2:
@@ -3833,7 +3782,7 @@ static int __floppy_read_block_0(struct block_device *bdev)
bio.bi_size = size;
bio.bi_bdev = bdev;
bio.bi_sector = 0;
- bio.bi_flags = BIO_QUIET;
+ bio.bi_flags = (1 << BIO_QUIET);
init_completion(&complete);
bio.bi_private = &complete;
bio.bi_end_io = floppy_rb0_complete;
@@ -4174,7 +4123,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
return get_disk(disks[drive]);
}
-static int __init floppy_init(void)
+static int __init do_floppy_init(void)
{
int i, unit, drive;
int err, dr;
@@ -4196,10 +4145,16 @@ static int __init floppy_init(void)
goto out_put_disk;
}
+ floppy_wq = alloc_ordered_workqueue("floppy", 0);
+ if (!floppy_wq) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
if (!disks[dr]->queue) {
err = -ENOMEM;
- goto out_put_disk;
+ goto out_destroy_workq;
}
blk_queue_max_hw_sectors(disks[dr]->queue, 64);
@@ -4250,7 +4205,7 @@ static int __init floppy_init(void)
use_virtual_dma = can_use_virtual_dma & 1;
fdc_state[0].address = FDC1;
if (fdc_state[0].address == -1) {
- del_timer_sync(&fd_timeout);
+ cancel_delayed_work(&fd_timeout);
err = -ENODEV;
goto out_unreg_region;
}
@@ -4261,7 +4216,7 @@ static int __init floppy_init(void)
fdc = 0; /* reset fdc in case of unexpected interrupt */
err = floppy_grab_irq_and_dma();
if (err) {
- del_timer_sync(&fd_timeout);
+ cancel_delayed_work(&fd_timeout);
err = -EBUSY;
goto out_unreg_region;
}
@@ -4318,13 +4273,13 @@ static int __init floppy_init(void)
user_reset_fdc(-1, FD_RESET_ALWAYS, false);
}
fdc = 0;
- del_timer_sync(&fd_timeout);
+ cancel_delayed_work(&fd_timeout);
current_drive = 0;
initialized = true;
if (have_no_fdc) {
DPRINT("no floppy controllers found\n");
err = have_no_fdc;
- goto out_flush_work;
+ goto out_release_dma;
}
for (drive = 0; drive < N_DRIVE; drive++) {
@@ -4339,7 +4294,7 @@ static int __init floppy_init(void)
err = platform_device_register(&floppy_device[drive]);
if (err)
- goto out_flush_work;
+ goto out_release_dma;
err = device_create_file(&floppy_device[drive].dev,
&dev_attr_cmos);
@@ -4357,25 +4312,50 @@ static int __init floppy_init(void)
out_unreg_platform_dev:
platform_device_unregister(&floppy_device[drive]);
-out_flush_work:
- flush_work_sync(&floppy_work);
+out_release_dma:
if (atomic_read(&usage_count))
floppy_release_irq_and_dma();
out_unreg_region:
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
platform_driver_unregister(&floppy_driver);
+out_destroy_workq:
+ destroy_workqueue(floppy_wq);
out_unreg_blkdev:
unregister_blkdev(FLOPPY_MAJOR, "fd");
out_put_disk:
while (dr--) {
del_timer_sync(&motor_off_timer[dr]);
- if (disks[dr]->queue)
+ if (disks[dr]->queue) {
blk_cleanup_queue(disks[dr]->queue);
+ /*
+ * put_disk() is not paired with add_disk() and
+ * will put queue reference one extra time. fix it.
+ */
+ disks[dr]->queue = NULL;
+ }
put_disk(disks[dr]);
}
return err;
}
+#ifndef MODULE
+static __init void floppy_async_init(void *data, async_cookie_t cookie)
+{
+ do_floppy_init();
+}
+#endif
+
+static int __init floppy_init(void)
+{
+#ifdef MODULE
+ return do_floppy_init();
+#else
+ /* Don't hold up the bootup by the floppy initialization */
+ async_schedule(floppy_async_init, NULL);
+ return 0;
+#endif
+}
+
static const struct io_region {
int offset;
int size;
@@ -4428,7 +4408,7 @@ static int floppy_grab_irq_and_dma(void)
* We might have scheduled a free_irq(), wait it to
* drain first:
*/
- flush_work_sync(&floppy_work);
+ flush_workqueue(floppy_wq);
if (fd_request_irq()) {
DPRINT("Unable to grab IRQ%d for the floppy driver\n",
@@ -4504,7 +4484,6 @@ static void floppy_release_irq_and_dma(void)
#if N_FDC > 1
set_dor(1, ~8, 0);
#endif
- floppy_enable_hlt();
if (floppy_track_buffer && max_buffer_sectors) {
tmpsize = max_buffer_sectors * 1024;
@@ -4520,9 +4499,9 @@ static void floppy_release_irq_and_dma(void)
pr_info("motor off timer %d still active\n", drive);
#endif
- if (timer_pending(&fd_timeout))
+ if (delayed_work_pending(&fd_timeout))
pr_info("floppy timer still active:%s\n", timeout_message);
- if (timer_pending(&fd_timer))
+ if (delayed_work_pending(&fd_timer))
pr_info("auxiliary floppy timer still active\n");
if (work_pending(&floppy_work))
pr_info("work still pending\n");
@@ -4580,11 +4559,21 @@ static void __exit floppy_module_exit(void)
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
+
+ /*
+ * These disks have not called add_disk(). Don't put down
+ * queue reference in put_disk().
+ */
+ if (!(allowed_drive_mask & (1 << drive)) ||
+ fdc_state[FDC(drive)].version == FDC_NONE)
+ disks[drive]->queue = NULL;
+
put_disk(disks[drive]);
}
- del_timer_sync(&fd_timeout);
- del_timer_sync(&fd_timer);
+ cancel_delayed_work_sync(&fd_timeout);
+ cancel_delayed_work_sync(&fd_timer);
+ destroy_workqueue(floppy_wq);
if (atomic_read(&usage_count))
floppy_release_irq_and_dma();
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index b52c9ca146fc..bf397bf108b7 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -44,7 +44,6 @@
#define HD_IRQ 14
#define REALLY_SLOW_IO
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/uaccess.h>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4720c7ade0ae..3bba65510d23 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -69,13 +69,14 @@
#include <linux/freezer.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+
#include <asm/uaccess.h>
static DEFINE_IDR(loop_index_idr);
@@ -92,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd,
struct page *loop_page, unsigned loop_off,
int size, sector_t real_block)
{
- char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
- char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+ char *raw_buf = kmap_atomic(raw_page) + raw_off;
+ char *loop_buf = kmap_atomic(loop_page) + loop_off;
if (cmd == READ)
memcpy(loop_buf, raw_buf, size);
else
memcpy(raw_buf, loop_buf, size);
- kunmap_atomic(loop_buf, KM_USER1);
- kunmap_atomic(raw_buf, KM_USER0);
+ kunmap_atomic(loop_buf);
+ kunmap_atomic(raw_buf);
cond_resched();
return 0;
}
@@ -111,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
struct page *loop_page, unsigned loop_off,
int size, sector_t real_block)
{
- char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
- char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+ char *raw_buf = kmap_atomic(raw_page) + raw_off;
+ char *loop_buf = kmap_atomic(loop_page) + loop_off;
char *in, *out, *key;
int i, keysize;
@@ -129,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
for (i = 0; i < size; i++)
*out++ = *in++ ^ key[(i & 511) % keysize];
- kunmap_atomic(loop_buf, KM_USER1);
- kunmap_atomic(raw_buf, KM_USER0);
+ kunmap_atomic(loop_buf);
+ kunmap_atomic(raw_buf);
cond_resched();
return 0;
}
@@ -159,17 +160,19 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
&xor_funcs
};
-static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
{
- loff_t size, offset, loopsize;
+ loff_t size, loopsize;
/* Compute loopsize in bytes */
size = i_size_read(file->f_mapping->host);
- offset = lo->lo_offset;
loopsize = size - offset;
- if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
- loopsize = lo->lo_sizelimit;
+ /* offset is beyond i_size, wierd but possible */
+ if (loopsize < 0)
+ return 0;
+ if (sizelimit > 0 && sizelimit < loopsize)
+ loopsize = sizelimit;
/*
* Unfortunately, if we want to do I/O on the device,
* the number of 512-byte sectors has to fit into a sector_t.
@@ -177,17 +180,25 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
return loopsize >> 9;
}
+static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+{
+ return get_size(lo->lo_offset, lo->lo_sizelimit, file);
+}
+
static int
-figure_loop_size(struct loop_device *lo)
+figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
{
- loff_t size = get_loop_size(lo, lo->lo_backing_file);
+ loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
sector_t x = (sector_t)size;
if (unlikely((loff_t)x != size))
return -EFBIG;
-
+ if (lo->lo_offset != offset)
+ lo->lo_offset = offset;
+ if (lo->lo_sizelimit != sizelimit)
+ lo->lo_sizelimit = sizelimit;
set_capacity(lo->lo_disk, x);
- return 0;
+ return 0;
}
static inline int
@@ -203,74 +214,6 @@ lo_do_transfer(struct loop_device *lo, int cmd,
}
/**
- * do_lo_send_aops - helper for writing data to a loop device
- *
- * This is the fast version for backing filesystems which implement the address
- * space operations write_begin and write_end.
- */
-static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
- loff_t pos, struct page *unused)
-{
- struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
- struct address_space *mapping = file->f_mapping;
- pgoff_t index;
- unsigned offset, bv_offs;
- int len, ret;
-
- mutex_lock(&mapping->host->i_mutex);
- index = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
- bv_offs = bvec->bv_offset;
- len = bvec->bv_len;
- while (len > 0) {
- sector_t IV;
- unsigned size, copied;
- int transfer_result;
- struct page *page;
- void *fsdata;
-
- IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
- size = PAGE_CACHE_SIZE - offset;
- if (size > len)
- size = len;
-
- ret = pagecache_write_begin(file, mapping, pos, size, 0,
- &page, &fsdata);
- if (ret)
- goto fail;
-
- file_update_time(file);
-
- transfer_result = lo_do_transfer(lo, WRITE, page, offset,
- bvec->bv_page, bv_offs, size, IV);
- copied = size;
- if (unlikely(transfer_result))
- copied = 0;
-
- ret = pagecache_write_end(file, mapping, pos, size, copied,
- page, fsdata);
- if (ret < 0 || ret != copied)
- goto fail;
-
- if (unlikely(transfer_result))
- goto fail;
-
- bv_offs += copied;
- len -= copied;
- offset = 0;
- index++;
- pos += copied;
- }
- ret = 0;
-out:
- mutex_unlock(&mapping->host->i_mutex);
- return ret;
-fail:
- ret = -1;
- goto out;
-}
-
-/**
* __do_lo_send_write - helper for writing data to a loop device
*
* This helper just factors out common code between do_lo_send_direct_write()
@@ -297,10 +240,8 @@ static int __do_lo_send_write(struct file *file,
/**
* do_lo_send_direct_write - helper for writing data to a loop device
*
- * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations write_begin and write_end.
- * It uses the write file operation which should be present on all writeable
- * filesystems.
+ * This is the fast, non-transforming version that does not need double
+ * buffering.
*/
static int do_lo_send_direct_write(struct loop_device *lo,
struct bio_vec *bvec, loff_t pos, struct page *page)
@@ -316,15 +257,9 @@ static int do_lo_send_direct_write(struct loop_device *lo,
/**
* do_lo_send_write - helper for writing data to a loop device
*
- * This is the slow, transforming version for filesystems which do not
- * implement the address space operations write_begin and write_end. It
- * uses the write file operation which should be present on all writeable
- * filesystems.
- *
- * Using fops->write is slower than using aops->{prepare,commit}_write in the
- * transforming case because we need to double buffer the data as we cannot do
- * the transformations in place as we do not have direct access to the
- * destination pages of the backing file.
+ * This is the slow, transforming version that needs to double buffer the
+ * data as it cannot do the transformations in place without having direct
+ * access to the destination pages of the backing file.
*/
static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
loff_t pos, struct page *page)
@@ -350,17 +285,16 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
struct page *page = NULL;
int i, ret = 0;
- do_lo_send = do_lo_send_aops;
- if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+ if (lo->transfer != transfer_none) {
+ page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+ if (unlikely(!page))
+ goto fail;
+ kmap(page);
+ do_lo_send = do_lo_send_write;
+ } else {
do_lo_send = do_lo_send_direct_write;
- if (lo->transfer != transfer_none) {
- page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
- if (unlikely(!page))
- goto fail;
- kmap(page);
- do_lo_send = do_lo_send_write;
- }
}
+
bio_for_each_segment(bvec, bio, i) {
ret = do_lo_send(lo, bvec, pos, page);
if (ret < 0)
@@ -422,14 +356,14 @@ lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
return __splice_from_pipe(pipe, sd, lo_splice_actor);
}
-static int
+static ssize_t
do_lo_receive(struct loop_device *lo,
struct bio_vec *bvec, int bsize, loff_t pos)
{
struct lo_read_data cookie;
struct splice_desc sd;
struct file *file;
- long retval;
+ ssize_t retval;
cookie.lo = lo;
cookie.page = bvec->bv_page;
@@ -445,25 +379,28 @@ do_lo_receive(struct loop_device *lo,
file = lo->lo_backing_file;
retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
- if (retval < 0)
- return retval;
-
- return 0;
+ return retval;
}
static int
lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
{
struct bio_vec *bvec;
- int i, ret = 0;
+ ssize_t s;
+ int i;
bio_for_each_segment(bvec, bio, i) {
- ret = do_lo_receive(lo, bvec, bsize, pos);
- if (ret < 0)
+ s = do_lo_receive(lo, bvec, bsize, pos);
+ if (s < 0)
+ return s;
+
+ if (s != bvec->bv_len) {
+ zero_fill_bio(bio);
break;
+ }
pos += bvec->bv_len;
}
- return ret;
+ return 0;
}
static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
@@ -484,6 +421,29 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
}
}
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ if (bio->bi_rw & REQ_DISCARD) {
+ struct file *file = lo->lo_backing_file;
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+ if ((!file->f_op->fallocate) ||
+ lo->lo_encrypt_key_size) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ ret = file->f_op->fallocate(file, mode, pos,
+ bio->bi_size);
+ if (unlikely(ret && ret != -EINVAL &&
+ ret != -EOPNOTSUPP))
+ ret = -EIO;
+ goto out;
+ }
+
ret = lo_send(lo, bio, pos);
if ((bio->bi_rw & REQ_FUA) && !ret) {
@@ -514,7 +474,7 @@ static struct bio *loop_get_bio(struct loop_device *lo)
return bio_list_pop(&lo->lo_bio_list);
}
-static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+static void loop_make_request(struct request_queue *q, struct bio *old_bio)
{
struct loop_device *lo = q->queuedata;
int rw = bio_rw(old_bio);
@@ -532,12 +492,11 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio)
loop_add_bio(lo, old_bio);
wake_up(&lo->lo_event);
spin_unlock_irq(&lo->lo_lock);
- return 0;
+ return;
out:
spin_unlock_irq(&lo->lo_lock);
bio_io_error(old_bio);
- return 0;
}
struct switch_request {
@@ -700,7 +659,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
goto out_putf;
fput(old_file);
- if (max_part > 0)
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN)
ioctl_by_bdev(bdev, BLKRRPART, 0);
return 0;
@@ -777,16 +736,25 @@ static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
return sprintf(buf, "%s\n", autoclear ? "1" : "0");
}
+static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
+{
+ int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);
+
+ return sprintf(buf, "%s\n", partscan ? "1" : "0");
+}
+
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
+LOOP_ATTR_RO(partscan);
static struct attribute *loop_attrs[] = {
&loop_attr_backing_file.attr,
&loop_attr_offset.attr,
&loop_attr_sizelimit.attr,
&loop_attr_autoclear.attr,
+ &loop_attr_partscan.attr,
NULL,
};
@@ -807,6 +775,35 @@ static void loop_sysfs_exit(struct loop_device *lo)
&loop_attribute_group);
}
+static void loop_config_discard(struct loop_device *lo)
+{
+ struct file *file = lo->lo_backing_file;
+ struct inode *inode = file->f_mapping->host;
+ struct request_queue *q = lo->lo_queue;
+
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ if ((!file->f_op->fallocate) ||
+ lo->lo_encrypt_key_size) {
+ q->limits.discard_granularity = 0;
+ q->limits.discard_alignment = 0;
+ q->limits.max_discard_sectors = 0;
+ q->limits.discard_zeroes_data = 0;
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ return;
+ }
+
+ q->limits.discard_granularity = inode->i_sb->s_blocksize;
+ q->limits.discard_alignment = 0;
+ q->limits.max_discard_sectors = UINT_MAX >> 9;
+ q->limits.discard_zeroes_data = 1;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+}
+
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
@@ -849,35 +846,23 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
mapping = file->f_mapping;
inode = mapping->host;
- if (!(file->f_mode & FMODE_WRITE))
- lo_flags |= LO_FLAGS_READ_ONLY;
-
error = -EINVAL;
- if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- const struct address_space_operations *aops = mapping->a_ops;
-
- if (aops->write_begin)
- lo_flags |= LO_FLAGS_USE_AOPS;
- if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
- lo_flags |= LO_FLAGS_READ_ONLY;
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ goto out_putf;
- lo_blocksize = S_ISBLK(inode->i_mode) ?
- inode->i_bdev->bd_block_size : PAGE_SIZE;
+ if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
+ !file->f_op->write)
+ lo_flags |= LO_FLAGS_READ_ONLY;
- error = 0;
- } else {
- goto out_putf;
- }
+ lo_blocksize = S_ISBLK(inode->i_mode) ?
+ inode->i_bdev->bd_block_size : PAGE_SIZE;
+ error = -EFBIG;
size = get_loop_size(lo, file);
-
- if ((loff_t)(sector_t)size != size) {
- error = -EFBIG;
+ if ((loff_t)(sector_t)size != size)
goto out_putf;
- }
- if (!(mode & FMODE_WRITE))
- lo_flags |= LO_FLAGS_READ_ONLY;
+ error = 0;
set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
@@ -919,7 +904,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
}
lo->lo_state = Lo_bound;
wake_up_process(lo->lo_thread);
- if (max_part > 0)
+ if (part_shift)
+ lo->lo_flags |= LO_FLAGS_PARTSCAN;
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN)
ioctl_by_bdev(bdev, BLKRRPART, 0);
return 0;
@@ -980,10 +967,11 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
return err;
}
-static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
+static int loop_clr_fd(struct loop_device *lo)
{
struct file *filp = lo->lo_backing_file;
gfp_t gfp = lo->old_gfp_mask;
+ struct block_device *bdev = lo->lo_device;
if (lo->lo_state != Lo_bound)
return -ENXIO;
@@ -1012,7 +1000,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
lo->lo_encrypt_key_size = 0;
- lo->lo_flags = 0;
lo->lo_thread = NULL;
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
@@ -1030,8 +1017,11 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
- if (max_part > 0 && bdev)
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
ioctl_by_bdev(bdev, BLKRRPART, 0);
+ lo->lo_flags = 0;
+ if (!part_shift)
+ lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
mutex_unlock(&lo->lo_ctl_mutex);
/*
* Need not hold lo_ctl_mutex to fput backing file.
@@ -1080,11 +1070,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
if (lo->lo_offset != info->lo_offset ||
lo->lo_sizelimit != info->lo_sizelimit) {
- lo->lo_offset = info->lo_offset;
- lo->lo_sizelimit = info->lo_sizelimit;
- if (figure_loop_size(lo))
+ if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit))
return -EFBIG;
}
+ loop_config_discard(lo);
memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
@@ -1100,6 +1089,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
(info->lo_flags & LO_FLAGS_AUTOCLEAR))
lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
+ if ((info->lo_flags & LO_FLAGS_PARTSCAN) &&
+ !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
+ lo->lo_flags |= LO_FLAGS_PARTSCAN;
+ lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
+ ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+ }
+
lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
lo->lo_init[0] = info->lo_init[0];
lo->lo_init[1] = info->lo_init[1];
@@ -1260,7 +1256,7 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
err = -ENXIO;
if (unlikely(lo->lo_state != Lo_bound))
goto out;
- err = figure_loop_size(lo);
+ err = figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
if (unlikely(err))
goto out;
sec = get_capacity(lo->lo_disk);
@@ -1293,18 +1289,24 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
break;
case LOOP_CLR_FD:
/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
- err = loop_clr_fd(lo, bdev);
+ err = loop_clr_fd(lo);
if (!err)
goto out_unlocked;
break;
case LOOP_SET_STATUS:
- err = loop_set_status_old(lo, (struct loop_info __user *) arg);
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_status_old(lo,
+ (struct loop_info __user *)arg);
break;
case LOOP_GET_STATUS:
err = loop_get_status_old(lo, (struct loop_info __user *) arg);
break;
case LOOP_SET_STATUS64:
- err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_status64(lo,
+ (struct loop_info64 __user *) arg);
break;
case LOOP_GET_STATUS64:
err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
@@ -1513,7 +1515,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
* In autoclear mode, stop the loop thread
* and remove configuration after last close.
*/
- err = loop_clr_fd(lo, NULL);
+ err = loop_clr_fd(lo);
if (!err)
goto out_unlocked;
} else {
@@ -1595,14 +1597,12 @@ static int loop_add(struct loop_device **l, int i)
struct gendisk *disk;
int err;
+ err = -ENOMEM;
lo = kzalloc(sizeof(*lo), GFP_KERNEL);
- if (!lo) {
- err = -ENOMEM;
+ if (!lo)
goto out;
- }
- err = idr_pre_get(&loop_index_idr, GFP_KERNEL);
- if (err < 0)
+ if (!idr_pre_get(&loop_index_idr, GFP_KERNEL))
goto out_free_dev;
if (i >= 0) {
@@ -1635,6 +1635,27 @@ static int loop_add(struct loop_device **l, int i)
if (!disk)
goto out_free_queue;
+ /*
+ * Disable partition scanning by default. The in-kernel partition
+ * scanning can be requested individually per-device during its
+ * setup. Userspace can always add and remove partitions from all
+ * devices. The needed partition minors are allocated from the
+ * extended minor space, the main loop device numbers will continue
+ * to match the loop minors, regardless of the number of partitions
+ * used.
+ *
+ * If max_part is given, partition scanning is globally enabled for
+ * all loop devices. The minors for the main loop devices will be
+ * multiples of max_part.
+ *
+ * Note: Global-for-all-devices, set-only-at-init, read-only module
+ * parameteters like 'max_loop' and 'max_part' make things needlessly
+ * complicated, are too static, inflexible and may surprise
+ * userspace tools. Parameters like this in general should be avoided.
+ */
+ if (!part_shift)
+ disk->flags |= GENHD_FL_NO_PART_SCAN;
+ disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
lo->lo_number = i;
lo->lo_thread = NULL;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 76fa3deaee84..1788f491e0fb 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -780,9 +780,9 @@ static const struct block_device_operations mg_disk_ops = {
.getgeo = mg_getgeo
};
-static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
+static int mg_suspend(struct device *dev)
{
- struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+ struct mg_drv_data *prv_data = dev->platform_data;
struct mg_host *host = prv_data->host;
if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
@@ -804,9 +804,9 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
return 0;
}
-static int mg_resume(struct platform_device *plat_dev)
+static int mg_resume(struct device *dev)
{
- struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+ struct mg_drv_data *prv_data = dev->platform_data;
struct mg_host *host = prv_data->host;
if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
@@ -825,6 +825,8 @@ static int mg_resume(struct platform_device *plat_dev)
return 0;
}
+static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
+
static int mg_probe(struct platform_device *plat_dev)
{
struct mg_host *host;
@@ -1074,11 +1076,10 @@ static int mg_remove(struct platform_device *plat_dev)
static struct platform_driver mg_disk_driver = {
.probe = mg_probe,
.remove = mg_remove,
- .suspend = mg_suspend,
- .resume = mg_resume,
.driver = {
.name = MG_DEV_NAME,
.owner = THIS_MODULE,
+ .pm = &mg_pm,
}
};
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 000000000000..0ba837fc62a8
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,9 @@
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+ tristate "Block Device Driver for Micron PCIe SSDs"
+ depends on PCI
+ help
+ This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 000000000000..4fbef8c8329b
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 000000000000..f946d31d6917
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,4266 @@
+/*
+ * Driver for the Micron P320 SSD
+ * Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ * Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
+#define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
+#define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
+#define HW_PORT_PRIV_DMA_SZ \
+ (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
+
+#define HOST_CAP_NZDMA (1 << 19)
+#define HOST_HSORG 0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV 0xFF00
+#define HSORG_STYLE 0x8
+#define HSORG_SLOTGROUPS 0x7
+
+#define PORT_COMMAND_ISSUE 0x38
+#define PORT_SDBV 0x7C
+
+#define PORT_OFFSET 0x100
+#define PORT_MEM_SIZE 0x80
+
+#define PORT_IRQ_ERR \
+ (PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+ PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+ PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+ PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+ (PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+ (PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+ PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+ PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+ (PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN 0x00
+#define MTIP_PRODUCT_ASICFPGA 0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+static struct dentry *dfs_parent;
+
+static DEFINE_SPINLOCK(rssd_index_lock);
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+ __u8 io_ports[8];
+ __u8 hob_ports[8];
+ ide_reg_valid_t out_flags;
+ ide_reg_valid_t in_flags;
+ int data_phase;
+ int req_cmd;
+ compat_ulong_t out_size;
+ compat_ulong_t in_size;
+};
+#endif
+
+/*
+ * This function check_for_surprise_removal is called
+ * while card is removed from the system and it will
+ * read the vendor id from the configration space
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ * true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct pci_dev *pdev)
+{
+ u16 vendor_id = 0;
+
+ /* Read the vendorID from the configuration space */
+ pci_read_config_word(pdev, 0x00, &vendor_id);
+ if (vendor_id == 0xFFFF)
+ return true; /* device removed */
+
+ return false; /* device present */
+}
+
+/*
+ * This function is called for clean the pending command in the
+ * command slot during the surprise removal of device and return
+ * error to the upper layer.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_command_cleanup(struct driver_data *dd)
+{
+ int group = 0, commandslot = 0, commandindex = 0;
+ struct mtip_cmd *command;
+ struct mtip_port *port = dd->port;
+ static int in_progress;
+
+ if (in_progress)
+ return;
+
+ in_progress = 1;
+
+ for (group = 0; group < 4; group++) {
+ for (commandslot = 0; commandslot < 32; commandslot++) {
+ if (!(port->allocated[group] & (1 << commandslot)))
+ continue;
+
+ commandindex = group << 5 | commandslot;
+ command = &port->commands[commandindex];
+
+ if (atomic_read(&command->active)
+ && (command->async_callback)) {
+ command->async_callback(command->async_data,
+ -ENODEV);
+ command->async_callback = NULL;
+ command->async_data = NULL;
+ }
+
+ dma_unmap_sg(&port->dd->pdev->dev,
+ command->sg,
+ command->scatter_ents,
+ command->direction);
+ }
+ }
+
+ up(&port->cmd_slot);
+
+ set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+ in_progress = 0;
+}
+
+/*
+ * Obtain an empty command slot.
+ *
+ * This function needs to be reentrant since it could be called
+ * at the same time on multiple CPUs. The allocation of the
+ * command slot must be atomic.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * >= 0 Index of command slot obtained.
+ * -1 No command slots available.
+ */
+static int get_slot(struct mtip_port *port)
+{
+ int slot, i;
+ unsigned int num_command_slots = port->dd->slot_groups * 32;
+
+ /*
+ * Try 10 times, because there is a small race here.
+ * that's ok, because it's still cheaper than a lock.
+ *
+ * Race: Since this section is not protected by lock, same bit
+ * could be chosen by different process contexts running in
+ * different processor. So instead of costly lock, we are going
+ * with loop.
+ */
+ for (i = 0; i < 10; i++) {
+ slot = find_next_zero_bit(port->allocated,
+ num_command_slots, 1);
+ if ((slot < num_command_slots) &&
+ (!test_and_set_bit(slot, port->allocated)))
+ return slot;
+ }
+ dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
+
+ if (mtip_check_surprise_removal(port->dd->pdev)) {
+ /* Device not present, clean outstanding commands */
+ mtip_command_cleanup(port->dd);
+ }
+ return -1;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of command to release
+ *
+ * return value
+ * None
+ */
+static inline void release_slot(struct mtip_port *port, int tag)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(tag, port->allocated);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * Just like hba_reset, except does not call sleep, so can be
+ * run from interrupt/tasklet context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 The reset was successful.
+ * -1 The HBA Reset bit did not clear.
+ */
+static int hba_reset_nosleep(struct driver_data *dd)
+{
+ unsigned long timeout;
+
+ /* Chip quirk: quiesce any chip function */
+ mdelay(10);
+
+ /* Set the reset bit */
+ writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+ /* Flush */
+ readl(dd->mmio + HOST_CTL);
+
+ /*
+ * Wait 10ms then spin for up to 1 second
+ * waiting for reset acknowledgement
+ */
+ timeout = jiffies + msecs_to_jiffies(1000);
+ mdelay(10);
+ while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+ && time_before(jiffies, timeout))
+ mdelay(1);
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+ return -1;
+
+ if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag The tag of the command to be issued.
+ *
+ * return value
+ * None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+ atomic_set(&port->commands[tag].active, 1);
+
+ spin_lock(&port->cmd_issue_lock);
+
+ writel((1 << MTIP_TAG_BIT(tag)),
+ port->s_active[MTIP_TAG_INDEX(tag)]);
+ writel((1 << MTIP_TAG_BIT(tag)),
+ port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+
+ spin_unlock(&port->cmd_issue_lock);
+
+ /* Set the command's timeout value.*/
+ port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
+ MTIP_NCQ_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Enable/disable the reception of FIS
+ *
+ * @port Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ * Previous state: 1 enabled, 0 disabled
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+ u32 tmp;
+
+ /* enable FIS reception */
+ tmp = readl(port->mmio + PORT_CMD);
+ if (enable)
+ writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+ else
+ writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+
+ /* Flush */
+ readl(port->mmio + PORT_CMD);
+
+ return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX));
+}
+
+/*
+ * Enable/disable the DMA engine
+ *
+ * @port Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ * Previous state: 1 enabled, 0 disabled.
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+ u32 tmp;
+
+ /* enable FIS reception */
+ tmp = readl(port->mmio + PORT_CMD);
+ if (enable)
+ writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD);
+ else
+ writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD);
+
+ readl(port->mmio + PORT_CMD);
+ return (((tmp & PORT_CMD_START) == PORT_CMD_START));
+}
+
+/*
+ * Enables the port DMA engine and FIS reception.
+ *
+ * return value
+ * None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+ /* Enable FIS reception */
+ mtip_enable_fis(port, 1);
+
+ /* Enable the DMA engine */
+ mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ * None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+ /* Disable interrupts on this port */
+ writel(0, port->mmio + PORT_IRQ_MASK);
+
+ /* Disable the DMA engine */
+ mtip_enable_engine(port, 0);
+
+ /* Disable FIS reception */
+ mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+ int i;
+ mtip_deinit_port(port);
+
+ /* Program the command list base and FIS base addresses */
+ if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+ writel((port->command_list_dma >> 16) >> 16,
+ port->mmio + PORT_LST_ADDR_HI);
+ writel((port->rxfis_dma >> 16) >> 16,
+ port->mmio + PORT_FIS_ADDR_HI);
+ }
+
+ writel(port->command_list_dma & 0xFFFFFFFF,
+ port->mmio + PORT_LST_ADDR);
+ writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+ /* Clear SError */
+ writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+ /* reset the completed registers.*/
+ for (i = 0; i < port->dd->slot_groups; i++)
+ writel(0xFFFFFFFF, port->completed[i]);
+
+ /* Clear any pending interrupts for this port */
+ writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
+
+ /* Clear any pending interrupts on the HBA. */
+ writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+ port->dd->mmio + HOST_IRQ_STAT);
+
+ /* Enable port interrupts */
+ writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+ unsigned long timeout;
+
+ /* Disable the DMA engine */
+ mtip_enable_engine(port, 0);
+
+ /* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */
+ timeout = jiffies + msecs_to_jiffies(500);
+ while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+ && time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ /*
+ * Chip quirk: escalate to hba reset if
+ * PxCMD.CR not clear after 500 ms
+ */
+ if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+ dev_warn(&port->dd->pdev->dev,
+ "PxCMD.CR not clear, escalating reset\n");
+
+ if (hba_reset_nosleep(port->dd))
+ dev_err(&port->dd->pdev->dev,
+ "HBA reset escalation failed.\n");
+
+ /* 30 ms delay before com reset to quiesce chip */
+ mdelay(30);
+ }
+
+ dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+ /* Set PxSCTL.DET */
+ writel(readl(port->mmio + PORT_SCR_CTL) |
+ 1, port->mmio + PORT_SCR_CTL);
+ readl(port->mmio + PORT_SCR_CTL);
+
+ /* Wait 1 ms to quiesce chip function */
+ timeout = jiffies + msecs_to_jiffies(1);
+ while (time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ /* Clear PxSCTL.DET */
+ writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+ port->mmio + PORT_SCR_CTL);
+ readl(port->mmio + PORT_SCR_CTL);
+
+ /* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */
+ timeout = jiffies + msecs_to_jiffies(500);
+ while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+ && time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+ dev_warn(&port->dd->pdev->dev,
+ "COM reset failed\n");
+
+ mtip_init_port(port);
+ mtip_start_port(port);
+
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+ char *msg,
+ unsigned long *tagbits,
+ int cnt)
+{
+ unsigned char tagmap[128];
+ int group, tagmap_len = 0;
+
+ memset(tagmap, 0, sizeof(tagmap));
+ for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+ tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ",
+ tagbits[group-1]);
+ dev_warn(&dd->pdev->dev,
+ "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
+}
+
+/*
+ * Called periodically to see if any read/write commands are
+ * taking too long to complete.
+ *
+ * @data Pointer to the PORT data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_timeout_function(unsigned long int data)
+{
+ struct mtip_port *port = (struct mtip_port *) data;
+ struct host_to_dev_fis *fis;
+ struct mtip_cmd *command;
+ int tag, cmdto_cnt = 0;
+ unsigned int bit, group;
+ unsigned int num_command_slots = port->dd->slot_groups * 32;
+ unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
+
+ if (unlikely(!port))
+ return;
+
+ if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
+ mod_timer(&port->cmd_timer,
+ jiffies + msecs_to_jiffies(30000));
+ return;
+ }
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ for (tag = 0; tag < num_command_slots; tag++) {
+ /*
+ * Skip internal command slot as it has
+ * its own timeout mechanism
+ */
+ if (tag == MTIP_TAG_INTERNAL)
+ continue;
+
+ if (atomic_read(&port->commands[tag].active) &&
+ (time_after(jiffies, port->commands[tag].comp_time))) {
+ group = tag >> 5;
+ bit = tag & 0x1F;
+
+ command = &port->commands[tag];
+ fis = (struct host_to_dev_fis *) command->command;
+
+ set_bit(tag, tagaccum);
+ cmdto_cnt++;
+ if (cmdto_cnt == 1)
+ set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+ /*
+ * Clear the completed bit. This should prevent
+ * any interrupt handlers from trying to retire
+ * the command.
+ */
+ writel(1 << bit, port->completed[group]);
+
+ /* Call the async completion callback. */
+ if (likely(command->async_callback))
+ command->async_callback(command->async_data,
+ -EIO);
+ command->async_callback = NULL;
+ command->comp_func = NULL;
+
+ /* Unmap the DMA scatter list entries */
+ dma_unmap_sg(&port->dd->pdev->dev,
+ command->sg,
+ command->scatter_ents,
+ command->direction);
+
+ /*
+ * Clear the allocated bit and active tag for the
+ * command.
+ */
+ atomic_set(&port->commands[tag].active, 0);
+ release_slot(port, tag);
+
+ up(&port->cmd_slot);
+ }
+ }
+
+ if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+ print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
+
+ mtip_restart_port(port);
+ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ }
+
+ if (port->ic_pause_timer) {
+ to = port->ic_pause_timer + msecs_to_jiffies(1000);
+ if (time_after(jiffies, to)) {
+ if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+ port->ic_pause_timer = 0;
+ clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ }
+
+
+ }
+ }
+
+ /* Restart the timer */
+ mod_timer(&port->cmd_timer,
+ jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of the command.
+ * @data Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ * None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status)
+{
+ struct mtip_cmd *command;
+ struct driver_data *dd = data;
+ int cb_status = status ? -EIO : 0;
+
+ if (unlikely(!dd) || unlikely(!port))
+ return;
+
+ command = &port->commands[tag];
+
+ if (unlikely(status == PORT_IRQ_TF_ERR)) {
+ dev_warn(&port->dd->pdev->dev,
+ "Command tag %d failed due to TFE\n", tag);
+ }
+
+ /* Upper layer callback */
+ if (likely(command->async_callback))
+ command->async_callback(command->async_data, cb_status);
+
+ command->async_callback = NULL;
+ command->comp_func = NULL;
+
+ /* Unmap the DMA scatter list entries */
+ dma_unmap_sg(&dd->pdev->dev,
+ command->sg,
+ command->scatter_ents,
+ command->direction);
+
+ /* Clear the allocated and active bits for the command */
+ atomic_set(&port->commands[tag].active, 0);
+ release_slot(port, tag);
+
+ up(&port->cmd_slot);
+}
+
+/*
+ * Internal command completion callback function.
+ *
+ * This function is normally called by the driver ISR when an internal
+ * command completed. This function signals the command completion by
+ * calling complete().
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of the command that has completed.
+ * @data Pointer to a completion structure.
+ * @status Completion status.
+ *
+ * return value
+ * None
+ */
+static void mtip_completion(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status)
+{
+ struct mtip_cmd *command = &port->commands[tag];
+ struct completion *waiting = data;
+ if (unlikely(status == PORT_IRQ_TF_ERR))
+ dev_warn(&port->dd->pdev->dev,
+ "Internal command %d completed with TFE\n", tag);
+
+ command->async_callback = NULL;
+ command->comp_func = NULL;
+
+ complete(waiting);
+}
+
+static void mtip_null_completion(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status)
+{
+ return;
+}
+
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+ dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+ struct smart_attr *attrib);
+/*
+ * Handle an error.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+ int group, tag, bit, reissue, rv;
+ struct mtip_port *port;
+ struct mtip_cmd *cmd;
+ u32 completed;
+ struct host_to_dev_fis *fis;
+ unsigned long tagaccum[SLOTBITS_IN_LONGS];
+ unsigned int cmd_cnt = 0;
+ unsigned char *buf;
+ char *fail_reason = NULL;
+ int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+
+ dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+ port = dd->port;
+
+ /* Stop the timer to prevent command timeouts. */
+ del_timer(&port->cmd_timer);
+ set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+ if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+ test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+ cmd = &port->commands[MTIP_TAG_INTERNAL];
+ dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
+
+ atomic_inc(&cmd->active); /* active > 1 indicates error */
+ if (cmd->comp_data && cmd->comp_func) {
+ cmd->comp_func(port, MTIP_TAG_INTERNAL,
+ cmd->comp_data, PORT_IRQ_TF_ERR);
+ }
+ goto handle_tfe_exit;
+ }
+
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ /* Loop through all the groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ completed = readl(port->completed[group]);
+
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
+
+ /* Process successfully completed commands */
+ for (bit = 0; bit < 32 && completed; bit++) {
+ if (!(completed & (1<<bit)))
+ continue;
+ tag = (group << 5) + bit;
+
+ /* Skip the internal command slot */
+ if (tag == MTIP_TAG_INTERNAL)
+ continue;
+
+ cmd = &port->commands[tag];
+ if (likely(cmd->comp_func)) {
+ set_bit(tag, tagaccum);
+ cmd_cnt++;
+ atomic_set(&cmd->active, 0);
+ cmd->comp_func(port,
+ tag,
+ cmd->comp_data,
+ 0);
+ } else {
+ dev_err(&port->dd->pdev->dev,
+ "Missing completion func for tag %d",
+ tag);
+ if (mtip_check_surprise_removal(dd->pdev)) {
+ mtip_command_cleanup(dd);
+ /* don't proceed further */
+ return;
+ }
+ }
+ }
+ }
+
+ print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
+
+ /* Restart the port */
+ mdelay(20);
+ mtip_restart_port(port);
+
+ /* Trying to determine the cause of the error */
+ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+ dd->port->log_buf,
+ dd->port->log_buf_dma, 1);
+ if (rv) {
+ dev_warn(&dd->pdev->dev,
+ "Error in READ LOG EXT (10h) command\n");
+ /* non-critical error, don't fail the load */
+ } else {
+ buf = (unsigned char *)dd->port->log_buf;
+ if (buf[259] & 0x1) {
+ dev_info(&dd->pdev->dev,
+ "Write protect bit is set.\n");
+ set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+ fail_all_ncq_write = 1;
+ fail_reason = "write protect";
+ }
+ if (buf[288] == 0xF7) {
+ dev_info(&dd->pdev->dev,
+ "Exceeded Tmax, drive in thermal shutdown.\n");
+ set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+ fail_all_ncq_cmds = 1;
+ fail_reason = "thermal shutdown";
+ }
+ if (buf[288] == 0xBF) {
+ dev_info(&dd->pdev->dev,
+ "Drive indicates rebuild has failed.\n");
+ fail_all_ncq_cmds = 1;
+ fail_reason = "rebuild failed";
+ }
+ }
+
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ /* Loop through all the groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ for (bit = 0; bit < 32; bit++) {
+ reissue = 1;
+ tag = (group << 5) + bit;
+ cmd = &port->commands[tag];
+
+ /* If the active bit is set re-issue the command */
+ if (atomic_read(&cmd->active) == 0)
+ continue;
+
+ fis = (struct host_to_dev_fis *)cmd->command;
+
+ /* Should re-issue? */
+ if (tag == MTIP_TAG_INTERNAL ||
+ fis->command == ATA_CMD_SET_FEATURES)
+ reissue = 0;
+ else {
+ if (fail_all_ncq_cmds ||
+ (fail_all_ncq_write &&
+ fis->command == ATA_CMD_FPDMA_WRITE)) {
+ dev_warn(&dd->pdev->dev,
+ " Fail: %s w/tag %d [%s].\n",
+ fis->command == ATA_CMD_FPDMA_WRITE ?
+ "write" : "read",
+ tag,
+ fail_reason != NULL ?
+ fail_reason : "unknown");
+ atomic_set(&cmd->active, 0);
+ if (cmd->comp_func) {
+ cmd->comp_func(port, tag,
+ cmd->comp_data,
+ -ENODATA);
+ }
+ continue;
+ }
+ }
+
+ /*
+ * First check if this command has
+ * exceeded its retries.
+ */
+ if (reissue && (cmd->retries-- > 0)) {
+
+ set_bit(tag, tagaccum);
+
+ /* Re-issue the command. */
+ mtip_issue_ncq_command(port, tag);
+
+ continue;
+ }
+
+ /* Retire a command that will not be reissued */
+ dev_warn(&port->dd->pdev->dev,
+ "retiring tag %d\n", tag);
+ atomic_set(&cmd->active, 0);
+
+ if (cmd->comp_func)
+ cmd->comp_func(
+ port,
+ tag,
+ cmd->comp_data,
+ PORT_IRQ_TF_ERR);
+ else
+ dev_warn(&port->dd->pdev->dev,
+ "Bad completion for tag %d\n",
+ tag);
+ }
+ }
+ print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
+
+handle_tfe_exit:
+ /* clear eh_active */
+ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+
+ mod_timer(&port->cmd_timer,
+ jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * Handle a set device bits interrupt
+ */
+static inline void mtip_process_sdbf(struct driver_data *dd)
+{
+ struct mtip_port *port = dd->port;
+ int group, tag, bit;
+ u32 completed;
+ struct mtip_cmd *command;
+
+ /* walk all bits in all slot groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ completed = readl(port->completed[group]);
+ if (!completed)
+ continue;
+
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
+
+ /* Process completed commands. */
+ for (bit = 0;
+ (bit < 32) && completed;
+ bit++, completed >>= 1) {
+ if (completed & 0x01) {
+ tag = (group << 5) | bit;
+
+ /* skip internal command slot. */
+ if (unlikely(tag == MTIP_TAG_INTERNAL))
+ continue;
+
+ command = &port->commands[tag];
+ /* make internal callback */
+ if (likely(command->comp_func)) {
+ command->comp_func(
+ port,
+ tag,
+ command->comp_data,
+ 0);
+ } else {
+ dev_warn(&dd->pdev->dev,
+ "Null completion "
+ "for tag %d",
+ tag);
+
+ if (mtip_check_surprise_removal(
+ dd->pdev)) {
+ mtip_command_cleanup(dd);
+ return;
+ }
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Process legacy pio and d2h interrupts
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+ struct mtip_port *port = dd->port;
+ struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL];
+
+ if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+ (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL))) {
+ if (cmd->comp_func) {
+ cmd->comp_func(port,
+ MTIP_TAG_INTERNAL,
+ cmd->comp_data,
+ 0);
+ return;
+ }
+ }
+
+ return;
+}
+
+/*
+ * Demux and handle errors
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+ if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)))
+ mtip_handle_tfe(dd);
+
+ if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+ dev_warn(&dd->pdev->dev,
+ "Clearing PxSERR.DIAG.x\n");
+ writel((1 << 26), dd->port->mmio + PORT_SCR_ERR);
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+ dev_warn(&dd->pdev->dev,
+ "Clearing PxSERR.DIAG.n\n");
+ writel((1 << 16), dd->port->mmio + PORT_SCR_ERR);
+ }
+
+ if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) {
+ dev_warn(&dd->pdev->dev,
+ "Port stat errors %x unhandled\n",
+ (port_stat & ~PORT_IRQ_HANDLED));
+ }
+}
+
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+ struct driver_data *dd = (struct driver_data *) data;
+ struct mtip_port *port = dd->port;
+ u32 hba_stat, port_stat;
+ int rv = IRQ_NONE;
+
+ hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+ if (hba_stat) {
+ rv = IRQ_HANDLED;
+
+ /* Acknowledge the interrupt status on the port.*/
+ port_stat = readl(port->mmio + PORT_IRQ_STAT);
+ writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+ /* Demux port status */
+ if (likely(port_stat & PORT_IRQ_SDB_FIS))
+ mtip_process_sdbf(dd);
+
+ if (unlikely(port_stat & PORT_IRQ_ERR)) {
+ if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+ mtip_command_cleanup(dd);
+ /* don't proceed further */
+ return IRQ_HANDLED;
+ }
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag))
+ return rv;
+
+ mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_LEGACY))
+ mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+ }
+
+ /* acknowledge interrupt */
+ writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+ return rv;
+}
+
+/*
+ * Wrapper for mtip_handle_irq
+ * (ignores return code)
+ */
+static void mtip_tasklet(unsigned long data)
+{
+ mtip_handle_irq((struct driver_data *) data);
+}
+
+/*
+ * HBA interrupt subroutine.
+ *
+ * @irq IRQ number.
+ * @instance Pointer to the driver data structure.
+ *
+ * return value
+ * IRQ_HANDLED A HBA interrupt was pending and handled.
+ * IRQ_NONE This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+ struct driver_data *dd = instance;
+ tasklet_schedule(&dd->tasklet);
+ return IRQ_HANDLED;
+}
+
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+ atomic_set(&port->commands[tag].active, 1);
+ writel(1 << MTIP_TAG_BIT(tag),
+ port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+static bool mtip_pause_ncq(struct mtip_port *port,
+ struct host_to_dev_fis *fis)
+{
+ struct host_to_dev_fis *reply;
+ unsigned long task_file_data;
+
+ reply = port->rxfis + RX_FIS_D2H_REG;
+ task_file_data = readl(port->mmio+PORT_TFDATA);
+
+ if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+ clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+ if ((task_file_data & 1))
+ return false;
+
+ if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+ set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+ port->ic_pause_timer = jiffies;
+ return true;
+ } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+ (fis->features == 0x03)) {
+ set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = jiffies;
+ return true;
+ } else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+ ((fis->command == 0xFC) &&
+ (fis->features == 0x27 || fis->features == 0x72 ||
+ fis->features == 0x62 || fis->features == 0x26))) {
+ /* Com reset after secure erase or lowlevel format */
+ mtip_restart_port(port);
+ return false;
+ }
+
+ return false;
+}
+
+/*
+ * Wait for port to quiesce
+ *
+ * @port Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ *
+ * return value
+ * 0 Success
+ * -EBUSY Commands still active
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+ unsigned long to;
+ unsigned int n;
+ unsigned int active = 1;
+
+ to = jiffies + msecs_to_jiffies(timeout);
+ do {
+ if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+ test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+ msleep(20);
+ continue; /* svc thd is actively issuing commands */
+ }
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return -EFAULT;
+ /*
+ * Ignore s_active bit 0 of array element 0.
+ * This bit will always be set
+ */
+ active = readl(port->s_active[0]) & 0xFFFFFFFE;
+ for (n = 1; n < port->dd->slot_groups; n++)
+ active |= readl(port->s_active[n]);
+
+ if (!active)
+ break;
+
+ msleep(20);
+ } while (time_before(jiffies, to));
+
+ return active ? -EBUSY : 0;
+}
+
+/*
+ * Execute an internal command and wait for the completion.
+ *
+ * @port Pointer to the port data structure.
+ * @fis Pointer to the FIS that describes the command.
+ * @fis_len Length in WORDS of the FIS.
+ * @buffer DMA accessible for command data.
+ * @buf_len Length, in bytes, of the data buffer.
+ * @opts Command header options, excluding the FIS length
+ * and the number of PRD entries.
+ * @timeout Time in ms to wait for the command to complete.
+ *
+ * return value
+ * 0 Command completed successfully.
+ * -EFAULT The buffer address is not correctly aligned.
+ * -EBUSY Internal command or other IO in progress.
+ * -EAGAIN Time out waiting for command to complete.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+ struct host_to_dev_fis *fis,
+ int fis_len,
+ dma_addr_t buffer,
+ int buf_len,
+ u32 opts,
+ gfp_t atomic,
+ unsigned long timeout)
+{
+ struct mtip_cmd_sg *command_sg;
+ DECLARE_COMPLETION_ONSTACK(wait);
+ int rv = 0, ready2go = 1;
+ struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
+ unsigned long to;
+
+ /* Make sure the buffer is 8 byte aligned. This is asic specific. */
+ if (buffer & 0x00000007) {
+ dev_err(&port->dd->pdev->dev,
+ "SG buffer is not 8 byte aligned\n");
+ return -EFAULT;
+ }
+
+ to = jiffies + msecs_to_jiffies(timeout);
+ do {
+ ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL,
+ port->allocated);
+ if (ready2go)
+ break;
+ mdelay(100);
+ } while (time_before(jiffies, to));
+ if (!ready2go) {
+ dev_warn(&port->dd->pdev->dev,
+ "Internal cmd active. new cmd [%02X]\n", fis->command);
+ return -EBUSY;
+ }
+ set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = 0;
+
+ if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+ clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ else if (fis->command == ATA_CMD_DOWNLOAD_MICRO)
+ clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+
+ if (atomic == GFP_KERNEL) {
+ if (fis->command != ATA_CMD_STANDBYNOW1) {
+ /* wait for io to complete if non atomic */
+ if (mtip_quiesce_io(port, 5000) < 0) {
+ dev_warn(&port->dd->pdev->dev,
+ "Failed to quiesce IO\n");
+ release_slot(port, MTIP_TAG_INTERNAL);
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ return -EBUSY;
+ }
+ }
+
+ /* Set the completion function and data for the command. */
+ int_cmd->comp_data = &wait;
+ int_cmd->comp_func = mtip_completion;
+
+ } else {
+ /* Clear completion - we're going to poll */
+ int_cmd->comp_data = NULL;
+ int_cmd->comp_func = mtip_null_completion;
+ }
+
+ /* Copy the command to the command table */
+ memcpy(int_cmd->command, fis, fis_len*4);
+
+ /* Populate the SG list */
+ int_cmd->command_header->opts =
+ __force_bit2int cpu_to_le32(opts | fis_len);
+ if (buf_len) {
+ command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+ command_sg->info =
+ __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF);
+ command_sg->dba =
+ __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF);
+ command_sg->dba_upper =
+ __force_bit2int cpu_to_le32((buffer >> 16) >> 16);
+
+ int_cmd->command_header->opts |=
+ __force_bit2int cpu_to_le32((1 << 16));
+ }
+
+ /* Populate the command header */
+ int_cmd->command_header->byte_count = 0;
+
+ /* Issue the command to the hardware */
+ mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL);
+
+ /* Poll if atomic, wait_for_completion otherwise */
+ if (atomic == GFP_KERNEL) {
+ /* Wait for the command to complete or timeout. */
+ if (wait_for_completion_timeout(
+ &wait,
+ msecs_to_jiffies(timeout)) == 0) {
+ dev_err(&port->dd->pdev->dev,
+ "Internal command did not complete [%d] "
+ "within timeout of %lu ms\n",
+ atomic, timeout);
+ if (mtip_check_surprise_removal(port->dd->pdev) ||
+ test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ rv = -EAGAIN;
+ }
+ } else {
+ /* Spin for <timeout> checking if command still outstanding */
+ timeout = jiffies + msecs_to_jiffies(timeout);
+ while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL))
+ && time_before(jiffies, timeout)) {
+ if (mtip_check_surprise_removal(port->dd->pdev)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ if ((fis->command != ATA_CMD_STANDBYNOW1) &&
+ test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ if (readl(port->mmio + PORT_IRQ_STAT) & PORT_IRQ_ERR) {
+ atomic_inc(&int_cmd->active); /* error */
+ break;
+ }
+ }
+ }
+
+ if (atomic_read(&int_cmd->active) > 1) {
+ dev_err(&port->dd->pdev->dev,
+ "Internal command [%02X] failed\n", fis->command);
+ rv = -EIO;
+ }
+ if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL)) {
+ rv = -ENXIO;
+ if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ mtip_restart_port(port);
+ rv = -EAGAIN;
+ }
+ }
+exec_ic_exit:
+ /* Clear the allocated and active bits for the internal command. */
+ atomic_set(&int_cmd->active, 0);
+ release_slot(port, MTIP_TAG_INTERNAL);
+ if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+ /* NCQ paused */
+ return rv;
+ }
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+
+ return rv;
+}
+
+/*
+ * Byte-swap ATA ID strings.
+ *
+ * ATA identify data contains strings in byte-swapped 16-bit words.
+ * They must be swapped (on all architectures) to be usable as C strings.
+ * This function swaps bytes in-place.
+ *
+ * @buf The buffer location of the string
+ * @len The number of bytes to swap
+ *
+ * return value
+ * None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+ int i;
+ for (i = 0; i < (len/2); i++)
+ be16_to_cpus(&buf[i]);
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port Pointer to the port structure.
+ * @user_buffer A user space buffer where the identify data should be
+ * copied.
+ *
+ * return value
+ * 0 Command completed successfully.
+ * -EFAULT An error occurred while coping data to the user buffer.
+ * -1 Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+ int rv = 0;
+ struct host_to_dev_fis fis;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return -EFAULT;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_ID_ATA;
+
+ /* Set the identify information as invalid. */
+ port->identify_valid = 0;
+
+ /* Clear the identify information. */
+ memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ port->identify_dma,
+ sizeof(u16) * ATA_ID_WORDS,
+ 0,
+ GFP_KERNEL,
+ MTIP_INTERNAL_COMMAND_TIMEOUT_MS)
+ < 0) {
+ rv = -1;
+ goto out;
+ }
+
+ /*
+ * Perform any necessary byte-swapping. Yes, the kernel does in fact
+ * perform field-sensitive swapping on the string fields.
+ * See the kernel use of ata_id_string() for proof of this.
+ */
+#ifdef __LITTLE_ENDIAN
+ ata_swap_string(port->identify + 27, 40); /* model string*/
+ ata_swap_string(port->identify + 23, 8); /* firmware string*/
+ ata_swap_string(port->identify + 10, 20); /* serial# string*/
+#else
+ {
+ int i;
+ for (i = 0; i < ATA_ID_WORDS; i++)
+ port->identify[i] = le16_to_cpu(port->identify[i]);
+ }
+#endif
+
+ /* Set the identify buffer as valid. */
+ port->identify_valid = 1;
+
+ if (user_buffer) {
+ if (copy_to_user(
+ user_buffer,
+ port->identify,
+ ATA_ID_WORDS * sizeof(u16))) {
+ rv = -EFAULT;
+ goto out;
+ }
+ }
+
+out:
+ return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ * 0 Command was executed successfully.
+ * -1 An error occurred while executing the command.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+ int rv;
+ struct host_to_dev_fis fis;
+ unsigned long start;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_STANDBYNOW1;
+
+ start = jiffies;
+ rv = mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ 0,
+ 0,
+ 0,
+ GFP_ATOMIC,
+ 15000);
+ dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+ jiffies_to_msecs(jiffies - start));
+ if (rv)
+ dev_warn(&port->dd->pdev->dev,
+ "STANDBY IMMEDIATE command failed.\n");
+
+ return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ *
+ * @port pointer to the port structure.
+ * @page page number to fetch
+ * @buffer pointer to buffer
+ * @buffer_dma dma address corresponding to @buffer
+ * @sectors page length to fetch, in sectors
+ *
+ * return value
+ * @rv return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+ dma_addr_t buffer_dma, unsigned int sectors)
+{
+ struct host_to_dev_fis fis;
+
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_READ_LOG_EXT;
+ fis.sect_count = sectors & 0xFF;
+ fis.sect_cnt_ex = (sectors >> 8) & 0xFF;
+ fis.lba_low = page;
+ fis.lba_mid = 0;
+ fis.device = ATA_DEVICE_OBS;
+
+ memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+ return mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ buffer_dma,
+ sectors * ATA_SECT_SIZE,
+ 0,
+ GFP_ATOMIC,
+ MTIP_INTERNAL_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ *
+ * @port pointer to the port structure.
+ * @buffer pointer to buffer
+ * @buffer_dma dma address corresponding to @buffer
+ *
+ * return value
+ * @rv return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+ dma_addr_t buffer_dma)
+{
+ struct host_to_dev_fis fis;
+
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_SMART;
+ fis.features = 0xD0;
+ fis.sect_count = 1;
+ fis.lba_mid = 0x4F;
+ fis.lba_hi = 0xC2;
+ fis.device = ATA_DEVICE_OBS;
+
+ return mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ buffer_dma,
+ ATA_SECT_SIZE,
+ 0,
+ GFP_ATOMIC,
+ 15000);
+}
+
+/*
+ * Get the value of a smart attribute
+ *
+ * @port pointer to the port structure
+ * @id attribute number
+ * @attrib pointer to return attrib information corresponding to @id
+ *
+ * return value
+ * -EINVAL NULL buffer passed or unsupported attribute @id.
+ * -EPERM Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+ struct smart_attr *attrib)
+{
+ int rv, i;
+ struct smart_attr *pattr;
+
+ if (!attrib)
+ return -EINVAL;
+
+ if (!port->identify_valid) {
+ dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+ return -EPERM;
+ }
+ if (!(port->identify[82] & 0x1)) {
+ dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+ return -EPERM;
+ }
+ if (!(port->identify[85] & 0x1)) {
+ dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+ return -EPERM;
+ }
+
+ memset(port->smart_buf, 0, ATA_SECT_SIZE);
+ rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+ if (rv) {
+ dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n");
+ return rv;
+ }
+
+ pattr = (struct smart_attr *)(port->smart_buf + 2);
+ for (i = 0; i < 29; i++, pattr++)
+ if (pattr->attr_id == id) {
+ memcpy(attrib, pattr, sizeof(struct smart_attr));
+ break;
+ }
+
+ if (i == 29) {
+ dev_warn(&port->dd->pdev->dev,
+ "Query for invalid SMART attribute ID\n");
+ rv = -EINVAL;
+ }
+
+ return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd Pointer to the device data structure.
+ * @sectors Pointer to the variable that will receive the sector count.
+ *
+ * return value
+ * 1 Capacity was returned successfully.
+ * 0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+ struct mtip_port *port = dd->port;
+ u64 total, raw0, raw1, raw2, raw3;
+ raw0 = port->identify[100];
+ raw1 = port->identify[101];
+ raw2 = port->identify[102];
+ raw3 = port->identify[103];
+ total = raw0 | raw1<<16 | raw2<<32 | raw3<<48;
+ *sectors = total;
+ return (bool) !!port->identify_valid;
+}
+
+/*
+ * Reset the HBA.
+ *
+ * Resets the HBA by setting the HBA Reset bit in the Global
+ * HBA Control register. After setting the HBA Reset bit the
+ * function waits for 1 second before reading the HBA Reset
+ * bit to make sure it has cleared. If HBA Reset is not clear
+ * an error is returned. Cannot be used in non-blockable
+ * context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 The reset was successful.
+ * -1 The HBA Reset bit did not clear.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+ mtip_deinit_port(dd->port);
+
+ /* Set the reset bit */
+ writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+ /* Flush */
+ readl(dd->mmio + HOST_CTL);
+
+ /* Wait for reset to clear */
+ ssleep(1);
+
+ /* Check the bit has cleared */
+ if (readl(dd->mmio + HOST_CTL) & HOST_RESET) {
+ dev_err(&dd->pdev->dev,
+ "Reset bit did not clear.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Display the identify command data.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+ sector_t sectors;
+ unsigned short revid;
+ char cbuf[42];
+
+ if (!port->identify_valid)
+ return;
+
+ strlcpy(cbuf, (char *)(port->identify+10), 21);
+ dev_info(&port->dd->pdev->dev,
+ "Serial No.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+23), 9);
+ dev_info(&port->dd->pdev->dev,
+ "Firmware Ver.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+27), 41);
+ dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+ if (mtip_hw_get_capacity(port->dd, &sectors))
+ dev_info(&port->dd->pdev->dev,
+ "Capacity: %llu sectors (%llu MB)\n",
+ (u64)sectors,
+ ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+ pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+ switch (revid & 0xFF) {
+ case 0x1:
+ strlcpy(cbuf, "A0", 3);
+ break;
+ case 0x3:
+ strlcpy(cbuf, "A2", 3);
+ break;
+ default:
+ strlcpy(cbuf, "?", 2);
+ break;
+ }
+ dev_info(&port->dd->pdev->dev,
+ "Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ * None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+ struct mtip_cmd *command,
+ int nents)
+{
+ int n;
+ unsigned int dma_len;
+ struct mtip_cmd_sg *command_sg;
+ struct scatterlist *sg = command->sg;
+
+ command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+ for (n = 0; n < nents; n++) {
+ dma_len = sg_dma_len(sg);
+ if (dma_len > 0x400000)
+ dev_err(&dd->pdev->dev,
+ "DMA segment length truncated\n");
+ command_sg->info = __force_bit2int
+ cpu_to_le32((dma_len-1) & 0x3FFFFF);
+ command_sg->dba = __force_bit2int
+ cpu_to_le32(sg_dma_address(sg));
+ command_sg->dba_upper = __force_bit2int
+ cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+ command_sg++;
+ sg++;
+ }
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[1];
+ fis.sect_count = command[2];
+ fis.sector = command[3];
+ fis.cyl_low = command[4];
+ fis.cyl_hi = command[5];
+ fis.device = command[6] & ~0x10; /* Clear the dev bit*/
+
+ dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3],
+ command[4],
+ command[5],
+ command[6]);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ 0,
+ 0,
+ 0,
+ GFP_KERNEL,
+ MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) {
+ return -1;
+ }
+
+ command[0] = reply->command; /* Status*/
+ command[1] = reply->features; /* Error*/
+ command[4] = reply->cyl_low;
+ command[5] = reply->cyl_hi;
+
+ dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[4],
+ command[5]);
+
+ return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ * data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT An error occurred while copying the completion
+ * data to the user space buffer.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+ void __user *user_buffer)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply;
+ u8 *buf = NULL;
+ dma_addr_t dma_addr = 0;
+ int rv = 0, xfer_sz = command[3];
+
+ if (xfer_sz) {
+ if (!user_buffer)
+ return -EFAULT;
+
+ buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+ ATA_SECT_SIZE * xfer_sz,
+ &dma_addr,
+ GFP_KERNEL);
+ if (!buf) {
+ dev_err(&port->dd->pdev->dev,
+ "Memory allocation failed (%d bytes)\n",
+ ATA_SECT_SIZE * xfer_sz);
+ return -ENOMEM;
+ }
+ memset(buf, 0, ATA_SECT_SIZE * xfer_sz);
+ }
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[2];
+ fis.sect_count = command[3];
+ if (fis.command == ATA_CMD_SMART) {
+ fis.sector = command[1];
+ fis.cyl_low = 0x4F;
+ fis.cyl_hi = 0xC2;
+ }
+
+ if (xfer_sz)
+ reply = (port->rxfis + RX_FIS_PIO_SETUP);
+ else
+ reply = (port->rxfis + RX_FIS_D2H_REG);
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: User Command: cmd %x, sect %x, "
+ "feat %x, sectcnt %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3]);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ (xfer_sz ? dma_addr : 0),
+ (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
+ 0,
+ GFP_KERNEL,
+ MTIP_IOCTL_COMMAND_TIMEOUT_MS)
+ < 0) {
+ rv = -EFAULT;
+ goto exit_drive_command;
+ }
+
+ /* Collect the completion status. */
+ command[0] = reply->command; /* Status*/
+ command[1] = reply->features; /* Error*/
+ command[2] = reply->sect_count;
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: Completion Status: stat %x, "
+ "err %x, nsect %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2]);
+
+ if (xfer_sz) {
+ if (copy_to_user(user_buffer,
+ buf,
+ ATA_SECT_SIZE * command[3])) {
+ rv = -EFAULT;
+ goto exit_drive_command;
+ }
+ }
+exit_drive_command:
+ if (buf)
+ dmam_free_coherent(&port->dd->pdev->dev,
+ ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
+ return rv;
+}
+
+/*
+ * Indicates whether a command has a single sector payload.
+ *
+ * @command passed to the device to perform the certain event.
+ * @features passed to the device to perform the certain event.
+ *
+ * return value
+ * 1 command is one that always has a single sector payload,
+ * regardless of the value in the Sector Count field.
+ * 0 otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+ unsigned char features)
+{
+ unsigned int rv = 0;
+
+ /* list of commands that have an implicit sector count of 1 */
+ switch (command) {
+ case ATA_CMD_SEC_SET_PASS:
+ case ATA_CMD_SEC_UNLOCK:
+ case ATA_CMD_SEC_ERASE_PREP:
+ case ATA_CMD_SEC_ERASE_UNIT:
+ case ATA_CMD_SEC_FREEZE_LOCK:
+ case ATA_CMD_SEC_DISABLE_PASS:
+ case ATA_CMD_PMP_READ:
+ case ATA_CMD_PMP_WRITE:
+ rv = 1;
+ break;
+ case ATA_CMD_SET_MAX:
+ if (features == ATA_SET_MAX_UNLOCK)
+ rv = 1;
+ break;
+ case ATA_CMD_SMART:
+ if ((features == ATA_SMART_READ_VALUES) ||
+ (features == ATA_SMART_READ_THRESHOLDS))
+ rv = 1;
+ break;
+ case ATA_CMD_CONF_OVERLAY:
+ if ((features == ATA_DCO_IDENTIFY) ||
+ (features == ATA_DCO_SET))
+ rv = 1;
+ break;
+ }
+ return rv;
+}
+
+static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout)
+{
+ switch (fis->command) {
+ case ATA_CMD_DOWNLOAD_MICRO:
+ *timeout = 120000; /* 2 minutes */
+ break;
+ case ATA_CMD_SEC_ERASE_UNIT:
+ case 0xFC:
+ *timeout = 240000; /* 4 minutes */
+ break;
+ case ATA_CMD_STANDBYNOW1:
+ *timeout = 120000; /* 2 minutes */
+ break;
+ case 0xF7:
+ case 0xFA:
+ *timeout = 60000; /* 60 seconds */
+ break;
+ case ATA_CMD_SMART:
+ *timeout = 15000; /* 15 seconds */
+ break;
+ default:
+ *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+ break;
+ }
+}
+
+/*
+ * Executes a taskfile
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+ void __user *buf,
+ ide_task_request_t *req_task,
+ int outtotal)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply;
+ u8 *outbuf = NULL;
+ u8 *inbuf = NULL;
+ dma_addr_t outbuf_dma = 0;
+ dma_addr_t inbuf_dma = 0;
+ dma_addr_t dma_buffer = 0;
+ int err = 0;
+ unsigned int taskin = 0;
+ unsigned int taskout = 0;
+ u8 nsect = 0;
+ unsigned int timeout;
+ unsigned int force_single_sector;
+ unsigned int transfer_size;
+ unsigned long task_file_data;
+ int intotal = outtotal + req_task->out_size;
+
+ taskout = req_task->out_size;
+ taskin = req_task->in_size;
+ /* 130560 = 512 * 0xFF*/
+ if (taskin > 130560 || taskout > 130560) {
+ err = -EINVAL;
+ goto abort;
+ }
+
+ if (taskout) {
+ outbuf = kzalloc(taskout, GFP_KERNEL);
+ if (outbuf == NULL) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ if (copy_from_user(outbuf, buf + outtotal, taskout)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ outbuf_dma = pci_map_single(dd->pdev,
+ outbuf,
+ taskout,
+ DMA_TO_DEVICE);
+ if (outbuf_dma == 0) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ dma_buffer = outbuf_dma;
+ }
+
+ if (taskin) {
+ inbuf = kzalloc(taskin, GFP_KERNEL);
+ if (inbuf == NULL) {
+ err = -ENOMEM;
+ goto abort;
+ }
+
+ if (copy_from_user(inbuf, buf + intotal, taskin)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ inbuf_dma = pci_map_single(dd->pdev,
+ inbuf,
+ taskin, DMA_FROM_DEVICE);
+ if (inbuf_dma == 0) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ dma_buffer = inbuf_dma;
+ }
+
+ /* only supports PIO and non-data commands from this ioctl. */
+ switch (req_task->data_phase) {
+ case TASKFILE_OUT:
+ nsect = taskout / ATA_SECT_SIZE;
+ reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+ break;
+ case TASKFILE_IN:
+ reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+ break;
+ case TASKFILE_NO_DATA:
+ reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+ break;
+ default:
+ err = -EINVAL;
+ goto abort;
+ }
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = req_task->io_ports[7];
+ fis.features = req_task->io_ports[1];
+ fis.sect_count = req_task->io_ports[2];
+ fis.lba_low = req_task->io_ports[3];
+ fis.lba_mid = req_task->io_ports[4];
+ fis.lba_hi = req_task->io_ports[5];
+ /* Clear the dev bit*/
+ fis.device = req_task->io_ports[6] & ~0x10;
+
+ if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+ req_task->in_flags.all =
+ IDE_TASKFILE_STD_IN_FLAGS |
+ (IDE_HOB_STD_IN_FLAGS << 8);
+ fis.lba_low_ex = req_task->hob_ports[3];
+ fis.lba_mid_ex = req_task->hob_ports[4];
+ fis.lba_hi_ex = req_task->hob_ports[5];
+ fis.features_ex = req_task->hob_ports[1];
+ fis.sect_cnt_ex = req_task->hob_ports[2];
+
+ } else {
+ req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+ }
+
+ force_single_sector = implicit_sector(fis.command, fis.features);
+
+ if ((taskin || taskout) && (!fis.sect_count)) {
+ if (nsect)
+ fis.sect_count = nsect;
+ else {
+ if (!force_single_sector) {
+ dev_warn(&dd->pdev->dev,
+ "data movement but "
+ "sect_count is 0\n");
+ err = -EINVAL;
+ goto abort;
+ }
+ }
+ }
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: cmd %x, feat %x, nsect %x,"
+ " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+ " head/dev %x\n",
+ __func__,
+ fis.command,
+ fis.features,
+ fis.sect_count,
+ fis.lba_low,
+ fis.lba_mid,
+ fis.lba_hi,
+ fis.device);
+
+ mtip_set_timeout(&fis, &timeout);
+
+ /* Determine the correct transfer size.*/
+ if (force_single_sector)
+ transfer_size = ATA_SECT_SIZE;
+ else
+ transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+ /* Execute the command.*/
+ if (mtip_exec_internal_command(dd->port,
+ &fis,
+ 5,
+ dma_buffer,
+ transfer_size,
+ 0,
+ GFP_KERNEL,
+ timeout) < 0) {
+ err = -EIO;
+ goto abort;
+ }
+
+ task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+ if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+ reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+ req_task->io_ports[7] = reply->control;
+ } else {
+ reply = dd->port->rxfis + RX_FIS_D2H_REG;
+ req_task->io_ports[7] = reply->command;
+ }
+
+ /* reclaim the DMA buffers.*/
+ if (inbuf_dma)
+ pci_unmap_single(dd->pdev, inbuf_dma,
+ taskin, DMA_FROM_DEVICE);
+ if (outbuf_dma)
+ pci_unmap_single(dd->pdev, outbuf_dma,
+ taskout, DMA_TO_DEVICE);
+ inbuf_dma = 0;
+ outbuf_dma = 0;
+
+ /* return the ATA registers to the caller.*/
+ req_task->io_ports[1] = reply->features;
+ req_task->io_ports[2] = reply->sect_count;
+ req_task->io_ports[3] = reply->lba_low;
+ req_task->io_ports[4] = reply->lba_mid;
+ req_task->io_ports[5] = reply->lba_hi;
+ req_task->io_ports[6] = reply->device;
+
+ if (req_task->out_flags.all & 1) {
+
+ req_task->hob_ports[3] = reply->lba_low_ex;
+ req_task->hob_ports[4] = reply->lba_mid_ex;
+ req_task->hob_ports[5] = reply->lba_hi_ex;
+ req_task->hob_ports[1] = reply->features_ex;
+ req_task->hob_ports[2] = reply->sect_cnt_ex;
+ }
+ dbg_printk(MTIP_DRV_NAME
+ " %s: Completion: stat %x,"
+ "err %x, sect_cnt %x, lbalo %x,"
+ "lbamid %x, lbahi %x, dev %x\n",
+ __func__,
+ req_task->io_ports[7],
+ req_task->io_ports[1],
+ req_task->io_ports[2],
+ req_task->io_ports[3],
+ req_task->io_ports[4],
+ req_task->io_ports[5],
+ req_task->io_ports[6]);
+
+ if (taskout) {
+ if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ }
+ if (taskin) {
+ if (copy_to_user(buf + intotal, inbuf, taskin)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ }
+abort:
+ if (inbuf_dma)
+ pci_unmap_single(dd->pdev, inbuf_dma,
+ taskin, DMA_FROM_DEVICE);
+ if (outbuf_dma)
+ pci_unmap_single(dd->pdev, outbuf_dma,
+ taskout, DMA_TO_DEVICE);
+ kfree(outbuf);
+ kfree(inbuf);
+
+ return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -ENOTTY.
+ *
+ * @dd Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ * 0 The IOCTL completed successfully.
+ * -ENOTTY The specified command is not supported.
+ * -EFAULT An error occurred copying data to a user space buffer.
+ * -EIO An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case HDIO_GET_IDENTITY:
+ {
+ if (copy_to_user((void __user *)arg, dd->port->identify,
+ sizeof(u16) * ATA_ID_WORDS))
+ return -EFAULT;
+ break;
+ }
+ case HDIO_DRIVE_CMD:
+ {
+ u8 drive_command[4];
+
+ /* Copy the user command info to our buffer. */
+ if (copy_from_user(drive_command,
+ (void __user *) arg,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ /* Execute the drive command. */
+ if (exec_drive_command(dd->port,
+ drive_command,
+ (void __user *) (arg+4)))
+ return -EIO;
+
+ /* Copy the status back to the users buffer. */
+ if (copy_to_user((void __user *) arg,
+ drive_command,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ break;
+ }
+ case HDIO_DRIVE_TASK:
+ {
+ u8 drive_command[7];
+
+ /* Copy the user command info to our buffer. */
+ if (copy_from_user(drive_command,
+ (void __user *) arg,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ /* Execute the drive command. */
+ if (exec_drive_task(dd->port, drive_command))
+ return -EIO;
+
+ /* Copy the status back to the users buffer. */
+ if (copy_to_user((void __user *) arg,
+ drive_command,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ break;
+ }
+ case HDIO_DRIVE_TASKFILE: {
+ ide_task_request_t req_task;
+ int ret, outtotal;
+
+ if (copy_from_user(&req_task, (void __user *) arg,
+ sizeof(req_task)))
+ return -EFAULT;
+
+ outtotal = sizeof(req_task);
+
+ ret = exec_drive_taskfile(dd, (void __user *) arg,
+ &req_task, outtotal);
+
+ if (copy_to_user((void __user *) arg, &req_task,
+ sizeof(req_task)))
+ return -EFAULT;
+
+ return ret;
+ }
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * This function is called by the block layer to issue an io
+ * to the device. Upon completion, the callback function will
+ * be called with the data parameter passed as the callback data.
+ *
+ * @dd Pointer to the driver data structure.
+ * @start First sector to read.
+ * @nsect Number of sectors to read.
+ * @nents Number of entries in scatter list for the read command.
+ * @tag The tag of this read command.
+ * @callback Pointer to the function that should be called
+ * when the read completes.
+ * @data Callback data passed to the callback function
+ * when the read completes.
+ * @dir Direction (read or write)
+ *
+ * return value
+ * None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
+ int nsect, int nents, int tag, void *callback,
+ void *data, int dir)
+{
+ struct host_to_dev_fis *fis;
+ struct mtip_port *port = dd->port;
+ struct mtip_cmd *command = &port->commands[tag];
+ int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+ /* Map the scatter list for DMA access */
+ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+
+ command->scatter_ents = nents;
+
+ /*
+ * The number of retries for this command before it is
+ * reported as a failure to the upper layers.
+ */
+ command->retries = MTIP_MAX_RETRIES;
+
+ /* Fill out fis */
+ fis = command->command;
+ fis->type = 0x27;
+ fis->opts = 1 << 7;
+ fis->command =
+ (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE);
+ *((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF);
+ *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF);
+ fis->device = 1 << 6;
+ fis->features = nsect & 0xFF;
+ fis->features_ex = (nsect >> 8) & 0xFF;
+ fis->sect_count = ((tag << 3) | (tag >> 5));
+ fis->sect_cnt_ex = 0;
+ fis->control = 0;
+ fis->res2 = 0;
+ fis->res3 = 0;
+ fill_command_sg(dd, command, nents);
+
+ /* Populate the command header */
+ command->command_header->opts =
+ __force_bit2int cpu_to_le32(
+ (nents << 16) | 5 | AHCI_CMD_PREFETCH);
+ command->command_header->byte_count = 0;
+
+ /*
+ * Set the completion function and data for the command
+ * within this layer.
+ */
+ command->comp_data = dd;
+ command->comp_func = mtip_async_complete;
+ command->direction = dma_dir;
+
+ /*
+ * Set the completion function and data for the command passed
+ * from the upper layer.
+ */
+ command->async_data = data;
+ command->async_callback = callback;
+
+ /*
+ * To prevent this command from being issued
+ * if an internal command is in progress or error handling is active.
+ */
+ if (port->flags & MTIP_PF_PAUSE_IO) {
+ set_bit(tag, port->cmds_to_issue);
+ set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+ return;
+ }
+
+ /* Issue the command to the hardware */
+ mtip_issue_ncq_command(port, tag);
+
+ return;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @dd Pointer to the driver data structure.
+ * @tag Slot tag
+ *
+ * return value
+ * None
+ */
+static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
+{
+ release_slot(dd->port, tag);
+}
+
+/*
+ * Obtain a command slot and return its associated scatter list.
+ *
+ * @dd Pointer to the driver data structure.
+ * @tag Pointer to an int that will receive the allocated command
+ * slot tag.
+ *
+ * return value
+ * Pointer to the scatter list for the allocated command slot
+ * or NULL if no command slots are available.
+ */
+static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
+ int *tag)
+{
+ /*
+ * It is possible that, even with this semaphore, a thread
+ * may think that no command slots are available. Therefore, we
+ * need to make an attempt to get_slot().
+ */
+ down(&dd->port->cmd_slot);
+ *tag = get_slot(dd->port);
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+ up(&dd->port->cmd_slot);
+ return NULL;
+ }
+ if (unlikely(*tag < 0)) {
+ up(&dd->port->cmd_slot);
+ return NULL;
+ }
+
+ return dd->port->commands[*tag].sg;
+}
+
+/*
+ * Sysfs status dump.
+ *
+ * @dev Pointer to the device structure, passed by the kernrel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf Pointer to the char buffer that will receive the stats info.
+ *
+ * return value
+ * The size, in bytes, of the data copied into buf.
+ */
+static ssize_t mtip_hw_show_status(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct driver_data *dd = dev_to_disk(dev)->private_data;
+ int size = 0;
+
+ if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+ size += sprintf(buf, "%s", "thermal_shutdown\n");
+ else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+ size += sprintf(buf, "%s", "write_protect\n");
+ else
+ size += sprintf(buf, "%s", "online\n");
+
+ return size;
+}
+
+static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+
+static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
+ size_t len, loff_t *offset)
+{
+ struct driver_data *dd = (struct driver_data *)f->private_data;
+ char buf[MTIP_DFS_MAX_BUF_SIZE];
+ u32 group_allocated;
+ int size = *offset;
+ int n;
+
+ if (!len || size)
+ return 0;
+
+ size += sprintf(&buf[size], "H/ S ACTive : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--)
+ size += sprintf(&buf[size], "%08X ",
+ readl(dd->port->s_active[n]));
+
+ size += sprintf(&buf[size], "]\n");
+ size += sprintf(&buf[size], "H/ Command Issue : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--)
+ size += sprintf(&buf[size], "%08X ",
+ readl(dd->port->cmd_issue[n]));
+
+ size += sprintf(&buf[size], "]\n");
+ size += sprintf(&buf[size], "H/ Completed : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--)
+ size += sprintf(&buf[size], "%08X ",
+ readl(dd->port->completed[n]));
+
+ size += sprintf(&buf[size], "]\n");
+ size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n",
+ readl(dd->port->mmio + PORT_IRQ_STAT));
+ size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n",
+ readl(dd->mmio + HOST_IRQ_STAT));
+ size += sprintf(&buf[size], "\n");
+
+ size += sprintf(&buf[size], "L/ Allocated : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--) {
+ if (sizeof(long) > sizeof(u32))
+ group_allocated =
+ dd->port->allocated[n/2] >> (32*(n&1));
+ else
+ group_allocated = dd->port->allocated[n];
+ size += sprintf(&buf[size], "%08X ", group_allocated);
+ }
+ size += sprintf(&buf[size], "]\n");
+
+ size += sprintf(&buf[size], "L/ Commands in Q : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--) {
+ if (sizeof(long) > sizeof(u32))
+ group_allocated =
+ dd->port->cmds_to_issue[n/2] >> (32*(n&1));
+ else
+ group_allocated = dd->port->cmds_to_issue[n];
+ size += sprintf(&buf[size], "%08X ", group_allocated);
+ }
+ size += sprintf(&buf[size], "]\n");
+
+ *offset = size <= len ? size : len;
+ size = copy_to_user(ubuf, buf, *offset);
+ if (size)
+ return -EFAULT;
+
+ return *offset;
+}
+
+static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
+ size_t len, loff_t *offset)
+{
+ struct driver_data *dd = (struct driver_data *)f->private_data;
+ char buf[MTIP_DFS_MAX_BUF_SIZE];
+ int size = *offset;
+
+ if (!len || size)
+ return 0;
+
+ size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
+ dd->port->flags);
+ size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n",
+ dd->dd_flag);
+
+ *offset = size <= len ? size : len;
+ size = copy_to_user(ubuf, buf, *offset);
+ if (size)
+ return -EFAULT;
+
+ return *offset;
+}
+
+static const struct file_operations mtip_regs_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = mtip_hw_read_registers,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations mtip_flags_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = mtip_hw_read_flags,
+ .llseek = no_llseek,
+};
+
+/*
+ * Create the sysfs related attributes.
+ *
+ * @dd Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
+{
+ if (!kobj || !dd)
+ return -EINVAL;
+
+ if (sysfs_create_file(kobj, &dev_attr_status.attr))
+ dev_warn(&dd->pdev->dev,
+ "Error creating 'status' sysfs entry\n");
+ return 0;
+}
+
+/*
+ * Remove the sysfs related attributes.
+ *
+ * @dd Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
+{
+ if (!kobj || !dd)
+ return -EINVAL;
+
+ sysfs_remove_file(kobj, &dev_attr_status.attr);
+
+ return 0;
+}
+
+static int mtip_hw_debugfs_init(struct driver_data *dd)
+{
+ if (!dfs_parent)
+ return -1;
+
+ dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
+ if (IS_ERR_OR_NULL(dd->dfs_node)) {
+ dev_warn(&dd->pdev->dev,
+ "Error creating node %s under debugfs\n",
+ dd->disk->disk_name);
+ dd->dfs_node = NULL;
+ return -1;
+ }
+
+ debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
+ &mtip_flags_fops);
+ debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
+ &mtip_regs_fops);
+
+ return 0;
+}
+
+static void mtip_hw_debugfs_exit(struct driver_data *dd)
+{
+ debugfs_remove_recursive(dd->dfs_node);
+}
+
+
+/*
+ * Perform any init/resume time hardware setup
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+ u32 hwdata;
+ hwdata = readl(dd->mmio + HOST_HSORG);
+
+ /* interrupt bug workaround: use only 1 IS bit.*/
+ writel(hwdata |
+ HSORG_DISABLE_SLOTGRP_INTR |
+ HSORG_DISABLE_SLOTGRP_PXIS,
+ dd->mmio + HOST_HSORG);
+}
+
+/*
+ * Detect the details of the product, and store anything needed
+ * into the driver data structure. This includes product type and
+ * version and number of slot groups.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+ u32 hwdata;
+ unsigned int rev, slotgroups;
+
+ /*
+ * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+ * info register:
+ * [15:8] hardware/software interface rev#
+ * [ 3] asic-style interface
+ * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+ */
+ hwdata = readl(dd->mmio + HOST_HSORG);
+
+ dd->product_type = MTIP_PRODUCT_UNKNOWN;
+ dd->slot_groups = 1;
+
+ if (hwdata & 0x8) {
+ dd->product_type = MTIP_PRODUCT_ASICFPGA;
+ rev = (hwdata & HSORG_HWREV) >> 8;
+ slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1;
+ dev_info(&dd->pdev->dev,
+ "ASIC-FPGA design, HS rev 0x%x, "
+ "%i slot groups [%i slots]\n",
+ rev,
+ slotgroups,
+ slotgroups * 32);
+
+ if (slotgroups > MTIP_MAX_SLOT_GROUPS) {
+ dev_warn(&dd->pdev->dev,
+ "Warning: driver only supports "
+ "%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+ slotgroups = MTIP_MAX_SLOT_GROUPS;
+ }
+ dd->slot_groups = slotgroups;
+ return;
+ }
+
+ dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * 0 FTL rebuild completed successfully
+ * -EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+ unsigned long timeout, cnt = 0, start;
+
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild in progress. Polling for completion.\n");
+
+ start = jiffies;
+ timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+ do {
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)))
+ return -EFAULT;
+ if (mtip_check_surprise_removal(dd->pdev))
+ return -EFAULT;
+
+ if (mtip_get_identify(dd->port, NULL) < 0)
+ return -EFAULT;
+
+ if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+ MTIP_FTL_REBUILD_MAGIC) {
+ ssleep(1);
+ /* Print message every 3 minutes */
+ if (cnt++ >= 180) {
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild in progress (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ cnt = 0;
+ }
+ } else {
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild complete (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ mtip_block_initialize(dd);
+ return 0;
+ }
+ ssleep(10);
+ } while (time_before(jiffies, timeout));
+
+ /* Check for timeout */
+ dev_err(&dd->pdev->dev,
+ "Timed out waiting for FTL rebuild to complete (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ return -EFAULT;
+}
+
+/*
+ * service thread to issue queued commands
+ *
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+
+static int mtip_service_thread(void *data)
+{
+ struct driver_data *dd = (struct driver_data *)data;
+ unsigned long slot, slot_start, slot_wrap;
+ unsigned int num_cmd_slots = dd->slot_groups * 32;
+ struct mtip_port *port = dd->port;
+
+ while (1) {
+ /*
+ * the condition is to check neither an internal command is
+ * is in progress nor error handling is active
+ */
+ wait_event_interruptible(port->svc_wait, (port->flags) &&
+ !(port->flags & MTIP_PF_PAUSE_IO));
+
+ if (kthread_should_stop())
+ break;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)))
+ break;
+
+ set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+ if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+ slot = 1;
+ /* used to restrict the loop to one iteration */
+ slot_start = num_cmd_slots;
+ slot_wrap = 0;
+ while (1) {
+ slot = find_next_bit(port->cmds_to_issue,
+ num_cmd_slots, slot);
+ if (slot_wrap == 1) {
+ if ((slot_start >= slot) ||
+ (slot >= num_cmd_slots))
+ break;
+ }
+ if (unlikely(slot_start == num_cmd_slots))
+ slot_start = slot;
+
+ if (unlikely(slot == num_cmd_slots)) {
+ slot = 1;
+ slot_wrap = 1;
+ continue;
+ }
+
+ /* Issue the command to the hardware */
+ mtip_issue_ncq_command(port, slot);
+
+ clear_bit(slot, port->cmds_to_issue);
+ }
+
+ clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+ } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+ if (!mtip_ftl_rebuild_poll(dd))
+ set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
+ &dd->dd_flag);
+ clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
+ }
+ clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+ if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Called once for each card.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+ int i;
+ int rv;
+ unsigned int num_command_slots;
+ unsigned long timeout, timetaken;
+ unsigned char *buf;
+ struct smart_attr attr242;
+
+ dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+ mtip_detect_product(dd);
+ if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+ rv = -EIO;
+ goto out1;
+ }
+ num_command_slots = dd->slot_groups * 32;
+
+ hba_setup(dd);
+
+ tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd);
+
+ dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL);
+ if (!dd->port) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: port structure\n");
+ return -ENOMEM;
+ }
+
+ /* Counting semaphore to track command slot usage */
+ sema_init(&dd->port->cmd_slot, num_command_slots - 1);
+
+ /* Spinlock to prevent concurrent issue */
+ spin_lock_init(&dd->port->cmd_issue_lock);
+
+ /* Set the port mmio base address. */
+ dd->port->mmio = dd->mmio + PORT_OFFSET;
+ dd->port->dd = dd;
+
+ /* Allocate memory for the command list. */
+ dd->port->command_list =
+ dmam_alloc_coherent(&dd->pdev->dev,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+ &dd->port->command_list_dma,
+ GFP_KERNEL);
+ if (!dd->port->command_list) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: command list\n");
+ rv = -ENOMEM;
+ goto out1;
+ }
+
+ /* Clear the memory we have allocated. */
+ memset(dd->port->command_list,
+ 0,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
+
+ /* Setup the addresse of the RX FIS. */
+ dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ;
+ dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
+
+ /* Setup the address of the command tables. */
+ dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ;
+ dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
+
+ /* Setup the address of the identify data. */
+ dd->port->identify = dd->port->command_table +
+ HW_CMD_TBL_AR_SZ;
+ dd->port->identify_dma = dd->port->command_tbl_dma +
+ HW_CMD_TBL_AR_SZ;
+
+ /* Setup the address of the sector buffer - for some non-ncq cmds */
+ dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE;
+ dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
+
+ /* Setup the address of the log buf - for read log command */
+ dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE;
+ dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
+
+ /* Setup the address of the smart buf - for smart read data command */
+ dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE;
+ dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
+
+
+ /* Point the command headers at the command tables. */
+ for (i = 0; i < num_command_slots; i++) {
+ dd->port->commands[i].command_header =
+ dd->port->command_list +
+ (sizeof(struct mtip_cmd_hdr) * i);
+ dd->port->commands[i].command_header_dma =
+ dd->port->command_list_dma +
+ (sizeof(struct mtip_cmd_hdr) * i);
+
+ dd->port->commands[i].command =
+ dd->port->command_table + (HW_CMD_TBL_SZ * i);
+ dd->port->commands[i].command_dma =
+ dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
+
+ if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
+ dd->port->commands[i].command_header->ctbau =
+ __force_bit2int cpu_to_le32(
+ (dd->port->commands[i].command_dma >> 16) >> 16);
+ dd->port->commands[i].command_header->ctba =
+ __force_bit2int cpu_to_le32(
+ dd->port->commands[i].command_dma & 0xFFFFFFFF);
+
+ /*
+ * If this is not done, a bug is reported by the stock
+ * FC11 i386. Due to the fact that it has lots of kernel
+ * debugging enabled.
+ */
+ sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
+
+ /* Mark all commands as currently inactive.*/
+ atomic_set(&dd->port->commands[i].active, 0);
+ }
+
+ /* Setup the pointers to the extended s_active and CI registers. */
+ for (i = 0; i < dd->slot_groups; i++) {
+ dd->port->s_active[i] =
+ dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+ dd->port->cmd_issue[i] =
+ dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+ dd->port->completed[i] =
+ dd->port->mmio + i*0x80 + PORT_SDBV;
+ }
+
+ timetaken = jiffies;
+ timeout = jiffies + msecs_to_jiffies(30000);
+ while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+ time_before(jiffies, timeout)) {
+ mdelay(100);
+ }
+ if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+ timetaken = jiffies - timetaken;
+ dev_warn(&dd->pdev->dev,
+ "Surprise removal detected at %u ms\n",
+ jiffies_to_msecs(timetaken));
+ rv = -ENODEV;
+ goto out2 ;
+ }
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+ timetaken = jiffies - timetaken;
+ dev_warn(&dd->pdev->dev,
+ "Removal detected at %u ms\n",
+ jiffies_to_msecs(timetaken));
+ rv = -EFAULT;
+ goto out2;
+ }
+
+ /* Conditionally reset the HBA. */
+ if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+ if (mtip_hba_reset(dd) < 0) {
+ dev_err(&dd->pdev->dev,
+ "Card did not reset within timeout\n");
+ rv = -EIO;
+ goto out2;
+ }
+ } else {
+ /* Clear any pending interrupts on the HBA */
+ writel(readl(dd->mmio + HOST_IRQ_STAT),
+ dd->mmio + HOST_IRQ_STAT);
+ }
+
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Setup the ISR and enable interrupts. */
+ rv = devm_request_irq(&dd->pdev->dev,
+ dd->pdev->irq,
+ mtip_irq_handler,
+ IRQF_SHARED,
+ dev_driver_string(&dd->pdev->dev),
+ dd);
+
+ if (rv) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate IRQ %d\n", dd->pdev->irq);
+ goto out2;
+ }
+
+ /* Enable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ init_timer(&dd->port->cmd_timer);
+ init_waitqueue_head(&dd->port->svc_wait);
+
+ dd->port->cmd_timer.data = (unsigned long int) dd->port;
+ dd->port->cmd_timer.function = mtip_timeout_function;
+ mod_timer(&dd->port->cmd_timer,
+ jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+ rv = -EFAULT;
+ goto out3;
+ }
+
+ if (mtip_get_identify(dd->port, NULL) < 0) {
+ rv = -EFAULT;
+ goto out3;
+ }
+
+ if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+ MTIP_FTL_REBUILD_MAGIC) {
+ set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+ return MTIP_FTL_REBUILD_MAGIC;
+ }
+ mtip_dump_identify(dd->port);
+
+ /* check write protect, over temp and rebuild statuses */
+ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+ dd->port->log_buf,
+ dd->port->log_buf_dma, 1);
+ if (rv) {
+ dev_warn(&dd->pdev->dev,
+ "Error in READ LOG EXT (10h) command\n");
+ /* non-critical error, don't fail the load */
+ } else {
+ buf = (unsigned char *)dd->port->log_buf;
+ if (buf[259] & 0x1) {
+ dev_info(&dd->pdev->dev,
+ "Write protect bit is set.\n");
+ set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+ }
+ if (buf[288] == 0xF7) {
+ dev_info(&dd->pdev->dev,
+ "Exceeded Tmax, drive in thermal shutdown.\n");
+ set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+ }
+ if (buf[288] == 0xBF) {
+ dev_info(&dd->pdev->dev,
+ "Drive indicates rebuild has failed.\n");
+ /* TODO */
+ }
+ }
+
+ /* get write protect progess */
+ memset(&attr242, 0, sizeof(struct smart_attr));
+ if (mtip_get_smart_attr(dd->port, 242, &attr242))
+ dev_warn(&dd->pdev->dev,
+ "Unable to check write protect progress\n");
+ else
+ dev_info(&dd->pdev->dev,
+ "Write protect progress: %u%% (%u blocks)\n",
+ attr242.cur, le32_to_cpu(attr242.data));
+ return rv;
+
+out3:
+ del_timer_sync(&dd->port->cmd_timer);
+
+ /* Disable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ /*Release the IRQ. */
+ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+out2:
+ mtip_deinit_port(dd->port);
+
+ /* Free the command/command header memory. */
+ dmam_free_coherent(&dd->pdev->dev,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+ dd->port->command_list,
+ dd->port->command_list_dma);
+out1:
+ /* Free the memory allocated for the for structure. */
+ kfree(dd->port);
+
+ return rv;
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
+
+ if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
+ if (mtip_standby_immediate(dd->port))
+ dev_warn(&dd->pdev->dev,
+ "STANDBY IMMEDIATE failed\n");
+
+ /* de-initialize the port. */
+ mtip_deinit_port(dd->port);
+
+ /* Disable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ }
+
+ del_timer_sync(&dd->port->cmd_timer);
+
+ /* Release the IRQ. */
+ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+ /* Stop the bottom half tasklet. */
+ tasklet_kill(&dd->tasklet);
+
+ /* Free the command/command header memory. */
+ dmam_free_coherent(&dd->pdev->dev,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+ dd->port->command_list,
+ dd->port->command_list_dma);
+ /* Free the memory allocated for the for structure. */
+ kfree(dd->port);
+
+ return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ mtip_standby_immediate(dd->port);
+
+ return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 Suspend was successful
+ * -EFAULT Suspend was not successful
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive
+ * so that it saves its state.
+ */
+ if (mtip_standby_immediate(dd->port) != 0) {
+ dev_err(&dd->pdev->dev,
+ "Failed standby-immediate command\n");
+ return -EFAULT;
+ }
+
+ /* Disable interrupts on the HBA.*/
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ mtip_deinit_port(dd->port);
+
+ return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 Resume was successful
+ * -EFAULT Resume was not successful
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+ /* Perform any needed hardware setup steps */
+ hba_setup(dd);
+
+ /* Reset the HBA */
+ if (mtip_hba_reset(dd) != 0) {
+ dev_err(&dd->pdev->dev,
+ "Unable to reset the HBA\n");
+ return -EFAULT;
+ }
+
+ /*
+ * Enable the port, DMA engine, and FIS reception specific
+ * h/w in controller.
+ */
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Enable interrupts on the HBA.*/
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ return 0;
+}
+
+/*
+ * Helper function for reusing disk name
+ * upon hot insertion.
+ */
+static int rssd_disk_name_format(char *prefix,
+ int index,
+ char *buf,
+ int buflen)
+{
+ const int base = 'z' - 'a' + 1;
+ char *begin = buf + strlen(prefix);
+ char *end = buf + buflen;
+ char *p;
+ int unit;
+
+ p = end - 1;
+ *p = '\0';
+ unit = base;
+ do {
+ if (p == begin)
+ return -EINVAL;
+ *--p = 'a' + (index % unit);
+ index = (index / unit) - 1;
+ } while (index >= 0);
+
+ memmove(begin, p, end - p);
+ memcpy(buf, prefix, strlen(prefix));
+
+ return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ * 0 IOCTL completed successfully.
+ * -ENOTTY IOCTL not supported or invalid driver data
+ * structure pointer.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+ fmode_t mode,
+ unsigned cmd,
+ unsigned long arg)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ return -ENOTTY;
+ default:
+ return mtip_hw_ioctl(dd, cmd, arg);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ * 0 IOCTL completed successfully.
+ * -ENOTTY IOCTL not supported or invalid driver data
+ * structure pointer.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+ fmode_t mode,
+ unsigned cmd,
+ unsigned long arg)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ return -ENOTTY;
+ case HDIO_DRIVE_TASKFILE: {
+ struct mtip_compat_ide_task_request_s __user *compat_req_task;
+ ide_task_request_t req_task;
+ int compat_tasksize, outtotal, ret;
+
+ compat_tasksize =
+ sizeof(struct mtip_compat_ide_task_request_s);
+
+ compat_req_task =
+ (struct mtip_compat_ide_task_request_s __user *) arg;
+
+ if (copy_from_user(&req_task, (void __user *) arg,
+ compat_tasksize - (2 * sizeof(compat_long_t))))
+ return -EFAULT;
+
+ if (get_user(req_task.out_size, &compat_req_task->out_size))
+ return -EFAULT;
+
+ if (get_user(req_task.in_size, &compat_req_task->in_size))
+ return -EFAULT;
+
+ outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+ ret = exec_drive_taskfile(dd, (void __user *) arg,
+ &req_task, outtotal);
+
+ if (copy_to_user((void __user *) arg, &req_task,
+ compat_tasksize -
+ (2 * sizeof(compat_long_t))))
+ return -EFAULT;
+
+ if (put_user(req_task.out_size, &compat_req_task->out_size))
+ return -EFAULT;
+
+ if (put_user(req_task.in_size, &compat_req_task->in_size))
+ return -EFAULT;
+
+ return ret;
+ }
+ default:
+ return mtip_hw_ioctl(dd, cmd, arg);
+ }
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example still used CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affects performance.
+ *
+ * @dev Pointer to the block_device strucutre.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -ENOTTY An error occurred while reading the drive capacity.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+ struct hd_geometry *geo)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+ sector_t capacity;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (!(mtip_hw_get_capacity(dd, &capacity))) {
+ dev_warn(&dd->pdev->dev,
+ "Could not get drive capacity.\n");
+ return -ENOTTY;
+ }
+
+ geo->heads = 224;
+ geo->sectors = 56;
+ sector_div(capacity, (geo->heads * geo->sectors));
+ geo->cylinders = capacity;
+ return 0;
+}
+
+/*
+ * Block device operation function.
+ *
+ * This structure contains pointers to the functions required by the block
+ * layer.
+ */
+static const struct block_device_operations mtip_block_ops = {
+ .ioctl = mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = mtip_block_compat_ioctl,
+#endif
+ .getgeo = mtip_block_getgeo,
+ .owner = THIS_MODULE
+};
+
+/*
+ * Block layer make request function.
+ *
+ * This function is called by the kernel to process a BIO for
+ * the P320 device.
+ *
+ * @queue Pointer to the request queue. Unused other than to obtain
+ * the driver data structure.
+ * @bio Pointer to the BIO.
+ *
+ */
+static void mtip_make_request(struct request_queue *queue, struct bio *bio)
+{
+ struct driver_data *dd = queue->queuedata;
+ struct scatterlist *sg;
+ struct bio_vec *bvec;
+ int nents = 0;
+ int tag = 0;
+
+ if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag))) {
+ bio_endio(bio, -ENXIO);
+ return;
+ }
+ if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
+ bio_endio(bio, -ENODATA);
+ return;
+ }
+ if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
+ &dd->dd_flag) &&
+ bio_data_dir(bio))) {
+ bio_endio(bio, -ENODATA);
+ return;
+ }
+ if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) {
+ bio_endio(bio, -ENODATA);
+ return;
+ }
+ }
+
+ if (unlikely(!bio_has_data(bio))) {
+ blk_queue_flush(queue, 0);
+ bio_endio(bio, 0);
+ return;
+ }
+
+ sg = mtip_hw_get_scatterlist(dd, &tag);
+ if (likely(sg != NULL)) {
+ blk_queue_bounce(queue, &bio);
+
+ if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) {
+ dev_warn(&dd->pdev->dev,
+ "Maximum number of SGL entries exceeded\n");
+ bio_io_error(bio);
+ mtip_hw_release_scatterlist(dd, tag);
+ return;
+ }
+
+ /* Create the scatter list for this bio. */
+ bio_for_each_segment(bvec, bio, nents) {
+ sg_set_page(&sg[nents],
+ bvec->bv_page,
+ bvec->bv_len,
+ bvec->bv_offset);
+ }
+
+ /* Issue the read/write. */
+ mtip_hw_submit_io(dd,
+ bio->bi_sector,
+ bio_sectors(bio),
+ nents,
+ tag,
+ bio_endio,
+ bio,
+ bio_data_dir(bio));
+ } else
+ bio_io_error(bio);
+}
+
+/*
+ * Block layer initialization function.
+ *
+ * This function is called once by the PCI layer for each P320
+ * device that is connected to the system.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 on success else an error code.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+ int rv = 0, wait_for_rebuild = 0;
+ sector_t capacity;
+ unsigned int index = 0;
+ struct kobject *kobj;
+ unsigned char thd_name[16];
+
+ if (dd->disk)
+ goto skip_create_disk; /* hw init done, before rebuild */
+
+ /* Initialize the protocol layer. */
+ wait_for_rebuild = mtip_hw_init(dd);
+ if (wait_for_rebuild < 0) {
+ dev_err(&dd->pdev->dev,
+ "Protocol layer initialization failed\n");
+ rv = -EINVAL;
+ goto protocol_init_error;
+ }
+
+ dd->disk = alloc_disk(MTIP_MAX_MINORS);
+ if (dd->disk == NULL) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate gendisk structure\n");
+ rv = -EINVAL;
+ goto alloc_disk_error;
+ }
+
+ /* Generate the disk name, implemented same as in sd.c */
+ do {
+ if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL))
+ goto ida_get_error;
+
+ spin_lock(&rssd_index_lock);
+ rv = ida_get_new(&rssd_index_ida, &index);
+ spin_unlock(&rssd_index_lock);
+ } while (rv == -EAGAIN);
+
+ if (rv)
+ goto ida_get_error;
+
+ rv = rssd_disk_name_format("rssd",
+ index,
+ dd->disk->disk_name,
+ DISK_NAME_LEN);
+ if (rv)
+ goto disk_index_error;
+
+ dd->disk->driverfs_dev = &dd->pdev->dev;
+ dd->disk->major = dd->major;
+ dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS;
+ dd->disk->fops = &mtip_block_ops;
+ dd->disk->private_data = dd;
+ dd->index = index;
+
+ /*
+ * if rebuild pending, start the service thread, and delay the block
+ * queue creation and add_disk()
+ */
+ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+ goto start_service_thread;
+
+skip_create_disk:
+ /* Allocate the request queue. */
+ dd->queue = blk_alloc_queue(GFP_KERNEL);
+ if (dd->queue == NULL) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate request queue\n");
+ rv = -ENOMEM;
+ goto block_queue_alloc_init_error;
+ }
+
+ /* Attach our request function to the request queue. */
+ blk_queue_make_request(dd->queue, mtip_make_request);
+
+ dd->disk->queue = dd->queue;
+ dd->queue->queuedata = dd;
+
+ /* Set device limits. */
+ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+ blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+ blk_queue_physical_block_size(dd->queue, 4096);
+ blk_queue_max_hw_sectors(dd->queue, 0xffff);
+ blk_queue_max_segment_size(dd->queue, 0x400000);
+ blk_queue_io_min(dd->queue, 4096);
+
+ /*
+ * write back cache is not supported in the device. FUA depends on
+ * write back cache support, hence setting flush support to zero.
+ */
+ blk_queue_flush(dd->queue, 0);
+
+ /* Set the capacity of the device in 512 byte sectors. */
+ if (!(mtip_hw_get_capacity(dd, &capacity))) {
+ dev_warn(&dd->pdev->dev,
+ "Could not read drive capacity\n");
+ rv = -EIO;
+ goto read_capacity_error;
+ }
+ set_capacity(dd->disk, capacity);
+
+ /* Enable the block device and add it to /dev */
+ add_disk(dd->disk);
+
+ /*
+ * Now that the disk is active, initialize any sysfs attributes
+ * managed by the protocol layer.
+ */
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_init(dd, kobj);
+ kobject_put(kobj);
+ }
+ mtip_hw_debugfs_init(dd);
+
+ if (dd->mtip_svc_handler) {
+ set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+ return rv; /* service thread created for handling rebuild */
+ }
+
+start_service_thread:
+ sprintf(thd_name, "mtip_svc_thd_%02d", index);
+
+ dd->mtip_svc_handler = kthread_run(mtip_service_thread,
+ dd, thd_name);
+
+ if (IS_ERR(dd->mtip_svc_handler)) {
+ dev_err(&dd->pdev->dev, "service thread failed to start\n");
+ dd->mtip_svc_handler = NULL;
+ rv = -EFAULT;
+ goto kthread_run_error;
+ }
+
+ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+ rv = wait_for_rebuild;
+
+ return rv;
+
+kthread_run_error:
+ mtip_hw_debugfs_exit(dd);
+
+ /* Delete our gendisk. This also removes the device from /dev */
+ del_gendisk(dd->disk);
+
+read_capacity_error:
+ blk_cleanup_queue(dd->queue);
+
+block_queue_alloc_init_error:
+disk_index_error:
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, index);
+ spin_unlock(&rssd_index_lock);
+
+ida_get_error:
+ put_disk(dd->disk);
+
+alloc_disk_error:
+ mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+
+protocol_init_error:
+ return rv;
+}
+
+/*
+ * Block layer deinitialization function.
+ *
+ * Called by the PCI layer as each P320 device is removed.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_block_remove(struct driver_data *dd)
+{
+ struct kobject *kobj;
+
+ if (dd->mtip_svc_handler) {
+ set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+ wake_up_interruptible(&dd->port->svc_wait);
+ kthread_stop(dd->mtip_svc_handler);
+ }
+
+ /* Clean up the sysfs attributes, if created */
+ if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_exit(dd, kobj);
+ kobject_put(kobj);
+ }
+ }
+ mtip_hw_debugfs_exit(dd);
+
+ /*
+ * Delete our gendisk structure. This also removes the device
+ * from /dev
+ */
+ del_gendisk(dd->disk);
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+
+ blk_cleanup_queue(dd->queue);
+ dd->disk = NULL;
+ dd->queue = NULL;
+
+ /* De-initialize the protocol layer. */
+ mtip_hw_exit(dd);
+
+ return 0;
+}
+
+/*
+ * Function called by the PCI layer when just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev,
+ "Shutting down %s ...\n", dd->disk->disk_name);
+
+ /* Delete our gendisk structure, and cleanup the blk queue. */
+ del_gendisk(dd->disk);
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+
+ blk_cleanup_queue(dd->queue);
+ dd->disk = NULL;
+ dd->queue = NULL;
+
+ mtip_hw_shutdown(dd);
+ return 0;
+}
+
+static int mtip_block_suspend(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev,
+ "Suspending %s ...\n", dd->disk->disk_name);
+ mtip_hw_suspend(dd);
+ return 0;
+}
+
+static int mtip_block_resume(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev, "Resuming %s ...\n",
+ dd->disk->disk_name);
+ mtip_hw_resume(dd);
+ return 0;
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ * 0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int rv = 0;
+ struct driver_data *dd = NULL;
+
+ /* Allocate memory for this devices private data. */
+ dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL);
+ if (dd == NULL) {
+ dev_err(&pdev->dev,
+ "Unable to allocate memory for driver data\n");
+ return -ENOMEM;
+ }
+
+ /* Attach the private data to this PCI device. */
+ pci_set_drvdata(pdev, dd);
+
+ rv = pcim_enable_device(pdev);
+ if (rv < 0) {
+ dev_err(&pdev->dev, "Unable to enable device\n");
+ goto iomap_err;
+ }
+
+ /* Map BAR5 to memory. */
+ rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+ if (rv < 0) {
+ dev_err(&pdev->dev, "Unable to map regions\n");
+ goto iomap_err;
+ }
+
+ if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+ rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+
+ if (rv) {
+ rv = pci_set_consistent_dma_mask(pdev,
+ DMA_BIT_MASK(32));
+ if (rv) {
+ dev_warn(&pdev->dev,
+ "64-bit DMA enable failed\n");
+ goto setmask_err;
+ }
+ }
+ }
+
+ pci_set_master(pdev);
+
+ if (pci_enable_msi(pdev)) {
+ dev_warn(&pdev->dev,
+ "Unable to enable MSI interrupt.\n");
+ goto block_initialize_err;
+ }
+
+ /* Copy the info we may need later into the private data structure. */
+ dd->major = mtip_major;
+ dd->instance = instance;
+ dd->pdev = pdev;
+
+ /* Initialize the block layer. */
+ rv = mtip_block_initialize(dd);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Unable to initialize block layer\n");
+ goto block_initialize_err;
+ }
+
+ /*
+ * Increment the instance count so that each device has a unique
+ * instance number.
+ */
+ instance++;
+ if (rv != MTIP_FTL_REBUILD_MAGIC)
+ set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+ goto done;
+
+block_initialize_err:
+ pci_disable_msi(pdev);
+
+setmask_err:
+ pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+ kfree(dd);
+ pci_set_drvdata(pdev, NULL);
+ return rv;
+done:
+ return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ * None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+ struct driver_data *dd = pci_get_drvdata(pdev);
+ int counter = 0;
+
+ set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
+ if (mtip_check_surprise_removal(pdev)) {
+ while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
+ counter++;
+ msleep(20);
+ if (counter == 10) {
+ /* Cleanup the outstanding commands */
+ mtip_command_cleanup(dd);
+ break;
+ }
+ }
+ }
+
+ /* Clean up the block layer. */
+ mtip_block_remove(dd);
+
+ pci_disable_msi(pdev);
+
+ kfree(dd);
+ pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ * 0 Success
+ * <0 Error
+ */
+static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
+{
+ int rv = 0;
+ struct driver_data *dd = pci_get_drvdata(pdev);
+
+ if (!dd) {
+ dev_err(&pdev->dev,
+ "Driver private datastructure is NULL\n");
+ return -EFAULT;
+ }
+
+ set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+ /* Disable ports & interrupts then send standby immediate */
+ rv = mtip_block_suspend(dd);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Failed to suspend controller\n");
+ return rv;
+ }
+
+ /*
+ * Save the pci config space to pdev structure &
+ * disable the device
+ */
+ pci_save_state(pdev);
+ pci_disable_device(pdev);
+
+ /* Move to Low power state*/
+ pci_set_power_state(pdev, PCI_D3hot);
+
+ return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ * 0 Success
+ * <0 Error
+ */
+static int mtip_pci_resume(struct pci_dev *pdev)
+{
+ int rv = 0;
+ struct driver_data *dd;
+
+ dd = pci_get_drvdata(pdev);
+ if (!dd) {
+ dev_err(&pdev->dev,
+ "Driver private datastructure is NULL\n");
+ return -EFAULT;
+ }
+
+ /* Move the device to active State */
+ pci_set_power_state(pdev, PCI_D0);
+
+ /* Restore PCI configuration space */
+ pci_restore_state(pdev);
+
+ /* Enable the PCI device*/
+ rv = pcim_enable_device(pdev);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Failed to enable card during resume\n");
+ goto err;
+ }
+ pci_set_master(pdev);
+
+ /*
+ * Calls hbaReset, initPort, & startPort function
+ * then enables interrupts
+ */
+ rv = mtip_block_resume(dd);
+ if (rv < 0)
+ dev_err(&pdev->dev, "Unable to resume\n");
+
+err:
+ clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+ return rv;
+}
+
+/*
+ * Shutdown routine
+ *
+ * return value
+ * None
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+ struct driver_data *dd = pci_get_drvdata(pdev);
+ if (dd)
+ mtip_block_shutdown(dd);
+}
+
+/* Table of device ids supported by this driver. */
+static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = {
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) },
+ { 0 }
+};
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+ .name = MTIP_DRV_NAME,
+ .id_table = mtip_pci_tbl,
+ .probe = mtip_pci_probe,
+ .remove = mtip_pci_remove,
+ .suspend = mtip_pci_suspend,
+ .resume = mtip_pci_resume,
+ .shutdown = mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ * 0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+ int error;
+
+ pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+ /* Allocate a major block device number to use with this driver. */
+ error = register_blkdev(0, MTIP_DRV_NAME);
+ if (error <= 0) {
+ pr_err("Unable to register block device (%d)\n",
+ error);
+ return -EBUSY;
+ }
+ mtip_major = error;
+
+ if (!dfs_parent) {
+ dfs_parent = debugfs_create_dir("rssd", NULL);
+ if (IS_ERR_OR_NULL(dfs_parent)) {
+ pr_warn("Error creating debugfs parent\n");
+ dfs_parent = NULL;
+ }
+ }
+
+ /* Register our PCI operations. */
+ error = pci_register_driver(&mtip_pci_driver);
+ if (error) {
+ debugfs_remove(dfs_parent);
+ unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+ }
+
+ return error;
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ * none
+ */
+static void __exit mtip_exit(void)
+{
+ debugfs_remove_recursive(dfs_parent);
+
+ /* Release the allocated major block device number. */
+ unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+ /* Unregister the PCI driver. */
+ pci_unregister_driver(&mtip_pci_driver);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 000000000000..18627a1d04c5
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,463 @@
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ * Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ * Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/genhd.h>
+
+/* Offset of Subsystem Device ID in pci confoguration space */
+#define PCI_SUBSYSTEM_DEVICEID 0x2E
+
+/* offset of Device Control register in PCIe extended capabilites space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48
+
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES 2
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000
+#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000
+#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD 500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET 142
+#define MTIP_FTL_REBUILD_MAGIC 0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000
+
+/* Macro to extract the tag bit number from a tag value. */
+#define MTIP_TAG_BIT(tag) (tag & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag) (tag >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG 128
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS 8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL 0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON 0x1344
+#define P320H_DEVICE_ID 0x5150
+#define P320M_DEVICE_ID 0x5151
+#define P320S_DEVICE_ID 0x5152
+#define P325M_DEVICE_ID 0x5153
+#define P420H_DEVICE_ID 0x5160
+#define P420M_DEVICE_ID 0x5161
+#define P425M_DEVICE_ID 0x5163
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME "mtip32xx"
+#define MTIP_DRV_VERSION "1.2.6os3"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS 16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS (MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG (sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+ (U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR 5
+
+#ifdef DEBUG
+ #define dbg_printk(format, arg...) \
+ printk(pr_fmt(format), ##arg);
+#else
+ #define dbg_printk(format, arg...)
+#endif
+
+#define MTIP_DFS_MAX_BUF_SIZE 1024
+
+#define __force_bit2int (unsigned int __force)
+
+enum {
+ /* below are bit numbers in 'flags' defined in mtip_port */
+ MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */
+ MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */
+ MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */
+ MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */
+ MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | \
+ (1 << MTIP_PF_EH_ACTIVE_BIT) | \
+ (1 << MTIP_PF_SE_ACTIVE_BIT) | \
+ (1 << MTIP_PF_DM_ACTIVE_BIT)),
+
+ MTIP_PF_SVC_THD_ACTIVE_BIT = 4,
+ MTIP_PF_ISSUE_CMDS_BIT = 5,
+ MTIP_PF_REBUILD_BIT = 6,
+ MTIP_PF_SVC_THD_STOP_BIT = 8,
+
+ /* below are bit numbers in 'dd_flag' defined in driver_data */
+ MTIP_DDF_SEC_LOCK_BIT = 0,
+ MTIP_DDF_REMOVE_PENDING_BIT = 1,
+ MTIP_DDF_OVER_TEMP_BIT = 2,
+ MTIP_DDF_WRITE_PROTECT_BIT = 3,
+ MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \
+ (1 << MTIP_DDF_SEC_LOCK_BIT) | \
+ (1 << MTIP_DDF_OVER_TEMP_BIT) | \
+ (1 << MTIP_DDF_WRITE_PROTECT_BIT)),
+
+ MTIP_DDF_CLEANUP_BIT = 5,
+ MTIP_DDF_RESUME_BIT = 6,
+ MTIP_DDF_INIT_DONE_BIT = 7,
+ MTIP_DDF_REBUILD_FAILED_BIT = 8,
+};
+
+__packed struct smart_attr{
+ u8 attr_id;
+ u16 flags;
+ u8 cur;
+ u8 worst;
+ u32 data;
+ u8 res[3];
+};
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+ /*
+ * FIS type.
+ * - 27h Register FIS, host to device.
+ * - 34h Register FIS, device to host.
+ * - 39h DMA Activate FIS, device to host.
+ * - 41h DMA Setup FIS, bi-directional.
+ * - 46h Data FIS, bi-directional.
+ * - 58h BIST Activate FIS, bi-directional.
+ * - 5Fh PIO Setup FIS, device to host.
+ * - A1h Set Device Bits FIS, device to host.
+ */
+ unsigned char type;
+ unsigned char opts;
+ unsigned char command;
+ unsigned char features;
+
+ union {
+ unsigned char lba_low;
+ unsigned char sector;
+ };
+ union {
+ unsigned char lba_mid;
+ unsigned char cyl_low;
+ };
+ union {
+ unsigned char lba_hi;
+ unsigned char cyl_hi;
+ };
+ union {
+ unsigned char device;
+ unsigned char head;
+ };
+
+ union {
+ unsigned char lba_low_ex;
+ unsigned char sector_ex;
+ };
+ union {
+ unsigned char lba_mid_ex;
+ unsigned char cyl_low_ex;
+ };
+ union {
+ unsigned char lba_hi_ex;
+ unsigned char cyl_hi_ex;
+ };
+ unsigned char features_ex;
+
+ unsigned char sect_count;
+ unsigned char sect_cnt_ex;
+ unsigned char res2;
+ unsigned char control;
+
+ unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+ /*
+ * Command options.
+ * - Bits 31:16 Number of PRD entries.
+ * - Bits 15:8 Unused in this implementation.
+ * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+ * - Bit 6 Write bit, should be set when writing data to the device.
+ * - Bit 5 Unused in this implementation.
+ * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+ */
+ unsigned int opts;
+ /* This field is unsed when using NCQ. */
+ union {
+ unsigned int byte_count;
+ unsigned int status;
+ };
+ /*
+ * Lower 32 bits of the command table address associated with this
+ * header. The command table addresses must be 128 byte aligned.
+ */
+ unsigned int ctba;
+ /*
+ * If 64 bit addressing is used this field is the upper 32 bits
+ * of the command table address associated with this command.
+ */
+ unsigned int ctbau;
+ /* Reserved and unused. */
+ unsigned int res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+ /*
+ * Low 32 bits of the data buffer address. For P320 this
+ * address must be 8 byte aligned signified by bits 2:0 being
+ * set to 0.
+ */
+ unsigned int dba;
+ /*
+ * When 64 bit addressing is used this field is the upper
+ * 32 bits of the data buffer address.
+ */
+ unsigned int dba_upper;
+ /* Unused. */
+ unsigned int reserved;
+ /*
+ * Bit 31: interrupt when this data block has been transferred.
+ * Bits 30..22: reserved
+ * Bits 21..0: byte count (minus 1). For P320 the byte count must be
+ * 8 byte aligned signified by bits 2:0 being set to 1.
+ */
+ unsigned int info;
+};
+struct mtip_port;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+
+ struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
+
+ dma_addr_t command_header_dma; /* corresponding physical address */
+
+ void *command; /* ptr to command table entry */
+
+ dma_addr_t command_dma; /* corresponding physical address */
+
+ void *comp_data; /* data passed to completion function comp_func() */
+ /*
+ * Completion function called by the ISR upon completion of
+ * a command.
+ */
+ void (*comp_func)(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status);
+ /* Additional callback function that may be called by comp_func() */
+ void (*async_callback)(void *data, int status);
+
+ void *async_data; /* Addl. data passed to async_callback() */
+
+ int scatter_ents; /* Number of scatter list entries used */
+
+ struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+
+ int retries; /* The number of retries left for this command. */
+
+ int direction; /* Data transfer direction */
+
+ unsigned long comp_time; /* command completion time, in jiffies */
+
+ atomic_t active; /* declares if this command sent to the drive. */
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+ /* Pointer back to the driver data for this port. */
+ struct driver_data *dd;
+ /*
+ * Used to determine if the data pointed to by the
+ * identify field is valid.
+ */
+ unsigned long identify_valid;
+ /* Base address of the memory mapped IO for the port. */
+ void __iomem *mmio;
+ /* Array of pointers to the memory mapped s_active registers. */
+ void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+ /* Array of pointers to the memory mapped completed registers. */
+ void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+ /* Array of pointers to the memory mapped Command Issue registers. */
+ void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+ /*
+ * Pointer to the beginning of the command header memory as used
+ * by the driver.
+ */
+ void *command_list;
+ /*
+ * Pointer to the beginning of the command header memory as used
+ * by the DMA.
+ */
+ dma_addr_t command_list_dma;
+ /*
+ * Pointer to the beginning of the RX FIS memory as used
+ * by the driver.
+ */
+ void *rxfis;
+ /*
+ * Pointer to the beginning of the RX FIS memory as used
+ * by the DMA.
+ */
+ dma_addr_t rxfis_dma;
+ /*
+ * Pointer to the beginning of the command table memory as used
+ * by the driver.
+ */
+ void *command_table;
+ /*
+ * Pointer to the beginning of the command table memory as used
+ * by the DMA.
+ */
+ dma_addr_t command_tbl_dma;
+ /*
+ * Pointer to the beginning of the identify data memory as used
+ * by the driver.
+ */
+ u16 *identify;
+ /*
+ * Pointer to the beginning of the identify data memory as used
+ * by the DMA.
+ */
+ dma_addr_t identify_dma;
+ /*
+ * Pointer to the beginning of a sector buffer that is used
+ * by the driver when issuing internal commands.
+ */
+ u16 *sector_buffer;
+ /*
+ * Pointer to the beginning of a sector buffer that is used
+ * by the DMA when the driver issues internal commands.
+ */
+ dma_addr_t sector_buffer_dma;
+ /*
+ * Bit significant, used to determine if a command slot has
+ * been allocated. i.e. the slot is in use. Bits are cleared
+ * when the command slot and all associated data structures
+ * are no longer needed.
+ */
+ u16 *log_buf;
+ dma_addr_t log_buf_dma;
+
+ u8 *smart_buf;
+ dma_addr_t smart_buf_dma;
+
+ unsigned long allocated[SLOTBITS_IN_LONGS];
+ /*
+ * used to queue commands when an internal command is in progress
+ * or error handling is active
+ */
+ unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+ /*
+ * Array of command slots. Structure includes pointers to the
+ * command header and command table, and completion function and data
+ * pointers.
+ */
+ struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
+ /* Used by mtip_service_thread to wait for an event */
+ wait_queue_head_t svc_wait;
+ /*
+ * indicates the state of the port. Also, helps the service thread
+ * to determine its action on wake up.
+ */
+ unsigned long flags;
+ /*
+ * Timer used to complete commands that have been active for too long.
+ */
+ struct timer_list cmd_timer;
+ unsigned long ic_pause_timer;
+ /*
+ * Semaphore used to block threads if there are no
+ * command slots available.
+ */
+ struct semaphore cmd_slot;
+ /* Spinlock for working around command-issue bug. */
+ spinlock_t cmd_issue_lock;
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+ void __iomem *mmio; /* Base address of the HBA registers. */
+
+ int major; /* Major device number. */
+
+ int instance; /* Instance number. First device probed is 0, ... */
+
+ struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+ struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+ struct request_queue *queue; /* Our request queue. */
+
+ struct mtip_port *port; /* Pointer to the port data structure. */
+
+ /* Tasklet used to process the bottom half of the ISR. */
+ struct tasklet_struct tasklet;
+
+ unsigned product_type; /* magic value declaring the product type */
+
+ unsigned slot_groups; /* number of slot groups the product supports */
+
+ unsigned long index; /* Index to determine the disk name */
+
+ unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
+
+ struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+
+ struct dentry *dfs_node;
+};
+
+#endif
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f533f3375e24..0c03411c59eb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -34,12 +34,11 @@
#include <linux/kthread.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/types.h>
#include <linux/nbd.h>
-#define LO_MAGIC 0x68797548
+#define NBD_MAGIC 0x68797548
#ifdef NDEBUG
#define dprintk(flags, fmt...)
@@ -116,7 +115,7 @@ static void nbd_end_request(struct request *req)
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void sock_shutdown(struct nbd_device *lo, int lock)
+static void sock_shutdown(struct nbd_device *nbd, int lock)
{
/* Forcibly shutdown the socket causing all listeners
* to error
@@ -125,15 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock)
* there should be a more generic interface rather than
* calling socket ops directly here */
if (lock)
- mutex_lock(&lo->tx_lock);
- if (lo->sock) {
- printk(KERN_WARNING "%s: shutting down socket\n",
- lo->disk->disk_name);
- kernel_sock_shutdown(lo->sock, SHUT_RDWR);
- lo->sock = NULL;
+ mutex_lock(&nbd->tx_lock);
+ if (nbd->sock) {
+ dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+ kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+ nbd->sock = NULL;
}
if (lock)
- mutex_unlock(&lo->tx_lock);
+ mutex_unlock(&nbd->tx_lock);
}
static void nbd_xmit_timeout(unsigned long arg)
@@ -148,18 +146,20 @@ static void nbd_xmit_timeout(unsigned long arg)
/*
* Send or receive packet.
*/
-static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
int msg_flags)
{
- struct socket *sock = lo->sock;
+ struct socket *sock = nbd->sock;
int result;
struct msghdr msg;
struct kvec iov;
sigset_t blocked, oldset;
+ unsigned long pflags = current->flags;
if (unlikely(!sock)) {
- printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n",
- lo->disk->disk_name, (send ? "send" : "recv"));
+ dev_err(disk_to_dev(nbd->disk),
+ "Attempted %s on closed socket in sock_xmit\n",
+ (send ? "send" : "recv"));
return -EINVAL;
}
@@ -168,8 +168,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
siginitsetinv(&blocked, sigmask(SIGKILL));
sigprocmask(SIG_SETMASK, &blocked, &oldset);
+ current->flags |= PF_MEMALLOC;
do {
- sock->sk->sk_allocation = GFP_NOIO;
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
iov.iov_base = buf;
iov.iov_len = size;
msg.msg_name = NULL;
@@ -181,15 +182,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
if (send) {
struct timer_list ti;
- if (lo->xmit_timeout) {
+ if (nbd->xmit_timeout) {
init_timer(&ti);
ti.function = nbd_xmit_timeout;
ti.data = (unsigned long)current;
- ti.expires = jiffies + lo->xmit_timeout;
+ ti.expires = jiffies + nbd->xmit_timeout;
add_timer(&ti);
}
result = kernel_sendmsg(sock, &msg, &iov, 1, size);
- if (lo->xmit_timeout)
+ if (nbd->xmit_timeout)
del_timer_sync(&ti);
} else
result = kernel_recvmsg(sock, &msg, &iov, 1, size,
@@ -201,7 +202,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
task_pid_nr(current), current->comm,
dequeue_signal_lock(current, &current->blocked, &info));
result = -EINTR;
- sock_shutdown(lo, !send);
+ sock_shutdown(nbd, !send);
break;
}
@@ -215,22 +216,24 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
} while (size > 0);
sigprocmask(SIG_SETMASK, &oldset, NULL);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
return result;
}
-static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec,
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
int flags)
{
int result;
void *kaddr = kmap(bvec->bv_page);
- result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags);
+ result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+ bvec->bv_len, flags);
kunmap(bvec->bv_page);
return result;
}
/* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *lo, struct request *req)
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
{
int result, flags;
struct nbd_request request;
@@ -243,15 +246,15 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
memcpy(request.handle, &req, sizeof(req));
dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
- lo->disk->disk_name, req,
+ nbd->disk->disk_name, req,
nbdcmd_to_ascii(nbd_cmd(req)),
(unsigned long long)blk_rq_pos(req) << 9,
blk_rq_bytes(req));
- result = sock_xmit(lo, 1, &request, sizeof(request),
+ result = sock_xmit(nbd, 1, &request, sizeof(request),
(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
if (result <= 0) {
- printk(KERN_ERR "%s: Send control failed (result %d)\n",
- lo->disk->disk_name, result);
+ dev_err(disk_to_dev(nbd->disk),
+ "Send control failed (result %d)\n", result);
goto error_out;
}
@@ -267,11 +270,12 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
if (!rq_iter_last(req, iter))
flags = MSG_MORE;
dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
- lo->disk->disk_name, req, bvec->bv_len);
- result = sock_send_bvec(lo, bvec, flags);
+ nbd->disk->disk_name, req, bvec->bv_len);
+ result = sock_send_bvec(nbd, bvec, flags);
if (result <= 0) {
- printk(KERN_ERR "%s: Send data failed (result %d)\n",
- lo->disk->disk_name, result);
+ dev_err(disk_to_dev(nbd->disk),
+ "Send data failed (result %d)\n",
+ result);
goto error_out;
}
}
@@ -282,25 +286,25 @@ error_out:
return -EIO;
}
-static struct request *nbd_find_request(struct nbd_device *lo,
+static struct request *nbd_find_request(struct nbd_device *nbd,
struct request *xreq)
{
struct request *req, *tmp;
int err;
- err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+ err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
if (unlikely(err))
goto out;
- spin_lock(&lo->queue_lock);
- list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) {
+ spin_lock(&nbd->queue_lock);
+ list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
if (req != xreq)
continue;
list_del_init(&req->queuelist);
- spin_unlock(&lo->queue_lock);
+ spin_unlock(&nbd->queue_lock);
return req;
}
- spin_unlock(&lo->queue_lock);
+ spin_unlock(&nbd->queue_lock);
err = -ENOENT;
@@ -308,79 +312,78 @@ out:
return ERR_PTR(err);
}
-static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
{
int result;
void *kaddr = kmap(bvec->bv_page);
- result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+ result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
MSG_WAITALL);
kunmap(bvec->bv_page);
return result;
}
/* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *lo)
+static struct request *nbd_read_stat(struct nbd_device *nbd)
{
int result;
struct nbd_reply reply;
struct request *req;
reply.magic = 0;
- result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL);
+ result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
if (result <= 0) {
- printk(KERN_ERR "%s: Receive control failed (result %d)\n",
- lo->disk->disk_name, result);
+ dev_err(disk_to_dev(nbd->disk),
+ "Receive control failed (result %d)\n", result);
goto harderror;
}
if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
- printk(KERN_ERR "%s: Wrong magic (0x%lx)\n",
- lo->disk->disk_name,
+ dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
(unsigned long)ntohl(reply.magic));
result = -EPROTO;
goto harderror;
}
- req = nbd_find_request(lo, *(struct request **)reply.handle);
+ req = nbd_find_request(nbd, *(struct request **)reply.handle);
if (IS_ERR(req)) {
result = PTR_ERR(req);
if (result != -ENOENT)
goto harderror;
- printk(KERN_ERR "%s: Unexpected reply (%p)\n",
- lo->disk->disk_name, reply.handle);
+ dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
+ reply.handle);
result = -EBADR;
goto harderror;
}
if (ntohl(reply.error)) {
- printk(KERN_ERR "%s: Other side returned error (%d)\n",
- lo->disk->disk_name, ntohl(reply.error));
+ dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
+ ntohl(reply.error));
req->errors++;
return req;
}
dprintk(DBG_RX, "%s: request %p: got reply\n",
- lo->disk->disk_name, req);
+ nbd->disk->disk_name, req);
if (nbd_cmd(req) == NBD_CMD_READ) {
struct req_iterator iter;
struct bio_vec *bvec;
rq_for_each_segment(bvec, req, iter) {
- result = sock_recv_bvec(lo, bvec);
+ result = sock_recv_bvec(nbd, bvec);
if (result <= 0) {
- printk(KERN_ERR "%s: Receive data failed (result %d)\n",
- lo->disk->disk_name, result);
+ dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
+ result);
req->errors++;
return req;
}
dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
- lo->disk->disk_name, req, bvec->bv_len);
+ nbd->disk->disk_name, req, bvec->bv_len);
}
}
return req;
harderror:
- lo->harderror = result;
+ nbd->harderror = result;
return NULL;
}
@@ -398,48 +401,57 @@ static struct device_attribute pid_attr = {
.show = pid_show,
};
-static int nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *nbd)
{
struct request *req;
int ret;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
- lo->pid = current->pid;
- ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
+ sk_set_memalloc(nbd->sock->sk);
+ nbd->pid = task_pid_nr(current);
+ ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (ret) {
- printk(KERN_ERR "nbd: sysfs_create_file failed!");
- lo->pid = 0;
+ dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+ nbd->pid = 0;
return ret;
}
- while ((req = nbd_read_stat(lo)) != NULL)
+ while ((req = nbd_read_stat(nbd)) != NULL)
nbd_end_request(req);
- sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
- lo->pid = 0;
+ device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+ nbd->pid = 0;
return 0;
}
-static void nbd_clear_que(struct nbd_device *lo)
+static void nbd_clear_que(struct nbd_device *nbd)
{
struct request *req;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
/*
- * Because we have set lo->sock to NULL under the tx_lock, all
+ * Because we have set nbd->sock to NULL under the tx_lock, all
* modifications to the list must have completed by now. For
* the same reason, the active_req must be NULL.
*
* As a consequence, we don't need to take the spin lock while
* purging the list here.
*/
- BUG_ON(lo->sock);
- BUG_ON(lo->active_req);
+ BUG_ON(nbd->sock);
+ BUG_ON(nbd->active_req);
- while (!list_empty(&lo->queue_head)) {
- req = list_entry(lo->queue_head.next, struct request,
+ while (!list_empty(&nbd->queue_head)) {
+ req = list_entry(nbd->queue_head.next, struct request,
+ queuelist);
+ list_del_init(&req->queuelist);
+ req->errors++;
+ nbd_end_request(req);
+ }
+
+ while (!list_empty(&nbd->waiting_queue)) {
+ req = list_entry(nbd->waiting_queue.next, struct request,
queuelist);
list_del_init(&req->queuelist);
req->errors++;
@@ -448,7 +460,7 @@ static void nbd_clear_que(struct nbd_device *lo)
}
-static void nbd_handle_req(struct nbd_device *lo, struct request *req)
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
{
if (req->cmd_type != REQ_TYPE_FS)
goto error_out;
@@ -456,39 +468,38 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
nbd_cmd(req) = NBD_CMD_READ;
if (rq_data_dir(req) == WRITE) {
nbd_cmd(req) = NBD_CMD_WRITE;
- if (lo->flags & NBD_READ_ONLY) {
- printk(KERN_ERR "%s: Write on read-only\n",
- lo->disk->disk_name);
+ if (nbd->flags & NBD_READ_ONLY) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Write on read-only\n");
goto error_out;
}
}
req->errors = 0;
- mutex_lock(&lo->tx_lock);
- if (unlikely(!lo->sock)) {
- mutex_unlock(&lo->tx_lock);
- printk(KERN_ERR "%s: Attempted send on closed socket\n",
- lo->disk->disk_name);
+ mutex_lock(&nbd->tx_lock);
+ if (unlikely(!nbd->sock)) {
+ mutex_unlock(&nbd->tx_lock);
+ dev_err(disk_to_dev(nbd->disk),
+ "Attempted send on closed socket\n");
goto error_out;
}
- lo->active_req = req;
+ nbd->active_req = req;
- if (nbd_send_req(lo, req) != 0) {
- printk(KERN_ERR "%s: Request send failed\n",
- lo->disk->disk_name);
+ if (nbd_send_req(nbd, req) != 0) {
+ dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
req->errors++;
nbd_end_request(req);
} else {
- spin_lock(&lo->queue_lock);
- list_add(&req->queuelist, &lo->queue_head);
- spin_unlock(&lo->queue_lock);
+ spin_lock(&nbd->queue_lock);
+ list_add_tail(&req->queuelist, &nbd->queue_head);
+ spin_unlock(&nbd->queue_lock);
}
- lo->active_req = NULL;
- mutex_unlock(&lo->tx_lock);
- wake_up_all(&lo->active_wq);
+ nbd->active_req = NULL;
+ mutex_unlock(&nbd->tx_lock);
+ wake_up_all(&nbd->active_wq);
return;
@@ -499,28 +510,28 @@ error_out:
static int nbd_thread(void *data)
{
- struct nbd_device *lo = data;
+ struct nbd_device *nbd = data;
struct request *req;
set_user_nice(current, -20);
- while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) {
+ while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
/* wait for something to do */
- wait_event_interruptible(lo->waiting_wq,
+ wait_event_interruptible(nbd->waiting_wq,
kthread_should_stop() ||
- !list_empty(&lo->waiting_queue));
+ !list_empty(&nbd->waiting_queue));
/* extract request */
- if (list_empty(&lo->waiting_queue))
+ if (list_empty(&nbd->waiting_queue))
continue;
- spin_lock_irq(&lo->queue_lock);
- req = list_entry(lo->waiting_queue.next, struct request,
+ spin_lock_irq(&nbd->queue_lock);
+ req = list_entry(nbd->waiting_queue.next, struct request,
queuelist);
list_del_init(&req->queuelist);
- spin_unlock_irq(&lo->queue_lock);
+ spin_unlock_irq(&nbd->queue_lock);
/* handle request */
- nbd_handle_req(lo, req);
+ nbd_handle_req(nbd, req);
}
return 0;
}
@@ -528,7 +539,7 @@ static int nbd_thread(void *data)
/*
* We always wait for result of write, for now. It would be nice to make it optional
* in future
- * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
* { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
*/
@@ -537,31 +548,31 @@ static void do_nbd_request(struct request_queue *q)
struct request *req;
while ((req = blk_fetch_request(q)) != NULL) {
- struct nbd_device *lo;
+ struct nbd_device *nbd;
spin_unlock_irq(q->queue_lock);
dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
req->rq_disk->disk_name, req, req->cmd_type);
- lo = req->rq_disk->private_data;
+ nbd = req->rq_disk->private_data;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
- if (unlikely(!lo->sock)) {
- printk(KERN_ERR "%s: Attempted send on closed socket\n",
- lo->disk->disk_name);
+ if (unlikely(!nbd->sock)) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Attempted send on closed socket\n");
req->errors++;
nbd_end_request(req);
spin_lock_irq(q->queue_lock);
continue;
}
- spin_lock_irq(&lo->queue_lock);
- list_add_tail(&req->queuelist, &lo->waiting_queue);
- spin_unlock_irq(&lo->queue_lock);
+ spin_lock_irq(&nbd->queue_lock);
+ list_add_tail(&req->queuelist, &nbd->waiting_queue);
+ spin_unlock_irq(&nbd->queue_lock);
- wake_up(&lo->waiting_wq);
+ wake_up(&nbd->waiting_wq);
spin_lock_irq(q->queue_lock);
}
@@ -569,32 +580,33 @@ static void do_nbd_request(struct request_queue *q)
/* Must be called with tx_lock held */
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
switch (cmd) {
case NBD_DISCONNECT: {
struct request sreq;
- printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name);
+ dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
blk_rq_init(NULL, &sreq);
sreq.cmd_type = REQ_TYPE_SPECIAL;
nbd_cmd(&sreq) = NBD_CMD_DISC;
- if (!lo->sock)
+ if (!nbd->sock)
return -EINVAL;
- nbd_send_req(lo, &sreq);
+ nbd_send_req(nbd, &sreq);
return 0;
}
case NBD_CLEAR_SOCK: {
struct file *file;
- lo->sock = NULL;
- file = lo->file;
- lo->file = NULL;
- nbd_clear_que(lo);
- BUG_ON(!list_empty(&lo->queue_head));
+ nbd->sock = NULL;
+ file = nbd->file;
+ nbd->file = NULL;
+ nbd_clear_que(nbd);
+ BUG_ON(!list_empty(&nbd->queue_head));
+ BUG_ON(!list_empty(&nbd->waiting_queue));
if (file)
fput(file);
return 0;
@@ -602,14 +614,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
case NBD_SET_SOCK: {
struct file *file;
- if (lo->file)
+ if (nbd->file)
return -EBUSY;
file = fget(arg);
if (file) {
struct inode *inode = file->f_path.dentry->d_inode;
if (S_ISSOCK(inode->i_mode)) {
- lo->file = file;
- lo->sock = SOCKET_I(inode);
+ nbd->file = file;
+ nbd->sock = SOCKET_I(inode);
if (max_part > 0)
bdev->bd_invalidated = 1;
return 0;
@@ -621,29 +633,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
}
case NBD_SET_BLKSIZE:
- lo->blksize = arg;
- lo->bytesize &= ~(lo->blksize-1);
- bdev->bd_inode->i_size = lo->bytesize;
- set_blocksize(bdev, lo->blksize);
- set_capacity(lo->disk, lo->bytesize >> 9);
+ nbd->blksize = arg;
+ nbd->bytesize &= ~(nbd->blksize-1);
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
return 0;
case NBD_SET_SIZE:
- lo->bytesize = arg & ~(lo->blksize-1);
- bdev->bd_inode->i_size = lo->bytesize;
- set_blocksize(bdev, lo->blksize);
- set_capacity(lo->disk, lo->bytesize >> 9);
+ nbd->bytesize = arg & ~(nbd->blksize-1);
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
return 0;
case NBD_SET_TIMEOUT:
- lo->xmit_timeout = arg * HZ;
+ nbd->xmit_timeout = arg * HZ;
return 0;
case NBD_SET_SIZE_BLOCKS:
- lo->bytesize = ((u64) arg) * lo->blksize;
- bdev->bd_inode->i_size = lo->bytesize;
- set_blocksize(bdev, lo->blksize);
- set_capacity(lo->disk, lo->bytesize >> 9);
+ nbd->bytesize = ((u64) arg) * nbd->blksize;
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
return 0;
case NBD_DO_IT: {
@@ -651,38 +663,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
struct file *file;
int error;
- if (lo->pid)
+ if (nbd->pid)
return -EBUSY;
- if (!lo->file)
+ if (!nbd->file)
return -EINVAL;
- mutex_unlock(&lo->tx_lock);
+ mutex_unlock(&nbd->tx_lock);
- thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+ thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
if (IS_ERR(thread)) {
- mutex_lock(&lo->tx_lock);
+ mutex_lock(&nbd->tx_lock);
return PTR_ERR(thread);
}
wake_up_process(thread);
- error = nbd_do_it(lo);
+ error = nbd_do_it(nbd);
kthread_stop(thread);
- mutex_lock(&lo->tx_lock);
+ mutex_lock(&nbd->tx_lock);
if (error)
return error;
- sock_shutdown(lo, 0);
- file = lo->file;
- lo->file = NULL;
- nbd_clear_que(lo);
- printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
+ sock_shutdown(nbd, 0);
+ file = nbd->file;
+ nbd->file = NULL;
+ nbd_clear_que(nbd);
+ dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
if (file)
fput(file);
- lo->bytesize = 0;
+ nbd->bytesize = 0;
bdev->bd_inode->i_size = 0;
- set_capacity(lo->disk, 0);
+ set_capacity(nbd->disk, 0);
if (max_part > 0)
ioctl_by_bdev(bdev, BLKRRPART, 0);
- return lo->harderror;
+ return nbd->harderror;
}
case NBD_CLEAR_QUE:
@@ -690,14 +702,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
* This is for compatibility only. The queue is always cleared
* by NBD_DO_IT or NBD_CLEAR_SOCK.
*/
- BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
+ BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
return 0;
case NBD_PRINT_DEBUG:
- printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
- bdev->bd_disk->disk_name,
- lo->queue_head.next, lo->queue_head.prev,
- &lo->queue_head);
+ dev_info(disk_to_dev(nbd->disk),
+ "next = %p, prev = %p, head = %p\n",
+ nbd->queue_head.next, nbd->queue_head.prev,
+ &nbd->queue_head);
return 0;
}
return -ENOTTY;
@@ -706,21 +718,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- struct nbd_device *lo = bdev->bd_disk->private_data;
+ struct nbd_device *nbd = bdev->bd_disk->private_data;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
/* Anyone capable of this syscall can do *real bad* things */
dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
- lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+ nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
- mutex_lock(&lo->tx_lock);
- error = __nbd_ioctl(bdev, lo, cmd, arg);
- mutex_unlock(&lo->tx_lock);
+ mutex_lock(&nbd->tx_lock);
+ error = __nbd_ioctl(bdev, nbd, cmd, arg);
+ mutex_unlock(&nbd->tx_lock);
return error;
}
@@ -745,7 +757,7 @@ static int __init nbd_init(void)
BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
if (max_part < 0) {
- printk(KERN_CRIT "nbd: max_part must be >= 0\n");
+ printk(KERN_ERR "nbd: max_part must be >= 0\n");
return -EINVAL;
}
@@ -806,7 +818,7 @@ static int __init nbd_init(void)
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].file = NULL;
- nbd_dev[i].magic = LO_MAGIC;
+ nbd_dev[i].magic = NBD_MAGIC;
nbd_dev[i].flags = 0;
INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
spin_lock_init(&nbd_dev[i].queue_lock);
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index f9ad514c9227..ad16c68c8645 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -39,7 +39,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/version.h>
+
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
@@ -614,11 +615,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
return result;
}
-/*
- * NB: return value of non-zero would mean that we were a stacking driver.
- * make_request must always succeed.
- */
-static int nvme_make_request(struct request_queue *q, struct bio *bio)
+static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
struct nvme_ns *ns = q->queuedata;
struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
@@ -635,8 +632,6 @@ static int nvme_make_request(struct request_queue *q, struct bio *bio)
spin_unlock_irq(&nvmeq->q_lock);
put_nvmeq(nvmeq);
-
- return 0;
}
static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index ad124525ac23..ec64e7f5d1ce 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -20,9 +20,6 @@
*/
-/* PARAMETERS */
-static int verbose; /* set this to 1 to see debugging messages and whatnot */
-
#define BACKPACK_VERSION "2.0.2"
#include <linux/module.h>
@@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */
#include "ppc6lnx.c"
#include "paride.h"
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
#define PPCSTRUCT(pi) ((Interface *)(pi->private))
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 46b8136c31bb..ba2b6b5e5910 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -144,7 +144,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
static DEFINE_MUTEX(pcd_mutex);
static DEFINE_SPINLOCK(pcd_lock);
-module_param(verbose, bool, 0644);
+module_param(verbose, int, 0644);
module_param(major, int, 0);
module_param(name, charp, 0);
module_param(nice, int, 0);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 869e7676d46f..831e3ac156e6 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -124,8 +124,9 @@
by default.
*/
+#include <linux/types.h>
-static int verbose = 0;
+static bool verbose = 0;
static int major = PD_MAJOR;
static char *name = PD_NAME;
static int cluster = 64;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f21b520ef419..ec8f9ed6326e 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -118,13 +118,15 @@
#define PF_NAME "pf"
#define PF_UNITS 4
+#include <linux/types.h>
+
/* Here are things one can override from the insmod command.
Most are autoprobed by paride unless set here. Verbose is off
by default.
*/
-static int verbose = 0;
+static bool verbose = 0;
static int major = PF_MAJOR;
static char *name = PF_NAME;
static int cluster = 64;
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 6b9a2000d56a..4a27b1de5fcb 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -130,13 +130,14 @@
#define PI_PG 4
#endif
+#include <linux/types.h>
/* Here are things one can override from the insmod command.
Most are autoprobed by paride unless set here. Verbose is 0
by default.
*/
-static int verbose = 0;
+static bool verbose = 0;
static int major = PG_MAJOR;
static char *name = PG_NAME;
static int disable = 0;
@@ -630,6 +631,7 @@ static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t
if (dev->status & 0x10)
return -ETIME;
+ memset(&hdr, 0, sizeof(hdr));
hdr.magic = PG_MAGIC;
hdr.dlen = dev->dlen;
copy = 0;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 7179f79d7468..2596042eb987 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -109,13 +109,15 @@
#define PT_NAME "pt"
#define PT_UNITS 4
+#include <linux/types.h>
+
/* Here are things one can override from the insmod command.
Most are autoprobed by paride unless set here. Verbose is on
by default.
*/
-static int verbose = 0;
+static bool verbose = 0;
static int major = PT_MAJOR;
static char *name = PT_NAME;
static int disable = 0;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index e133f094ab08..ba66e4445f41 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -987,14 +987,14 @@ static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct pag
while (copy_size > 0) {
struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
- void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) +
+ void *vfrom = kmap_atomic(src_bvl->bv_page) +
src_bvl->bv_offset + offs;
void *vto = page_address(dst_page) + dst_offs;
int len = min_t(int, copy_size, src_bvl->bv_len - offs);
BUG_ON(len < 0);
memcpy(vto, vfrom, len);
- kunmap_atomic(vfrom, KM_USER0);
+ kunmap_atomic(vfrom);
seg++;
offs = 0;
@@ -1019,10 +1019,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
offs = 0;
for (f = 0; f < pkt->frames; f++) {
if (bvec[f].bv_page != pkt->pages[p]) {
- void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset;
+ void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
void *vto = page_address(pkt->pages[p]) + offs;
memcpy(vto, vfrom, CD_FRAMESIZE);
- kunmap_atomic(vfrom, KM_USER0);
+ kunmap_atomic(vfrom);
bvec[f].bv_page = pkt->pages[p];
bvec[f].bv_offset = offs;
} else {
@@ -2444,7 +2444,7 @@ static void pkt_end_io_read_cloned(struct bio *bio, int err)
pkt_bio_finished(pd);
}
-static int pkt_make_request(struct request_queue *q, struct bio *bio)
+static void pkt_make_request(struct request_queue *q, struct bio *bio)
{
struct pktcdvd_device *pd;
char b[BDEVNAME_SIZE];
@@ -2473,7 +2473,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
cloned_bio->bi_end_io = pkt_end_io_read_cloned;
pd->stats.secs_r += bio->bi_size >> 9;
pkt_queue_bio(pd, cloned_bio);
- return 0;
+ return;
}
if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
@@ -2509,7 +2509,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
pkt_make_request(q, &bp->bio1);
pkt_make_request(q, &bp->bio2);
bio_pair_release(bp);
- return 0;
+ return;
}
}
@@ -2533,7 +2533,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
}
spin_unlock(&pkt->lock);
spin_unlock(&pd->cdrw.active_list_lock);
- return 0;
+ return;
} else {
blocked_bio = 1;
}
@@ -2584,10 +2584,9 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
*/
wake_up(&pd->wqueue);
}
- return 0;
+ return;
end_io:
bio_io_error(bio);
- return 0;
}
@@ -2818,7 +2817,7 @@ static const struct block_device_operations pktcdvd_ops = {
.check_events = pkt_check_events,
};
-static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
+static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name);
}
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 8e1ce2e2916a..da0abc1838c1 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -21,6 +21,7 @@
#include <linux/ata.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <asm/lv1call.h>
#include <asm/ps3stor.h>
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index b3bdb8af89cf..f58cdcfb305f 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -10,6 +10,7 @@
#include <linux/blkdev.h>
#include <linux/delay.h>
+#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -596,7 +597,7 @@ out:
return next;
}
-static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
+static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
{
struct ps3_system_bus_device *dev = q->queuedata;
struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -610,13 +611,11 @@ static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
spin_unlock_irq(&priv->lock);
if (busy)
- return 0;
+ return;
do {
bio = ps3vram_do_bio(dev, bio);
} while (bio);
-
- return 0;
}
static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 15f65b5f3fc7..54a55f03115d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -41,19 +41,33 @@
#include "rbd_types.h"
-#define DRV_NAME "rbd"
-#define DRV_NAME_LONG "rbd (rados block device)"
+/*
+ * The basic unit of block I/O is a sector. It is interpreted in a
+ * number of contexts in Linux (blk, bio, genhd), but the default is
+ * universally 512 bytes. These symbols are just slightly more
+ * meaningful than the bare numbers they represent.
+ */
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
+
+#define RBD_DRV_NAME "rbd"
+#define RBD_DRV_NAME_LONG "rbd (rados block device)"
#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
-#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
-#define RBD_MAX_POOL_NAME_LEN 64
#define RBD_MAX_SNAP_NAME_LEN 32
#define RBD_MAX_OPT_LEN 1024
#define RBD_SNAP_HEAD_NAME "-"
+/*
+ * An RBD device name will be "rbd#", where the "rbd" comes from
+ * RBD_DRV_NAME above, and # is a unique integer identifier.
+ * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
+ * enough to hold all possible device names.
+ */
#define DEV_NAME_LEN 32
+#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
@@ -62,14 +76,12 @@
*/
struct rbd_image_header {
u64 image_size;
- char block_name[32];
+ char *object_prefix;
__u8 obj_order;
__u8 crypt_type;
__u8 comp_type;
- struct rw_semaphore snap_rwsem;
struct ceph_snap_context *snapc;
size_t snap_names_len;
- u64 snap_seq;
u32 total_snaps;
char *snap_names;
@@ -83,7 +95,7 @@ struct rbd_options {
};
/*
- * an instance of the client. multiple devices may share a client.
+ * an instance of the client. multiple devices may share an rbd client.
*/
struct rbd_client {
struct ceph_client *client;
@@ -92,20 +104,9 @@ struct rbd_client {
struct list_head node;
};
-struct rbd_req_coll;
-
/*
- * a single io request
+ * a request completion status
*/
-struct rbd_request {
- struct request *rq; /* blk layer request */
- struct bio *bio; /* cloned bio */
- struct page **pages; /* list of used pages */
- u64 len;
- int coll_index;
- struct rbd_req_coll *coll;
-};
-
struct rbd_req_status {
int done;
int rc;
@@ -122,10 +123,22 @@ struct rbd_req_coll {
struct rbd_req_status status[0];
};
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+ int coll_index;
+ struct rbd_req_coll *coll;
+};
+
struct rbd_snap {
struct device dev;
const char *name;
- size_t size;
+ u64 size;
struct list_head node;
u64 id;
};
@@ -134,13 +147,12 @@ struct rbd_snap {
* a single device
*/
struct rbd_device {
- int id; /* blkdev unique id */
+ int dev_id; /* blkdev unique id */
int major; /* blkdev assigned major */
struct gendisk *disk; /* blkdev's gendisk and rq */
struct request_queue *q;
- struct ceph_client *client;
struct rbd_client *rbd_client;
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -148,19 +160,24 @@ struct rbd_device {
spinlock_t lock; /* queue lock */
struct rbd_image_header header;
- char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
- int obj_len;
- char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
- char pool_name[RBD_MAX_POOL_NAME_LEN];
- int poolid;
+ char *image_name;
+ size_t image_name_len;
+ char *header_name;
+ char *pool_name;
+ int pool_id;
struct ceph_osd_event *watch_event;
struct ceph_osd_request *watch_request;
- char snap_name[RBD_MAX_SNAP_NAME_LEN];
- u32 cur_snap; /* index+1 of current snapshot within snap context
- 0 - for the head */
- int read_only;
+ /* protects updating the header */
+ struct rw_semaphore header_rwsem;
+ /* name of the snapshot this device reads from */
+ char *snap_name;
+ /* id of the snapshot this device reads from */
+ u64 snap_id; /* current snapshot id */
+ /* whether the snap_id this device reads from still exists */
+ bool snap_exists;
+ int read_only;
struct list_head node;
@@ -171,35 +188,48 @@ struct rbd_device {
struct device dev;
};
-static struct bus_type rbd_bus_type = {
- .name = "rbd",
-};
-
-static spinlock_t node_lock; /* protects client get/put */
-
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+
static LIST_HEAD(rbd_dev_list); /* devices */
-static LIST_HEAD(rbd_client_list); /* clients */
+static DEFINE_SPINLOCK(rbd_dev_list_lock);
+
+static LIST_HEAD(rbd_client_list); /* clients */
+static DEFINE_SPINLOCK(rbd_client_list_lock);
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
-static ssize_t rbd_snap_rollback(struct device *dev,
- struct device_attribute *attr,
- const char *buf,
- size_t size);
static ssize_t rbd_snap_add(struct device *dev,
struct device_attribute *attr,
const char *buf,
size_t count);
-static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
- struct rbd_snap *snap);;
+static void __rbd_remove_snap_dev(struct rbd_snap *snap);
+static ssize_t rbd_add(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
+ size_t count);
-static struct rbd_device *dev_to_rbd(struct device *dev)
+static struct bus_attribute rbd_bus_attrs[] = {
+ __ATTR(add, S_IWUSR, NULL, rbd_add),
+ __ATTR(remove, S_IWUSR, NULL, rbd_remove),
+ __ATTR_NULL
+};
+
+static struct bus_type rbd_bus_type = {
+ .name = "rbd",
+ .bus_attrs = rbd_bus_attrs,
+};
+
+static void rbd_root_dev_release(struct device *dev)
{
- return container_of(dev, struct rbd_device, dev);
}
+static struct device rbd_root_dev = {
+ .init_name = "rbd",
+ .release = rbd_root_dev_release,
+};
+
+
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
return get_device(&rbd_dev->dev);
@@ -210,20 +240,18 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
put_device(&rbd_dev->dev);
}
-static int __rbd_update_snaps(struct rbd_device *rbd_dev);
+static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
- struct gendisk *disk = bdev->bd_disk;
- struct rbd_device *rbd_dev = disk->private_data;
-
- rbd_get_dev(rbd_dev);
-
- set_device_ro(bdev, rbd_dev->read_only);
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
if ((mode & FMODE_WRITE) && rbd_dev->read_only)
return -EROFS;
+ rbd_get_dev(rbd_dev);
+ set_device_ro(bdev, rbd_dev->read_only);
+
return 0;
}
@@ -244,9 +272,9 @@ static const struct block_device_operations rbd_bd_ops = {
/*
* Initialize an rbd client instance.
- * We own *opt.
+ * We own *ceph_opts.
*/
-static struct rbd_client *rbd_client_create(struct ceph_options *opt,
+static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
struct rbd_options *rbd_opts)
{
struct rbd_client *rbdc;
@@ -260,10 +288,12 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
kref_init(&rbdc->kref);
INIT_LIST_HEAD(&rbdc->node);
- rbdc->client = ceph_create_client(opt, rbdc);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
if (IS_ERR(rbdc->client))
- goto out_rbdc;
- opt = NULL; /* Now rbdc->client is responsible for opt */
+ goto out_mutex;
+ ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
ret = ceph_open_session(rbdc->client);
if (ret < 0)
@@ -271,35 +301,38 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
rbdc->rbd_opts = rbd_opts;
- spin_lock(&node_lock);
+ spin_lock(&rbd_client_list_lock);
list_add_tail(&rbdc->node, &rbd_client_list);
- spin_unlock(&node_lock);
+ spin_unlock(&rbd_client_list_lock);
+
+ mutex_unlock(&ctl_mutex);
dout("rbd_client_create created %p\n", rbdc);
return rbdc;
out_err:
ceph_destroy_client(rbdc->client);
-out_rbdc:
+out_mutex:
+ mutex_unlock(&ctl_mutex);
kfree(rbdc);
out_opt:
- if (opt)
- ceph_destroy_options(opt);
+ if (ceph_opts)
+ ceph_destroy_options(ceph_opts);
return ERR_PTR(ret);
}
/*
* Find a ceph client with specific addr and configuration.
*/
-static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
{
struct rbd_client *client_node;
- if (opt->flags & CEPH_OPT_NOSHARE)
+ if (ceph_opts->flags & CEPH_OPT_NOSHARE)
return NULL;
list_for_each_entry(client_node, &rbd_client_list, node)
- if (ceph_compare_options(opt, client_node->client) == 0)
+ if (!ceph_compare_options(ceph_opts, client_node->client))
return client_node;
return NULL;
}
@@ -315,7 +348,7 @@ enum {
/* string args above */
};
-static match_table_t rbdopt_tokens = {
+static match_table_t rbd_opts_tokens = {
{Opt_notify_timeout, "notify_timeout=%d"},
/* int args above */
/* string args above */
@@ -324,11 +357,11 @@ static match_table_t rbdopt_tokens = {
static int parse_rbd_opts_token(char *c, void *private)
{
- struct rbd_options *rbdopt = private;
+ struct rbd_options *rbd_opts = private;
substring_t argstr[MAX_OPT_ARGS];
int token, intval, ret;
- token = match_token((char *)c, rbdopt_tokens, argstr);
+ token = match_token(c, rbd_opts_tokens, argstr);
if (token < 0)
return -EINVAL;
@@ -349,7 +382,7 @@ static int parse_rbd_opts_token(char *c, void *private)
switch (token) {
case Opt_notify_timeout:
- rbdopt->notify_timeout = intval;
+ rbd_opts->notify_timeout = intval;
break;
default:
BUG_ON(token);
@@ -361,64 +394,63 @@ static int parse_rbd_opts_token(char *c, void *private)
* Get a ceph client with specific addr and configuration, if one does
* not exist create it.
*/
-static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
- char *options)
+static struct rbd_client *rbd_get_client(const char *mon_addr,
+ size_t mon_addr_len,
+ char *options)
{
struct rbd_client *rbdc;
- struct ceph_options *opt;
- int ret;
+ struct ceph_options *ceph_opts;
struct rbd_options *rbd_opts;
rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
if (!rbd_opts)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
- ret = ceph_parse_options(&opt, options, mon_addr,
- mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
- if (ret < 0)
- goto done_err;
+ ceph_opts = ceph_parse_options(options, mon_addr,
+ mon_addr + mon_addr_len,
+ parse_rbd_opts_token, rbd_opts);
+ if (IS_ERR(ceph_opts)) {
+ kfree(rbd_opts);
+ return ERR_CAST(ceph_opts);
+ }
- spin_lock(&node_lock);
- rbdc = __rbd_client_find(opt);
+ spin_lock(&rbd_client_list_lock);
+ rbdc = __rbd_client_find(ceph_opts);
if (rbdc) {
- ceph_destroy_options(opt);
-
/* using an existing client */
kref_get(&rbdc->kref);
- rbd_dev->rbd_client = rbdc;
- rbd_dev->client = rbdc->client;
- spin_unlock(&node_lock);
- return 0;
- }
- spin_unlock(&node_lock);
+ spin_unlock(&rbd_client_list_lock);
- rbdc = rbd_client_create(opt, rbd_opts);
- if (IS_ERR(rbdc)) {
- ret = PTR_ERR(rbdc);
- goto done_err;
+ ceph_destroy_options(ceph_opts);
+ kfree(rbd_opts);
+
+ return rbdc;
}
+ spin_unlock(&rbd_client_list_lock);
- rbd_dev->rbd_client = rbdc;
- rbd_dev->client = rbdc->client;
- return 0;
-done_err:
- kfree(rbd_opts);
- return ret;
+ rbdc = rbd_client_create(ceph_opts, rbd_opts);
+
+ if (IS_ERR(rbdc))
+ kfree(rbd_opts);
+
+ return rbdc;
}
/*
* Destroy ceph client
+ *
+ * Caller must hold rbd_client_list_lock.
*/
static void rbd_client_release(struct kref *kref)
{
struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
dout("rbd_release_client %p\n", rbdc);
- spin_lock(&node_lock);
+ spin_lock(&rbd_client_list_lock);
list_del(&rbdc->node);
- spin_unlock(&node_lock);
+ spin_unlock(&rbd_client_list_lock);
ceph_destroy_client(rbdc->client);
kfree(rbdc->rbd_opts);
@@ -433,7 +465,6 @@ static void rbd_put_client(struct rbd_device *rbd_dev)
{
kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
rbd_dev->rbd_client = NULL;
- rbd_dev->client = NULL;
}
/*
@@ -448,28 +479,37 @@ static void rbd_coll_release(struct kref *kref)
kfree(coll);
}
+static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
+{
+ return !memcmp(&ondisk->text,
+ RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
+}
+
/*
* Create a new header structure, translate header format from the on-disk
* header.
*/
static int rbd_header_from_disk(struct rbd_image_header *header,
struct rbd_image_header_ondisk *ondisk,
- int allocated_snaps,
- gfp_t gfp_flags)
+ u32 allocated_snaps)
{
- int i;
- u32 snap_count = le32_to_cpu(ondisk->snap_count);
- int ret = -ENOMEM;
+ u32 snap_count;
- init_rwsem(&header->snap_rwsem);
- header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ if (!rbd_dev_ondisk_valid(ondisk))
+ return -ENXIO;
+
+ snap_count = le32_to_cpu(ondisk->snap_count);
+ if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
+ / sizeof (u64))
+ return -EINVAL;
header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
- snap_count *
- sizeof(struct rbd_image_snap_ondisk),
- gfp_flags);
+ snap_count * sizeof(u64),
+ GFP_KERNEL);
if (!header->snapc)
return -ENOMEM;
+
if (snap_count) {
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
header->snap_names = kmalloc(header->snap_names_len,
GFP_KERNEL);
if (!header->snap_names)
@@ -479,11 +519,20 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
if (!header->snap_sizes)
goto err_names;
} else {
+ WARN_ON(ondisk->snap_names_len);
+ header->snap_names_len = 0;
header->snap_names = NULL;
header->snap_sizes = NULL;
}
- memcpy(header->block_name, ondisk->block_name,
+
+ header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
+ GFP_KERNEL);
+ if (!header->object_prefix)
+ goto err_sizes;
+
+ memcpy(header->object_prefix, ondisk->block_name,
sizeof(ondisk->block_name));
+ header->object_prefix[sizeof (ondisk->block_name)] = '\0';
header->image_size = le64_to_cpu(ondisk->image_size);
header->obj_order = ondisk->options.order;
@@ -491,12 +540,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
header->comp_type = ondisk->options.comp_type;
atomic_set(&header->snapc->nref, 1);
- header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+ header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
header->snapc->num_snaps = snap_count;
header->total_snaps = snap_count;
- if (snap_count &&
- allocated_snaps == snap_count) {
+ if (snap_count && allocated_snaps == snap_count) {
+ int i;
+
for (i = 0; i < snap_count; i++) {
header->snapc->snaps[i] =
le64_to_cpu(ondisk->snaps[i].id);
@@ -505,32 +555,23 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
}
/* copy snapshot names */
- memcpy(header->snap_names, &ondisk->snaps[i],
+ memcpy(header->snap_names, &ondisk->snaps[snap_count],
header->snap_names_len);
}
return 0;
+err_sizes:
+ kfree(header->snap_sizes);
+ header->snap_sizes = NULL;
err_names:
kfree(header->snap_names);
+ header->snap_names = NULL;
err_snapc:
kfree(header->snapc);
- return ret;
-}
-
-static int snap_index(struct rbd_image_header *header, int snap_num)
-{
- return header->total_snaps - snap_num;
-}
-
-static u64 cur_snap_id(struct rbd_device *rbd_dev)
-{
- struct rbd_image_header *header = &rbd_dev->header;
-
- if (!rbd_dev->cur_snap)
- return 0;
+ header->snapc = NULL;
- return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+ return -ENOMEM;
}
static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
@@ -539,70 +580,66 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
int i;
char *p = header->snap_names;
- for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
- if (strcmp(snap_name, p) == 0)
- break;
- }
- if (i == header->total_snaps)
- return -ENOENT;
- if (seq)
- *seq = header->snapc->snaps[i];
+ for (i = 0; i < header->total_snaps; i++) {
+ if (!strcmp(snap_name, p)) {
- if (size)
- *size = header->snap_sizes[i];
+ /* Found it. Pass back its id and/or size */
- return i;
+ if (seq)
+ *seq = header->snapc->snaps[i];
+ if (size)
+ *size = header->snap_sizes[i];
+ return i;
+ }
+ p += strlen(p) + 1; /* Skip ahead to the next name */
+ }
+ return -ENOENT;
}
-static int rbd_header_set_snap(struct rbd_device *dev,
- const char *snap_name,
- u64 *size)
-{
- struct rbd_image_header *header = &dev->header;
- struct ceph_snap_context *snapc = header->snapc;
- int ret = -ENOENT;
-
- down_write(&header->snap_rwsem);
-
- if (!snap_name ||
- !*snap_name ||
- strcmp(snap_name, "-") == 0 ||
- strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
- if (header->total_snaps)
- snapc->seq = header->snap_seq;
- else
- snapc->seq = 0;
- dev->cur_snap = 0;
- dev->read_only = 0;
+static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
+{
+ int ret;
+
+ down_write(&rbd_dev->header_rwsem);
+
+ if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+ sizeof (RBD_SNAP_HEAD_NAME))) {
+ rbd_dev->snap_id = CEPH_NOSNAP;
+ rbd_dev->snap_exists = false;
+ rbd_dev->read_only = 0;
if (size)
- *size = header->image_size;
+ *size = rbd_dev->header.image_size;
} else {
- ret = snap_by_name(header, snap_name, &snapc->seq, size);
+ u64 snap_id = 0;
+
+ ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
+ &snap_id, size);
if (ret < 0)
goto done;
-
- dev->cur_snap = header->total_snaps - ret;
- dev->read_only = 1;
+ rbd_dev->snap_id = snap_id;
+ rbd_dev->snap_exists = true;
+ rbd_dev->read_only = 1;
}
ret = 0;
done:
- up_write(&header->snap_rwsem);
+ up_write(&rbd_dev->header_rwsem);
return ret;
}
static void rbd_header_free(struct rbd_image_header *header)
{
- kfree(header->snapc);
- kfree(header->snap_names);
+ kfree(header->object_prefix);
kfree(header->snap_sizes);
+ kfree(header->snap_names);
+ ceph_put_snap_context(header->snapc);
}
/*
* get the actual striped segment name, offset and length
*/
static u64 rbd_get_segment(struct rbd_image_header *header,
- const char *block_name,
+ const char *object_prefix,
u64 ofs, u64 len,
char *seg_name, u64 *segofs)
{
@@ -610,7 +647,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header,
if (seg_name)
snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
- "%s.%012llx", block_name, seg);
+ "%s.%012llx", object_prefix, seg);
ofs = ofs & ((1 << header->obj_order) - 1);
len = min_t(u64, len, (1 << header->obj_order) - ofs);
@@ -708,13 +745,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
* split_bio will BUG_ON if this is not the case
*/
dout("bio_chain_clone split! total=%d remaining=%d"
- "bi_size=%d\n",
- (int)total, (int)len-total,
- (int)old_chain->bi_size);
+ "bi_size=%u\n",
+ total, len - total, old_chain->bi_size);
/* split the bio. We'll release it either in the next
call, or it will have to be released outside */
- bp = bio_split(old_chain, (len - total) / 512ULL);
+ bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
if (!bp)
goto err_out;
@@ -759,22 +795,24 @@ err_out:
/*
* helpers for osd request op vectors.
*/
-static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
- int num_ops,
- int opcode,
- u32 payload_len)
-{
- *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
- GFP_NOIO);
- if (!*ops)
- return -ENOMEM;
- (*ops)[0].op = opcode;
+static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
+ int opcode, u32 payload_len)
+{
+ struct ceph_osd_req_op *ops;
+
+ ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
+ if (!ops)
+ return NULL;
+
+ ops[0].op = opcode;
+
/*
* op extent offset and length will be set later on
* in calc_raw_layout()
*/
- (*ops)[0].payload_len = payload_len;
- return 0;
+ ops[0].payload_len = payload_len;
+
+ return ops;
}
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
@@ -790,8 +828,8 @@ static void rbd_coll_end_req_index(struct request *rq,
struct request_queue *q;
int min, max, i;
- dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
- coll, index, ret, len);
+ dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
+ coll, index, ret, (unsigned long long) len);
if (!rq)
return;
@@ -830,16 +868,15 @@ static void rbd_coll_end_req(struct rbd_request *req,
* Send ceph osd request
*/
static int rbd_do_request(struct request *rq,
- struct rbd_device *dev,
+ struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc,
u64 snapid,
- const char *obj, u64 ofs, u64 len,
+ const char *object_name, u64 ofs, u64 len,
struct bio *bio,
struct page **pages,
int num_pages,
int flags,
struct ceph_osd_req_op *ops,
- int num_reply,
struct rbd_req_coll *coll,
int coll_index,
void (*rbd_cb)(struct ceph_osd_request *req,
@@ -854,7 +891,7 @@ static int rbd_do_request(struct request *rq,
struct timespec mtime = CURRENT_TIME;
struct rbd_request *req_data;
struct ceph_osd_request_head *reqhead;
- struct rbd_image_header *header = &dev->header;
+ struct ceph_osd_client *osdc;
req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
if (!req_data) {
@@ -869,17 +906,13 @@ static int rbd_do_request(struct request *rq,
req_data->coll_index = coll_index;
}
- dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
+ dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
+ (unsigned long long) ofs, (unsigned long long) len);
- down_read(&header->snap_rwsem);
-
- req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
- snapc,
- ops,
- false,
- GFP_NOIO, pages, bio);
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+ false, GFP_NOIO, pages, bio);
if (!req) {
- up_read(&header->snap_rwsem);
ret = -ENOMEM;
goto done_pages;
}
@@ -896,7 +929,7 @@ static int rbd_do_request(struct request *rq,
reqhead = req->r_request->front.iov_base;
reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
- strncpy(req->r_oid, obj, sizeof(req->r_oid));
+ strncpy(req->r_oid, object_name, sizeof(req->r_oid));
req->r_oid_len = strlen(req->r_oid);
layout = &req->r_file_layout;
@@ -904,33 +937,32 @@ static int rbd_do_request(struct request *rq,
layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
layout->fl_stripe_count = cpu_to_le32(1);
layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- layout->fl_pg_preferred = cpu_to_le32(-1);
- layout->fl_pg_pool = cpu_to_le32(dev->poolid);
- ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
- ofs, &len, &bno, req, ops);
+ layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
+ ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
+ req, ops);
ceph_osdc_build_request(req, ofs, &len,
ops,
snapc,
&mtime,
req->r_oid, req->r_oid_len);
- up_read(&header->snap_rwsem);
if (linger_req) {
- ceph_osdc_set_request_linger(&dev->client->osdc, req);
+ ceph_osdc_set_request_linger(osdc, req);
*linger_req = req;
}
- ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+ ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0)
goto done_err;
if (!rbd_cb) {
- ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
if (ver)
*ver = le64_to_cpu(req->r_reassert_version.version);
- dout("reassert_ver=%lld\n",
- le64_to_cpu(req->r_reassert_version.version));
+ dout("reassert_ver=%llu\n",
+ (unsigned long long)
+ le64_to_cpu(req->r_reassert_version.version));
ceph_osdc_put_request(req);
}
return ret;
@@ -962,9 +994,10 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
op = (void *)(replyhead + 1);
rc = le32_to_cpu(replyhead->result);
bytes = le64_to_cpu(op->extent.length);
- read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+ read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
- dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+ dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
+ (unsigned long long) bytes, read_op, (int) rc);
if (rc == -ENOENT && read_op) {
zero_bio_chain(req_data->bio, 0);
@@ -991,14 +1024,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg
/*
* Do a synchronous ceph osd operation
*/
-static int rbd_req_sync_op(struct rbd_device *dev,
+static int rbd_req_sync_op(struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc,
u64 snapid,
- int opcode,
int flags,
- struct ceph_osd_req_op *orig_ops,
- int num_reply,
- const char *obj,
+ struct ceph_osd_req_op *ops,
+ const char *object_name,
u64 ofs, u64 len,
char *buf,
struct ceph_osd_request **linger_req,
@@ -1007,45 +1038,28 @@ static int rbd_req_sync_op(struct rbd_device *dev,
int ret;
struct page **pages;
int num_pages;
- struct ceph_osd_req_op *ops = orig_ops;
- u32 payload_len;
+
+ BUG_ON(ops == NULL);
num_pages = calc_pages_for(ofs , len);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
- if (!orig_ops) {
- payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
- ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
- if (ret < 0)
- goto done;
-
- if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
- ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
- if (ret < 0)
- goto done_ops;
- }
- }
-
- ret = rbd_do_request(NULL, dev, snapc, snapid,
- obj, ofs, len, NULL,
+ ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
+ object_name, ofs, len, NULL,
pages, num_pages,
flags,
ops,
- 2,
NULL, 0,
NULL,
linger_req, ver);
if (ret < 0)
- goto done_ops;
+ goto done;
if ((flags & CEPH_OSD_FLAG_READ) && buf)
ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
-done_ops:
- if (!orig_ops)
- rbd_destroy_ops(ops);
done:
ceph_release_page_vector(pages, num_pages);
return ret;
@@ -1055,10 +1069,10 @@ done:
* Do an asynchronous ceph osd operation
*/
static int rbd_do_op(struct request *rq,
- struct rbd_device *rbd_dev ,
+ struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc,
u64 snapid,
- int opcode, int flags, int num_reply,
+ int opcode, int flags,
u64 ofs, u64 len,
struct bio *bio,
struct rbd_req_coll *coll,
@@ -1076,14 +1090,15 @@ static int rbd_do_op(struct request *rq,
return -ENOMEM;
seg_len = rbd_get_segment(&rbd_dev->header,
- rbd_dev->header.block_name,
+ rbd_dev->header.object_prefix,
ofs, len,
seg_name, &seg_ofs);
payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
- ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
- if (ret < 0)
+ ret = -ENOMEM;
+ ops = rbd_create_rw_ops(1, opcode, payload_len);
+ if (!ops)
goto done;
/* we've taken care of segment sizes earlier when we
@@ -1097,7 +1112,6 @@ static int rbd_do_op(struct request *rq,
NULL, 0,
flags,
ops,
- num_reply,
coll, coll_index,
rbd_req_cb, 0, NULL);
@@ -1121,7 +1135,6 @@ static int rbd_req_write(struct request *rq,
return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- 2,
ofs, len, bio, coll, coll_index);
}
@@ -1137,58 +1150,61 @@ static int rbd_req_read(struct request *rq,
int coll_index)
{
return rbd_do_op(rq, rbd_dev, NULL,
- (snapid ? snapid : CEPH_NOSNAP),
+ snapid,
CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ,
- 2,
ofs, len, bio, coll, coll_index);
}
/*
* Request sync osd read
*/
-static int rbd_req_sync_read(struct rbd_device *dev,
- struct ceph_snap_context *snapc,
+static int rbd_req_sync_read(struct rbd_device *rbd_dev,
u64 snapid,
- const char *obj,
+ const char *object_name,
u64 ofs, u64 len,
char *buf,
u64 *ver)
{
- return rbd_req_sync_op(dev, NULL,
- (snapid ? snapid : CEPH_NOSNAP),
- CEPH_OSD_OP_READ,
+ struct ceph_osd_req_op *ops;
+ int ret;
+
+ ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
+ if (!ops)
+ return -ENOMEM;
+
+ ret = rbd_req_sync_op(rbd_dev, NULL,
+ snapid,
CEPH_OSD_FLAG_READ,
- NULL,
- 1, obj, ofs, len, buf, NULL, ver);
+ ops, object_name, ofs, len, buf, NULL, ver);
+ rbd_destroy_ops(ops);
+
+ return ret;
}
/*
* Request sync osd watch
*/
-static int rbd_req_sync_notify_ack(struct rbd_device *dev,
+static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
u64 ver,
- u64 notify_id,
- const char *obj)
+ u64 notify_id)
{
struct ceph_osd_req_op *ops;
- struct page **pages = NULL;
int ret;
- ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
- if (ret < 0)
- return ret;
+ ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
+ if (!ops)
+ return -ENOMEM;
- ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
+ ops[0].watch.ver = cpu_to_le64(ver);
ops[0].watch.cookie = notify_id;
ops[0].watch.flag = 0;
- ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
- obj, 0, 0, NULL,
- pages, 0,
+ ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
+ rbd_dev->header_name, 0, 0, NULL,
+ NULL, 0,
CEPH_OSD_FLAG_READ,
ops,
- 1,
NULL, 0,
rbd_simple_req_cb, 0, NULL);
@@ -1198,54 +1214,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
- struct rbd_device *dev = (struct rbd_device *)data;
+ struct rbd_device *rbd_dev = (struct rbd_device *)data;
+ u64 hver;
int rc;
- if (!dev)
+ if (!rbd_dev)
return;
- dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
- notify_id, (int)opcode);
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- rc = __rbd_update_snaps(dev);
- mutex_unlock(&ctl_mutex);
+ dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
+ rbd_dev->header_name, (unsigned long long) notify_id,
+ (unsigned int) opcode);
+ rc = rbd_refresh_header(rbd_dev, &hver);
if (rc)
- pr_warning(DRV_NAME "%d got notification but failed to update"
- " snaps: %d\n", dev->major, rc);
+ pr_warning(RBD_DRV_NAME "%d got notification but failed to "
+ " update snaps: %d\n", rbd_dev->major, rc);
- rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
+ rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
/*
* Request sync osd watch
*/
-static int rbd_req_sync_watch(struct rbd_device *dev,
- const char *obj,
- u64 ver)
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
struct ceph_osd_req_op *ops;
- struct ceph_osd_client *osdc = &dev->client->osdc;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ int ret;
- int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
- if (ret < 0)
- return ret;
+ ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
+ if (!ops)
+ return -ENOMEM;
ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
- (void *)dev, &dev->watch_event);
+ (void *)rbd_dev, &rbd_dev->watch_event);
if (ret < 0)
goto fail;
- ops[0].watch.ver = cpu_to_le64(ver);
- ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+ ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
+ ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
ops[0].watch.flag = 1;
- ret = rbd_req_sync_op(dev, NULL,
+ ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP,
- 0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops,
- 1, obj, 0, 0, NULL,
- &dev->watch_request, NULL);
+ rbd_dev->header_name,
+ 0, 0, NULL,
+ &rbd_dev->watch_request, NULL);
if (ret < 0)
goto fail_event;
@@ -1254,8 +1269,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
return 0;
fail_event:
- ceph_osdc_cancel_event(dev->watch_event);
- dev->watch_event = NULL;
+ ceph_osdc_cancel_event(rbd_dev->watch_event);
+ rbd_dev->watch_event = NULL;
fail:
rbd_destroy_ops(ops);
return ret;
@@ -1264,64 +1279,65 @@ fail:
/*
* Request sync osd unwatch
*/
-static int rbd_req_sync_unwatch(struct rbd_device *dev,
- const char *obj)
+static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
struct ceph_osd_req_op *ops;
+ int ret;
- int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
- if (ret < 0)
- return ret;
+ ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
+ if (!ops)
+ return -ENOMEM;
ops[0].watch.ver = 0;
- ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+ ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
ops[0].watch.flag = 0;
- ret = rbd_req_sync_op(dev, NULL,
+ ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP,
- 0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops,
- 1, obj, 0, 0, NULL, NULL, NULL);
+ rbd_dev->header_name,
+ 0, 0, NULL, NULL, NULL);
+
rbd_destroy_ops(ops);
- ceph_osdc_cancel_event(dev->watch_event);
- dev->watch_event = NULL;
+ ceph_osdc_cancel_event(rbd_dev->watch_event);
+ rbd_dev->watch_event = NULL;
return ret;
}
struct rbd_notify_info {
- struct rbd_device *dev;
+ struct rbd_device *rbd_dev;
};
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
- struct rbd_device *dev = (struct rbd_device *)data;
- if (!dev)
+ struct rbd_device *rbd_dev = (struct rbd_device *)data;
+ if (!rbd_dev)
return;
- dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
- notify_id, (int)opcode);
+ dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
+ rbd_dev->header_name, (unsigned long long) notify_id,
+ (unsigned int) opcode);
}
/*
* Request sync osd notify
*/
-static int rbd_req_sync_notify(struct rbd_device *dev,
- const char *obj)
+static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
struct ceph_osd_req_op *ops;
- struct ceph_osd_client *osdc = &dev->client->osdc;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_osd_event *event;
struct rbd_notify_info info;
int payload_len = sizeof(u32) + sizeof(u32);
int ret;
- ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
- if (ret < 0)
- return ret;
+ ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
+ if (!ops)
+ return -ENOMEM;
- info.dev = dev;
+ info.rbd_dev = rbd_dev;
ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
(void *)&info, &event);
@@ -1334,12 +1350,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
ops[0].watch.timeout = 12;
- ret = rbd_req_sync_op(dev, NULL,
+ ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP,
- 0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops,
- 1, obj, 0, 0, NULL, NULL, NULL);
+ rbd_dev->header_name,
+ 0, 0, NULL, NULL, NULL);
if (ret < 0)
goto fail_event;
@@ -1356,64 +1372,39 @@ fail:
}
/*
- * Request sync osd rollback
- */
-static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
- u64 snapid,
- const char *obj)
-{
- struct ceph_osd_req_op *ops;
- int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
- if (ret < 0)
- return ret;
-
- ops[0].snap.snapid = snapid;
-
- ret = rbd_req_sync_op(dev, NULL,
- CEPH_NOSNAP,
- 0,
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- ops,
- 1, obj, 0, 0, NULL, NULL, NULL);
-
- rbd_destroy_ops(ops);
-
- return ret;
-}
-
-/*
* Request sync osd read
*/
-static int rbd_req_sync_exec(struct rbd_device *dev,
- const char *obj,
- const char *cls,
- const char *method,
+static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
+ const char *object_name,
+ const char *class_name,
+ const char *method_name,
const char *data,
int len,
u64 *ver)
{
struct ceph_osd_req_op *ops;
- int cls_len = strlen(cls);
- int method_len = strlen(method);
- int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
- cls_len + method_len + len);
- if (ret < 0)
- return ret;
+ int class_name_len = strlen(class_name);
+ int method_name_len = strlen(method_name);
+ int ret;
- ops[0].cls.class_name = cls;
- ops[0].cls.class_len = (__u8)cls_len;
- ops[0].cls.method_name = method;
- ops[0].cls.method_len = (__u8)method_len;
+ ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
+ class_name_len + method_name_len + len);
+ if (!ops)
+ return -ENOMEM;
+
+ ops[0].cls.class_name = class_name;
+ ops[0].cls.class_len = (__u8) class_name_len;
+ ops[0].cls.method_name = method_name;
+ ops[0].cls.method_len = (__u8) method_name_len;
ops[0].cls.argc = 0;
ops[0].cls.indata = data;
ops[0].cls.indata_len = len;
- ret = rbd_req_sync_op(dev, NULL,
+ ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP,
- 0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops,
- 1, obj, 0, 0, NULL, NULL, ver);
+ object_name, 0, 0, NULL, NULL, ver);
rbd_destroy_ops(ops);
@@ -1444,16 +1435,16 @@ static void rbd_rq_fn(struct request_queue *q)
struct request *rq;
struct bio_pair *bp = NULL;
- rq = blk_fetch_request(q);
-
- while (1) {
+ while ((rq = blk_fetch_request(q))) {
struct bio *bio;
struct bio *rq_bio, *next_bio = NULL;
bool do_write;
- int size, op_size = 0;
+ unsigned int size;
+ u64 op_size = 0;
u64 ofs;
int num_segs, cur_seg = 0;
struct rbd_req_coll *coll;
+ struct ceph_snap_context *snapc;
/* peek at request from block layer */
if (!rq)
@@ -1464,39 +1455,54 @@ static void rbd_rq_fn(struct request_queue *q)
/* filter out block requests we don't understand */
if ((rq->cmd_type != REQ_TYPE_FS)) {
__blk_end_request_all(rq, 0);
- goto next;
+ continue;
}
/* deduce our operation (read, write) */
do_write = (rq_data_dir(rq) == WRITE);
size = blk_rq_bytes(rq);
- ofs = blk_rq_pos(rq) * 512ULL;
+ ofs = blk_rq_pos(rq) * SECTOR_SIZE;
rq_bio = rq->bio;
if (do_write && rbd_dev->read_only) {
__blk_end_request_all(rq, -EROFS);
- goto next;
+ continue;
}
spin_unlock_irq(q->queue_lock);
+ down_read(&rbd_dev->header_rwsem);
+
+ if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
+ up_read(&rbd_dev->header_rwsem);
+ dout("request for non-existent snapshot");
+ spin_lock_irq(q->queue_lock);
+ __blk_end_request_all(rq, -ENXIO);
+ continue;
+ }
+
+ snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+
+ up_read(&rbd_dev->header_rwsem);
+
dout("%s 0x%x bytes at 0x%llx\n",
do_write ? "write" : "read",
- size, blk_rq_pos(rq) * 512ULL);
+ size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
coll = rbd_alloc_coll(num_segs);
if (!coll) {
spin_lock_irq(q->queue_lock);
__blk_end_request_all(rq, -ENOMEM);
- goto next;
+ ceph_put_snap_context(snapc);
+ continue;
}
do {
/* a bio clone to be passed down to OSD req */
- dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+ dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
op_size = rbd_get_segment(&rbd_dev->header,
- rbd_dev->header.block_name,
+ rbd_dev->header.object_prefix,
ofs, size,
NULL, NULL);
kref_get(&coll->kref);
@@ -1512,13 +1518,13 @@ static void rbd_rq_fn(struct request_queue *q)
/* init OSD command: write or read */
if (do_write)
rbd_req_write(rq, rbd_dev,
- rbd_dev->header.snapc,
+ snapc,
ofs,
op_size, bio,
coll, cur_seg);
else
rbd_req_read(rq, rbd_dev,
- cur_snap_id(rbd_dev),
+ rbd_dev->snap_id,
ofs,
op_size, bio,
coll, cur_seg);
@@ -1535,8 +1541,8 @@ next_seg:
if (bp)
bio_pair_release(bp);
spin_lock_irq(q->queue_lock);
-next:
- rq = blk_fetch_request(q);
+
+ ceph_put_snap_context(snapc);
}
}
@@ -1549,13 +1555,17 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
struct bio_vec *bvec)
{
struct rbd_device *rbd_dev = q->queuedata;
- unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
- sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
- unsigned int bio_sectors = bmd->bi_size >> 9;
+ unsigned int chunk_sectors;
+ sector_t sector;
+ unsigned int bio_sectors;
int max;
+ chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+ sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+ bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
+
max = (chunk_sectors - ((sector & (chunk_sectors - 1))
- + bio_sectors)) << 9;
+ + bio_sectors)) << SECTOR_SHIFT;
if (max < 0)
max = 0; /* bio_add cannot handle a negative return */
if (max <= bvec->bv_len && bio_sectors == 0)
@@ -1587,40 +1597,48 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
{
ssize_t rc;
struct rbd_image_header_ondisk *dh;
- int snap_count = 0;
- u64 snap_names_len = 0;
+ u32 snap_count = 0;
u64 ver;
+ size_t len;
+ /*
+ * First reads the fixed-size header to determine the number
+ * of snapshots, then re-reads it, along with all snapshot
+ * records as well as their stored names.
+ */
+ len = sizeof (*dh);
while (1) {
- int len = sizeof(*dh) +
- snap_count * sizeof(struct rbd_image_snap_ondisk) +
- snap_names_len;
-
- rc = -ENOMEM;
dh = kmalloc(len, GFP_KERNEL);
if (!dh)
return -ENOMEM;
rc = rbd_req_sync_read(rbd_dev,
- NULL, CEPH_NOSNAP,
- rbd_dev->obj_md_name,
+ CEPH_NOSNAP,
+ rbd_dev->header_name,
0, len,
(char *)dh, &ver);
if (rc < 0)
goto out_dh;
- rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
- if (rc < 0)
+ rc = rbd_header_from_disk(header, dh, snap_count);
+ if (rc < 0) {
+ if (rc == -ENXIO)
+ pr_warning("unrecognized header format"
+ " for image %s\n",
+ rbd_dev->image_name);
goto out_dh;
-
- if (snap_count != header->total_snaps) {
- snap_count = header->total_snaps;
- snap_names_len = header->snap_names_len;
- rbd_header_free(header);
- kfree(dh);
- continue;
}
- break;
+
+ if (snap_count == header->total_snaps)
+ break;
+
+ snap_count = header->total_snaps;
+ len = sizeof (*dh) +
+ snap_count * sizeof(struct rbd_image_snap_ondisk) +
+ header->snap_names_len;
+
+ rbd_header_free(header);
+ kfree(dh);
}
header->obj_version = ver;
@@ -1632,7 +1650,7 @@ out_dh:
/*
* create a snapshot
*/
-static int rbd_header_add_snap(struct rbd_device *dev,
+static int rbd_header_add_snap(struct rbd_device *rbd_dev,
const char *snap_name,
gfp_t gfp_flags)
{
@@ -1640,15 +1658,15 @@ static int rbd_header_add_snap(struct rbd_device *dev,
u64 new_snapid;
int ret;
void *data, *p, *e;
- u64 ver;
+ struct ceph_mon_client *monc;
/* we should create a snapshot only if we're pointing at the head */
- if (dev->cur_snap)
+ if (rbd_dev->snap_id != CEPH_NOSNAP)
return -EINVAL;
- ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
- &new_snapid);
- dout("created snapid=%lld\n", new_snapid);
+ monc = &rbd_dev->rbd_client->client->monc;
+ ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
+ dout("created snapid=%llu\n", (unsigned long long) new_snapid);
if (ret < 0)
return ret;
@@ -1662,17 +1680,13 @@ static int rbd_header_add_snap(struct rbd_device *dev,
ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
ceph_encode_64_safe(&p, e, new_snapid, bad);
- ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
- data, p - data, &ver);
+ ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ "rbd", "snap_add",
+ data, p - data, NULL);
kfree(data);
- if (ret < 0)
- return ret;
-
- dev->header.snapc->seq = new_snapid;
-
- return 0;
+ return ret < 0 ? ret : 0;
bad:
return -ERANGE;
}
@@ -1680,56 +1694,67 @@ bad:
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
struct rbd_snap *snap;
+ struct rbd_snap *next;
- while (!list_empty(&rbd_dev->snaps)) {
- snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
- __rbd_remove_snap_dev(rbd_dev, snap);
- }
+ list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
+ __rbd_remove_snap_dev(snap);
}
/*
* only read the first part of the ondisk header, without the snaps info
*/
-static int __rbd_update_snaps(struct rbd_device *rbd_dev)
+static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
int ret;
struct rbd_image_header h;
- u64 snap_seq;
- int follow_seq = 0;
ret = rbd_read_header(rbd_dev, &h);
if (ret < 0)
return ret;
- /* resized? */
- set_capacity(rbd_dev->disk, h.image_size / 512ULL);
+ down_write(&rbd_dev->header_rwsem);
- down_write(&rbd_dev->header.snap_rwsem);
+ /* resized? */
+ if (rbd_dev->snap_id == CEPH_NOSNAP) {
+ sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
- snap_seq = rbd_dev->header.snapc->seq;
- if (rbd_dev->header.total_snaps &&
- rbd_dev->header.snapc->snaps[0] == snap_seq)
- /* pointing at the head, will need to follow that
- if head moves */
- follow_seq = 1;
+ dout("setting size to %llu sectors", (unsigned long long) size);
+ set_capacity(rbd_dev->disk, size);
+ }
- kfree(rbd_dev->header.snapc);
- kfree(rbd_dev->header.snap_names);
+ /* rbd_dev->header.object_prefix shouldn't change */
kfree(rbd_dev->header.snap_sizes);
+ kfree(rbd_dev->header.snap_names);
+ /* osd requests may still refer to snapc */
+ ceph_put_snap_context(rbd_dev->header.snapc);
+ if (hver)
+ *hver = h.obj_version;
+ rbd_dev->header.obj_version = h.obj_version;
+ rbd_dev->header.image_size = h.image_size;
rbd_dev->header.total_snaps = h.total_snaps;
rbd_dev->header.snapc = h.snapc;
rbd_dev->header.snap_names = h.snap_names;
rbd_dev->header.snap_names_len = h.snap_names_len;
rbd_dev->header.snap_sizes = h.snap_sizes;
- if (follow_seq)
- rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
- else
- rbd_dev->header.snapc->seq = snap_seq;
+ /* Free the extra copy of the object prefix */
+ WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
+ kfree(h.object_prefix);
ret = __rbd_init_snaps_header(rbd_dev);
- up_write(&rbd_dev->header.snap_rwsem);
+ up_write(&rbd_dev->header_rwsem);
+
+ return ret;
+}
+
+static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+{
+ int ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ ret = __rbd_refresh_header(rbd_dev, hver);
+ mutex_unlock(&ctl_mutex);
return ret;
}
@@ -1739,6 +1764,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
struct gendisk *disk;
struct request_queue *q;
int rc;
+ u64 segment_size;
u64 total_size = 0;
/* contact OSD, request size info about the object being mapped */
@@ -1751,7 +1777,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (rc)
return rc;
- rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+ rc = rbd_header_set_snap(rbd_dev, &total_size);
if (rc)
return rc;
@@ -1761,8 +1787,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (!disk)
goto out;
- snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
- rbd_dev->id);
+ snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
+ rbd_dev->dev_id);
disk->major = rbd_dev->major;
disk->first_minor = 0;
disk->fops = &rbd_bd_ops;
@@ -1774,11 +1800,15 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (!q)
goto out_disk;
+ /* We use the default size, but let's be explicit about it. */
+ blk_queue_physical_block_size(q, SECTOR_SIZE);
+
/* set io sizes to object size */
- blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
- blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
- blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
- blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));
+ segment_size = rbd_obj_bytes(&rbd_dev->header);
+ blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+ blk_queue_max_segment_size(q, segment_size);
+ blk_queue_io_min(q, segment_size);
+ blk_queue_io_opt(q, segment_size);
blk_queue_merge_bvec(q, rbd_merge_bvec);
disk->queue = q;
@@ -1789,7 +1819,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->q = q;
/* finally, announce the disk to the world */
- set_capacity(disk, total_size / 512ULL);
+ set_capacity(disk, total_size / SECTOR_SIZE);
add_disk(disk);
pr_info("%s: added with size 0x%llx\n",
@@ -1806,18 +1836,28 @@ out:
sysfs
*/
+static struct rbd_device *dev_to_rbd_dev(struct device *dev)
+{
+ return container_of(dev, struct rbd_device, dev);
+}
+
static ssize_t rbd_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+ sector_t size;
+
+ down_read(&rbd_dev->header_rwsem);
+ size = get_capacity(rbd_dev->disk);
+ up_read(&rbd_dev->header_rwsem);
- return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
+ return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
static ssize_t rbd_major_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%d\n", rbd_dev->major);
}
@@ -1825,32 +1865,41 @@ static ssize_t rbd_major_show(struct device *dev,
static ssize_t rbd_client_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
+ return sprintf(buf, "client%lld\n",
+ ceph_client_id(rbd_dev->rbd_client->client));
}
static ssize_t rbd_pool_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
+static ssize_t rbd_pool_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%d\n", rbd_dev->pool_id);
+}
+
static ssize_t rbd_name_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "%s\n", rbd_dev->obj);
+ return sprintf(buf, "%s\n", rbd_dev->image_name);
}
static ssize_t rbd_snap_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%s\n", rbd_dev->snap_name);
}
@@ -1860,40 +1909,34 @@ static ssize_t rbd_image_refresh(struct device *dev,
const char *buf,
size_t size)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
- int rc;
- int ret = size;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+ int ret;
- rc = __rbd_update_snaps(rbd_dev);
- if (rc < 0)
- ret = rc;
+ ret = rbd_refresh_header(rbd_dev, NULL);
- mutex_unlock(&ctl_mutex);
- return ret;
+ return ret < 0 ? ret : size;
}
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
-static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);
static struct attribute *rbd_attrs[] = {
&dev_attr_size.attr,
&dev_attr_major.attr,
&dev_attr_client_id.attr,
&dev_attr_pool.attr,
+ &dev_attr_pool_id.attr,
&dev_attr_name.attr,
&dev_attr_current_snap.attr,
&dev_attr_refresh.attr,
&dev_attr_create_snap.attr,
- &dev_attr_rollback_snap.attr,
NULL
};
@@ -1927,7 +1970,7 @@ static ssize_t rbd_snap_size_show(struct device *dev,
{
struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
- return sprintf(buf, "%lld\n", (long long)snap->size);
+ return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
static ssize_t rbd_snap_id_show(struct device *dev,
@@ -1936,7 +1979,7 @@ static ssize_t rbd_snap_id_show(struct device *dev,
{
struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
- return sprintf(buf, "%lld\n", (long long)snap->id);
+ return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
@@ -1969,15 +2012,13 @@ static struct device_type rbd_snap_device_type = {
.release = rbd_snap_dev_release,
};
-static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
- struct rbd_snap *snap)
+static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
list_del(&snap->node);
device_unregister(&snap->dev);
}
-static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
- struct rbd_snap *snap,
+static int rbd_register_snap_dev(struct rbd_snap *snap,
struct device *parent)
{
struct device *dev = &snap->dev;
@@ -1992,29 +2033,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
return ret;
}
-static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
- int i, const char *name,
- struct rbd_snap **snapp)
+static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
+ int i, const char *name)
{
+ struct rbd_snap *snap;
int ret;
- struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
+
+ snap = kzalloc(sizeof (*snap), GFP_KERNEL);
if (!snap)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
+ ret = -ENOMEM;
snap->name = kstrdup(name, GFP_KERNEL);
+ if (!snap->name)
+ goto err;
+
snap->size = rbd_dev->header.snap_sizes[i];
snap->id = rbd_dev->header.snapc->snaps[i];
if (device_is_registered(&rbd_dev->dev)) {
- ret = rbd_register_snap_dev(rbd_dev, snap,
- &rbd_dev->dev);
+ ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
if (ret < 0)
goto err;
}
- *snapp = snap;
- return 0;
+
+ return snap;
+
err:
kfree(snap->name);
kfree(snap);
- return ret;
+
+ return ERR_PTR(ret);
}
/*
@@ -2047,7 +2095,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
const char *name, *first_name;
int i = rbd_dev->header.total_snaps;
struct rbd_snap *snap, *old_snap = NULL;
- int ret;
struct list_head *p, *n;
first_name = rbd_dev->header.snap_names;
@@ -2062,8 +2109,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
cur_id = rbd_dev->header.snapc->snaps[i - 1];
if (!i || old_snap->id < cur_id) {
- /* old_snap->id was skipped, thus was removed */
- __rbd_remove_snap_dev(rbd_dev, old_snap);
+ /*
+ * old_snap->id was skipped, thus was
+ * removed. If this rbd_dev is mapped to
+ * the removed snapshot, record that it no
+ * longer exists, to prevent further I/O.
+ */
+ if (rbd_dev->snap_id == old_snap->id)
+ rbd_dev->snap_exists = false;
+ __rbd_remove_snap_dev(old_snap);
continue;
}
if (old_snap->id == cur_id) {
@@ -2083,9 +2137,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
if (cur_id >= old_snap->id)
break;
/* a new snapshot */
- ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
- if (ret < 0)
- return ret;
+ snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
+ if (IS_ERR(snap))
+ return PTR_ERR(snap);
/* note that we add it backward so using n and not p */
list_add(&snap->node, n);
@@ -2099,28 +2153,18 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
WARN_ON(1);
return -EINVAL;
}
- ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
- if (ret < 0)
- return ret;
+ snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
+ if (IS_ERR(snap))
+ return PTR_ERR(snap);
list_add(&snap->node, &rbd_dev->snaps);
}
return 0;
}
-
-static void rbd_root_dev_release(struct device *dev)
-{
-}
-
-static struct device rbd_root_dev = {
- .init_name = "rbd",
- .release = rbd_root_dev_release,
-};
-
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
- int ret = -ENOMEM;
+ int ret;
struct device *dev;
struct rbd_snap *snap;
@@ -2131,21 +2175,17 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
dev->type = &rbd_device_type;
dev->parent = &rbd_root_dev;
dev->release = rbd_dev_release;
- dev_set_name(dev, "%d", rbd_dev->id);
+ dev_set_name(dev, "%d", rbd_dev->dev_id);
ret = device_register(dev);
if (ret < 0)
- goto done_free;
+ goto out;
list_for_each_entry(snap, &rbd_dev->snaps, node) {
- ret = rbd_register_snap_dev(rbd_dev, snap,
- &rbd_dev->dev);
+ ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
if (ret < 0)
break;
}
-
- mutex_unlock(&ctl_mutex);
- return 0;
-done_free:
+out:
mutex_unlock(&ctl_mutex);
return ret;
}
@@ -2160,12 +2200,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
int ret, rc;
do {
- ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
- rbd_dev->header.obj_version);
+ ret = rbd_req_sync_watch(rbd_dev);
if (ret == -ERANGE) {
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- rc = __rbd_update_snaps(rbd_dev);
- mutex_unlock(&ctl_mutex);
+ rc = rbd_refresh_header(rbd_dev, NULL);
if (rc < 0)
return rc;
}
@@ -2174,102 +2211,305 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
return ret;
}
+static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
+
+/*
+ * Get a unique rbd identifier for the given new rbd_dev, and add
+ * the rbd_dev to the global list. The minimum rbd id is 1.
+ */
+static void rbd_id_get(struct rbd_device *rbd_dev)
+{
+ rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
+
+ spin_lock(&rbd_dev_list_lock);
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+ spin_unlock(&rbd_dev_list_lock);
+}
+
+/*
+ * Remove an rbd_dev from the global list, and record that its
+ * identifier is no longer in use.
+ */
+static void rbd_id_put(struct rbd_device *rbd_dev)
+{
+ struct list_head *tmp;
+ int rbd_id = rbd_dev->dev_id;
+ int max_id;
+
+ BUG_ON(rbd_id < 1);
+
+ spin_lock(&rbd_dev_list_lock);
+ list_del_init(&rbd_dev->node);
+
+ /*
+ * If the id being "put" is not the current maximum, there
+ * is nothing special we need to do.
+ */
+ if (rbd_id != atomic64_read(&rbd_id_max)) {
+ spin_unlock(&rbd_dev_list_lock);
+ return;
+ }
+
+ /*
+ * We need to update the current maximum id. Search the
+ * list to find out what it is. We're more likely to find
+ * the maximum at the end, so search the list backward.
+ */
+ max_id = 0;
+ list_for_each_prev(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_id > max_id)
+ max_id = rbd_id;
+ }
+ spin_unlock(&rbd_dev_list_lock);
+
+ /*
+ * The max id could have been updated by rbd_id_get(), in
+ * which case it now accurately reflects the new maximum.
+ * Be careful not to overwrite the maximum value in that
+ * case.
+ */
+ atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
+}
+
+/*
+ * Skips over white space at *buf, and updates *buf to point to the
+ * first found non-space character (if any). Returns the length of
+ * the token (string of non-white space characters) found. Note
+ * that *buf must be terminated with '\0'.
+ */
+static inline size_t next_token(const char **buf)
+{
+ /*
+ * These are the characters that produce nonzero for
+ * isspace() in the "C" and "POSIX" locales.
+ */
+ const char *spaces = " \f\n\r\t\v";
+
+ *buf += strspn(*buf, spaces); /* Find start of token */
+
+ return strcspn(*buf, spaces); /* Return token length */
+}
+
+/*
+ * Finds the next token in *buf, and if the provided token buffer is
+ * big enough, copies the found token into it. The result, if
+ * copied, is guaranteed to be terminated with '\0'. Note that *buf
+ * must be terminated with '\0' on entry.
+ *
+ * Returns the length of the token found (not including the '\0').
+ * Return value will be 0 if no token is found, and it will be >=
+ * token_size if the token would not fit.
+ *
+ * The *buf pointer will be updated to point beyond the end of the
+ * found token. Note that this occurs even if the token buffer is
+ * too small to hold it.
+ */
+static inline size_t copy_token(const char **buf,
+ char *token,
+ size_t token_size)
+{
+ size_t len;
+
+ len = next_token(buf);
+ if (len < token_size) {
+ memcpy(token, *buf, len);
+ *(token + len) = '\0';
+ }
+ *buf += len;
+
+ return len;
+}
+
+/*
+ * Finds the next token in *buf, dynamically allocates a buffer big
+ * enough to hold a copy of it, and copies the token into the new
+ * buffer. The copy is guaranteed to be terminated with '\0'. Note
+ * that a duplicate buffer is created even for a zero-length token.
+ *
+ * Returns a pointer to the newly-allocated duplicate, or a null
+ * pointer if memory for the duplicate was not available. If
+ * the lenp argument is a non-null pointer, the length of the token
+ * (not including the '\0') is returned in *lenp.
+ *
+ * If successful, the *buf pointer will be updated to point beyond
+ * the end of the found token.
+ *
+ * Note: uses GFP_KERNEL for allocation.
+ */
+static inline char *dup_token(const char **buf, size_t *lenp)
+{
+ char *dup;
+ size_t len;
+
+ len = next_token(buf);
+ dup = kmalloc(len + 1, GFP_KERNEL);
+ if (!dup)
+ return NULL;
+
+ memcpy(dup, *buf, len);
+ *(dup + len) = '\0';
+ *buf += len;
+
+ if (lenp)
+ *lenp = len;
+
+ return dup;
+}
+
+/*
+ * This fills in the pool_name, image_name, image_name_len, snap_name,
+ * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
+ * on the list of monitor addresses and other options provided via
+ * /sys/bus/rbd/add.
+ *
+ * Note: rbd_dev is assumed to have been initially zero-filled.
+ */
+static int rbd_add_parse_args(struct rbd_device *rbd_dev,
+ const char *buf,
+ const char **mon_addrs,
+ size_t *mon_addrs_size,
+ char *options,
+ size_t options_size)
+{
+ size_t len;
+ int ret;
+
+ /* The first four tokens are required */
+
+ len = next_token(&buf);
+ if (!len)
+ return -EINVAL;
+ *mon_addrs_size = len + 1;
+ *mon_addrs = buf;
+
+ buf += len;
+
+ len = copy_token(&buf, options, options_size);
+ if (!len || len >= options_size)
+ return -EINVAL;
+
+ ret = -ENOMEM;
+ rbd_dev->pool_name = dup_token(&buf, NULL);
+ if (!rbd_dev->pool_name)
+ goto out_err;
+
+ rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
+ if (!rbd_dev->image_name)
+ goto out_err;
+
+ /* Create the name of the header object */
+
+ rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
+ + sizeof (RBD_SUFFIX),
+ GFP_KERNEL);
+ if (!rbd_dev->header_name)
+ goto out_err;
+ sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
+
+ /*
+ * The snapshot name is optional. If none is is supplied,
+ * we use the default value.
+ */
+ rbd_dev->snap_name = dup_token(&buf, &len);
+ if (!rbd_dev->snap_name)
+ goto out_err;
+ if (!len) {
+ /* Replace the empty name with the default */
+ kfree(rbd_dev->snap_name);
+ rbd_dev->snap_name
+ = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
+ if (!rbd_dev->snap_name)
+ goto out_err;
+
+ memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+ sizeof (RBD_SNAP_HEAD_NAME));
+ }
+
+ return 0;
+
+out_err:
+ kfree(rbd_dev->header_name);
+ kfree(rbd_dev->image_name);
+ kfree(rbd_dev->pool_name);
+ rbd_dev->pool_name = NULL;
+
+ return ret;
+}
+
static ssize_t rbd_add(struct bus_type *bus,
const char *buf,
size_t count)
{
- struct ceph_osd_client *osdc;
- struct rbd_device *rbd_dev;
- ssize_t rc = -ENOMEM;
- int irc, new_id = 0;
- struct list_head *tmp;
- char *mon_dev_name;
char *options;
+ struct rbd_device *rbd_dev = NULL;
+ const char *mon_addrs = NULL;
+ size_t mon_addrs_size = 0;
+ struct ceph_osd_client *osdc;
+ int rc = -ENOMEM;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
- mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
- if (!mon_dev_name)
- goto err_out_mod;
-
- options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ options = kmalloc(count, GFP_KERNEL);
if (!options)
- goto err_mon_dev;
-
- /* new rbd_device object */
+ goto err_nomem;
rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
if (!rbd_dev)
- goto err_out_opt;
+ goto err_nomem;
/* static rbd_device initialization */
spin_lock_init(&rbd_dev->lock);
INIT_LIST_HEAD(&rbd_dev->node);
INIT_LIST_HEAD(&rbd_dev->snaps);
+ init_rwsem(&rbd_dev->header_rwsem);
/* generate unique id: find highest unique id, add one */
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &rbd_dev_list) {
- struct rbd_device *rbd_dev;
+ rbd_id_get(rbd_dev);
- rbd_dev = list_entry(tmp, struct rbd_device, node);
- if (rbd_dev->id >= new_id)
- new_id = rbd_dev->id + 1;
- }
-
- rbd_dev->id = new_id;
-
- /* add to global list */
- list_add_tail(&rbd_dev->node, &rbd_dev_list);
+ /* Fill in the device name, now that we have its id. */
+ BUILD_BUG_ON(DEV_NAME_LEN
+ < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
+ sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
/* parse add command */
- if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
- "%" __stringify(RBD_MAX_OPT_LEN) "s "
- "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
- "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
- "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
- mon_dev_name, options, rbd_dev->pool_name,
- rbd_dev->obj, rbd_dev->snap_name) < 4) {
- rc = -EINVAL;
- goto err_out_slot;
- }
-
- if (rbd_dev->snap_name[0] == 0)
- rbd_dev->snap_name[0] = '-';
-
- rbd_dev->obj_len = strlen(rbd_dev->obj);
- snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
- rbd_dev->obj, RBD_SUFFIX);
-
- /* initialize rest of new object */
- snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
- rc = rbd_get_client(rbd_dev, mon_dev_name, options);
- if (rc < 0)
- goto err_out_slot;
+ rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
+ options, count);
+ if (rc)
+ goto err_put_id;
- mutex_unlock(&ctl_mutex);
+ rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
+ options);
+ if (IS_ERR(rbd_dev->rbd_client)) {
+ rc = PTR_ERR(rbd_dev->rbd_client);
+ goto err_put_id;
+ }
/* pick the pool */
- osdc = &rbd_dev->client->osdc;
+ osdc = &rbd_dev->rbd_client->client->osdc;
rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
if (rc < 0)
goto err_out_client;
- rbd_dev->poolid = rc;
+ rbd_dev->pool_id = rc;
/* register our block device */
- irc = register_blkdev(0, rbd_dev->name);
- if (irc < 0) {
- rc = irc;
+ rc = register_blkdev(0, rbd_dev->name);
+ if (rc < 0)
goto err_out_client;
- }
- rbd_dev->major = irc;
+ rbd_dev->major = rc;
rc = rbd_bus_add_dev(rbd_dev);
if (rc)
goto err_out_blkdev;
- /* set up and announce blkdev mapping */
+ /*
+ * At this point cleanup in the event of an error is the job
+ * of the sysfs code (initiated by rbd_bus_del_dev()).
+ *
+ * Set up and announce blkdev mapping.
+ */
rc = rbd_init_disk(rbd_dev);
if (rc)
goto err_out_bus;
@@ -2281,66 +2521,76 @@ static ssize_t rbd_add(struct bus_type *bus,
return count;
err_out_bus:
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- list_del_init(&rbd_dev->node);
- mutex_unlock(&ctl_mutex);
-
/* this will also clean up rest of rbd_dev stuff */
rbd_bus_del_dev(rbd_dev);
kfree(options);
- kfree(mon_dev_name);
return rc;
err_out_blkdev:
unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
rbd_put_client(rbd_dev);
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-err_out_slot:
- list_del_init(&rbd_dev->node);
- mutex_unlock(&ctl_mutex);
-
+err_put_id:
+ if (rbd_dev->pool_name) {
+ kfree(rbd_dev->snap_name);
+ kfree(rbd_dev->header_name);
+ kfree(rbd_dev->image_name);
+ kfree(rbd_dev->pool_name);
+ }
+ rbd_id_put(rbd_dev);
+err_nomem:
kfree(rbd_dev);
-err_out_opt:
kfree(options);
-err_mon_dev:
- kfree(mon_dev_name);
-err_out_mod:
+
dout("Error adding device %s\n", buf);
module_put(THIS_MODULE);
- return rc;
+
+ return (ssize_t) rc;
}
-static struct rbd_device *__rbd_get_dev(unsigned long id)
+static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
struct list_head *tmp;
struct rbd_device *rbd_dev;
+ spin_lock(&rbd_dev_list_lock);
list_for_each(tmp, &rbd_dev_list) {
rbd_dev = list_entry(tmp, struct rbd_device, node);
- if (rbd_dev->id == id)
+ if (rbd_dev->dev_id == dev_id) {
+ spin_unlock(&rbd_dev_list_lock);
return rbd_dev;
+ }
}
+ spin_unlock(&rbd_dev_list_lock);
return NULL;
}
static void rbd_dev_release(struct device *dev)
{
- struct rbd_device *rbd_dev =
- container_of(dev, struct rbd_device, dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ if (rbd_dev->watch_request) {
+ struct ceph_client *client = rbd_dev->rbd_client->client;
- if (rbd_dev->watch_request)
- ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
+ ceph_osdc_unregister_linger_request(&client->osdc,
rbd_dev->watch_request);
+ }
if (rbd_dev->watch_event)
- rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
+ rbd_req_sync_unwatch(rbd_dev);
rbd_put_client(rbd_dev);
/* clean up and free blkdev */
rbd_free_disk(rbd_dev);
unregister_blkdev(rbd_dev->major, rbd_dev->name);
+
+ /* done with the id, and with the rbd_dev */
+ kfree(rbd_dev->snap_name);
+ kfree(rbd_dev->header_name);
+ kfree(rbd_dev->pool_name);
+ kfree(rbd_dev->image_name);
+ rbd_id_put(rbd_dev);
kfree(rbd_dev);
/* release module ref */
@@ -2373,8 +2623,6 @@ static ssize_t rbd_remove(struct bus_type *bus,
goto done;
}
- list_del_init(&rbd_dev->node);
-
__rbd_remove_all_snaps(rbd_dev);
rbd_bus_del_dev(rbd_dev);
@@ -2388,7 +2636,7 @@ static ssize_t rbd_snap_add(struct device *dev,
const char *buf,
size_t count)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
int ret;
char *name = kmalloc(count + 1, GFP_KERNEL);
if (!name)
@@ -2403,7 +2651,7 @@ static ssize_t rbd_snap_add(struct device *dev,
if (ret < 0)
goto err_unlock;
- ret = __rbd_update_snaps(rbd_dev);
+ ret = __rbd_refresh_header(rbd_dev, NULL);
if (ret < 0)
goto err_unlock;
@@ -2412,7 +2660,7 @@ static ssize_t rbd_snap_add(struct device *dev,
mutex_unlock(&ctl_mutex);
/* make a best effort, don't error if failed */
- rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
+ rbd_req_sync_notify(rbd_dev);
ret = count;
kfree(name);
@@ -2424,70 +2672,6 @@ err_unlock:
return ret;
}
-static ssize_t rbd_snap_rollback(struct device *dev,
- struct device_attribute *attr,
- const char *buf,
- size_t count)
-{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
- int ret;
- u64 snapid;
- u64 cur_ofs;
- char *seg_name = NULL;
- char *snap_name = kmalloc(count + 1, GFP_KERNEL);
- ret = -ENOMEM;
- if (!snap_name)
- return ret;
-
- /* parse snaps add command */
- snprintf(snap_name, count, "%s", buf);
- seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
- if (!seg_name)
- goto done;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
- if (ret < 0)
- goto done_unlock;
-
- dout("snapid=%lld\n", snapid);
-
- cur_ofs = 0;
- while (cur_ofs < rbd_dev->header.image_size) {
- cur_ofs += rbd_get_segment(&rbd_dev->header,
- rbd_dev->obj,
- cur_ofs, (u64)-1,
- seg_name, NULL);
- dout("seg_name=%s\n", seg_name);
-
- ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
- if (ret < 0)
- pr_warning("could not roll back obj %s err=%d\n",
- seg_name, ret);
- }
-
- ret = __rbd_update_snaps(rbd_dev);
- if (ret < 0)
- goto done_unlock;
-
- ret = count;
-
-done_unlock:
- mutex_unlock(&ctl_mutex);
-done:
- kfree(seg_name);
- kfree(snap_name);
-
- return ret;
-}
-
-static struct bus_attribute rbd_bus_attrs[] = {
- __ATTR(add, S_IWUSR, NULL, rbd_add),
- __ATTR(remove, S_IWUSR, NULL, rbd_remove),
- __ATTR_NULL
-};
-
/*
* create control files in sysfs
* /sys/bus/rbd/...
@@ -2496,21 +2680,21 @@ static int rbd_sysfs_init(void)
{
int ret;
- rbd_bus_type.bus_attrs = rbd_bus_attrs;
-
- ret = bus_register(&rbd_bus_type);
- if (ret < 0)
+ ret = device_register(&rbd_root_dev);
+ if (ret < 0)
return ret;
- ret = device_register(&rbd_root_dev);
+ ret = bus_register(&rbd_bus_type);
+ if (ret < 0)
+ device_unregister(&rbd_root_dev);
return ret;
}
static void rbd_sysfs_cleanup(void)
{
- device_unregister(&rbd_root_dev);
bus_unregister(&rbd_bus_type);
+ device_unregister(&rbd_root_dev);
}
int __init rbd_init(void)
@@ -2520,8 +2704,7 @@ int __init rbd_init(void)
rc = rbd_sysfs_init();
if (rc)
return rc;
- spin_lock_init(&node_lock);
- pr_info("loaded " DRV_NAME_LONG "\n");
+ pr_info("loaded " RBD_DRV_NAME_LONG "\n");
return 0;
}
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index fc6c678aa2cb..0924e9e41a60 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -31,7 +31,6 @@
#define RBD_MIN_OBJ_ORDER 16
#define RBD_MAX_OBJ_ORDER 30
-#define RBD_MAX_OBJ_NAME_LEN 96
#define RBD_MAX_SEG_NAME_LEN 128
#define RBD_COMP_NONE 0
@@ -41,10 +40,6 @@
#define RBD_HEADER_SIGNATURE "RBD"
#define RBD_HEADER_VERSION "001.005"
-struct rbd_info {
- __le64 max_id;
-} __attribute__ ((packed));
-
struct rbd_image_snap_ondisk {
__le64 id;
__le64 image_size;
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 48e8fee9f2d4..9dcf76a10bb6 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -839,10 +839,7 @@ static struct vio_driver vdc_port_driver = {
.id_table = vdc_port_match,
.probe = vdc_port_probe,
.remove = vdc_port_remove,
- .driver = {
- .name = "vdc_port",
- .owner = THIS_MODULE,
- }
+ .name = "vdc_port",
};
static int __init vdc_init(void)
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index fd5adcd55944..6d5a914b9619 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -26,7 +26,6 @@
#include <linux/delay.h>
#include <linux/platform_device.h>
-#include <asm/macintosh.h>
#include <asm/mac_via.h>
#define CARDNAME "swim"
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index ae3e167e17ad..89ddab127e33 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -16,6 +16,8 @@
* handle GCR disks
*/
+#undef DEBUG
+
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -36,13 +38,11 @@
#include <asm/machdep.h>
#include <asm/pmac_feature.h>
-static DEFINE_MUTEX(swim3_mutex);
-static struct request_queue *swim3_queue;
-static struct gendisk *disks[2];
-static struct request *fd_req;
-
#define MAX_FLOPPIES 2
+static DEFINE_MUTEX(swim3_mutex);
+static struct gendisk *disks[MAX_FLOPPIES];
+
enum swim_state {
idle,
locating,
@@ -177,7 +177,6 @@ struct swim3 {
struct floppy_state {
enum swim_state state;
- spinlock_t lock;
struct swim3 __iomem *swim3; /* hardware registers */
struct dbdma_regs __iomem *dma; /* DMA controller registers */
int swim3_intr; /* interrupt number for SWIM3 */
@@ -204,8 +203,20 @@ struct floppy_state {
int wanted;
struct macio_dev *mdev;
char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
+ int index;
+ struct request *cur_req;
};
+#define swim3_err(fmt, arg...) dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_warn(fmt, arg...) dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_info(fmt, arg...) dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+
+#ifdef DEBUG
+#define swim3_dbg(fmt, arg...) dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#else
+#define swim3_dbg(fmt, arg...) do { } while(0)
+#endif
+
static struct floppy_state floppy_states[MAX_FLOPPIES];
static int floppy_count = 0;
static DEFINE_SPINLOCK(swim3_lock);
@@ -224,17 +235,8 @@ static unsigned short write_postamble[] = {
0, 0, 0, 0, 0, 0
};
-static void swim3_select(struct floppy_state *fs, int sel);
-static void swim3_action(struct floppy_state *fs, int action);
-static int swim3_readbit(struct floppy_state *fs, int bit);
-static void do_fd_request(struct request_queue * q);
-static void start_request(struct floppy_state *fs);
-static void set_timeout(struct floppy_state *fs, int nticks,
- void (*proc)(unsigned long));
-static void scan_track(struct floppy_state *fs);
static void seek_track(struct floppy_state *fs, int n);
static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count);
-static void setup_transfer(struct floppy_state *fs);
static void act(struct floppy_state *fs);
static void scan_timeout(unsigned long data);
static void seek_timeout(unsigned long data);
@@ -254,18 +256,21 @@ static unsigned int floppy_check_events(struct gendisk *disk,
unsigned int clearing);
static int floppy_revalidate(struct gendisk *disk);
-static bool swim3_end_request(int err, unsigned int nr_bytes)
+static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes)
{
- if (__blk_end_request(fd_req, err, nr_bytes))
- return true;
+ struct request *req = fs->cur_req;
+ int rc;
- fd_req = NULL;
- return false;
-}
+ swim3_dbg(" end request, err=%d nr_bytes=%d, cur_req=%p\n",
+ err, nr_bytes, req);
-static bool swim3_end_request_cur(int err)
-{
- return swim3_end_request(err, blk_rq_cur_bytes(fd_req));
+ if (err)
+ nr_bytes = blk_rq_cur_bytes(req);
+ rc = __blk_end_request(req, err, nr_bytes);
+ if (rc)
+ return true;
+ fs->cur_req = NULL;
+ return false;
}
static void swim3_select(struct floppy_state *fs, int sel)
@@ -303,50 +308,53 @@ static int swim3_readbit(struct floppy_state *fs, int bit)
return (stat & DATA) == 0;
}
-static void do_fd_request(struct request_queue * q)
-{
- int i;
-
- for(i=0; i<floppy_count; i++) {
- struct floppy_state *fs = &floppy_states[i];
- if (fs->mdev->media_bay &&
- check_media_bay(fs->mdev->media_bay) != MB_FD)
- continue;
- start_request(fs);
- }
-}
-
static void start_request(struct floppy_state *fs)
{
struct request *req;
unsigned long x;
+ swim3_dbg("start request, initial state=%d\n", fs->state);
+
if (fs->state == idle && fs->wanted) {
fs->state = available;
wake_up(&fs->wait);
return;
}
while (fs->state == idle) {
- if (!fd_req) {
- fd_req = blk_fetch_request(swim3_queue);
- if (!fd_req)
+ swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req);
+ if (!fs->cur_req) {
+ fs->cur_req = blk_fetch_request(disks[fs->index]->queue);
+ swim3_dbg(" fetched request %p\n", fs->cur_req);
+ if (!fs->cur_req)
break;
}
- req = fd_req;
-#if 0
- printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
- req->rq_disk->disk_name, req->cmd,
- (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
- printk(" errors=%d current_nr_sectors=%u\n",
- req->errors, blk_rq_cur_sectors(req));
+ req = fs->cur_req;
+
+ if (fs->mdev->media_bay &&
+ check_media_bay(fs->mdev->media_bay) != MB_FD) {
+ swim3_dbg("%s", " media bay absent, dropping req\n");
+ swim3_end_request(fs, -ENODEV, 0);
+ continue;
+ }
+
+#if 0 /* This is really too verbose */
+ swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
+ req->rq_disk->disk_name, req->cmd,
+ (long)blk_rq_pos(req), blk_rq_sectors(req),
+ req->buffer);
+ swim3_dbg(" errors=%d current_nr_sectors=%u\n",
+ req->errors, blk_rq_cur_sectors(req));
#endif
if (blk_rq_pos(req) >= fs->total_secs) {
- swim3_end_request_cur(-EIO);
+ swim3_dbg(" pos out of bounds (%ld, max is %ld)\n",
+ (long)blk_rq_pos(req), (long)fs->total_secs);
+ swim3_end_request(fs, -EIO, 0);
continue;
}
if (fs->ejected) {
- swim3_end_request_cur(-EIO);
+ swim3_dbg("%s", " disk ejected\n");
+ swim3_end_request(fs, -EIO, 0);
continue;
}
@@ -354,7 +362,8 @@ static void start_request(struct floppy_state *fs)
if (fs->write_prot < 0)
fs->write_prot = swim3_readbit(fs, WRITE_PROT);
if (fs->write_prot) {
- swim3_end_request_cur(-EIO);
+ swim3_dbg("%s", " try to write, disk write protected\n");
+ swim3_end_request(fs, -EIO, 0);
continue;
}
}
@@ -369,7 +378,6 @@ static void start_request(struct floppy_state *fs)
x = ((long)blk_rq_pos(req)) % fs->secpercyl;
fs->head = x / fs->secpertrack;
fs->req_sector = x % fs->secpertrack + 1;
- fd_req = req;
fs->state = do_transfer;
fs->retries = 0;
@@ -377,12 +385,14 @@ static void start_request(struct floppy_state *fs)
}
}
+static void do_fd_request(struct request_queue * q)
+{
+ start_request(q->queuedata);
+}
+
static void set_timeout(struct floppy_state *fs, int nticks,
void (*proc)(unsigned long))
{
- unsigned long flags;
-
- spin_lock_irqsave(&fs->lock, flags);
if (fs->timeout_pending)
del_timer(&fs->timeout);
fs->timeout.expires = jiffies + nticks;
@@ -390,7 +400,6 @@ static void set_timeout(struct floppy_state *fs, int nticks,
fs->timeout.data = (unsigned long) fs;
add_timer(&fs->timeout);
fs->timeout_pending = 1;
- spin_unlock_irqrestore(&fs->lock, flags);
}
static inline void scan_track(struct floppy_state *fs)
@@ -442,40 +451,45 @@ static inline void setup_transfer(struct floppy_state *fs)
struct swim3 __iomem *sw = fs->swim3;
struct dbdma_cmd *cp = fs->dma_cmd;
struct dbdma_regs __iomem *dr = fs->dma;
+ struct request *req = fs->cur_req;
- if (blk_rq_cur_sectors(fd_req) <= 0) {
- printk(KERN_ERR "swim3: transfer 0 sectors?\n");
+ if (blk_rq_cur_sectors(req) <= 0) {
+ swim3_warn("%s", "Transfer 0 sectors ?\n");
return;
}
- if (rq_data_dir(fd_req) == WRITE)
+ if (rq_data_dir(req) == WRITE)
n = 1;
else {
n = fs->secpertrack - fs->req_sector + 1;
- if (n > blk_rq_cur_sectors(fd_req))
- n = blk_rq_cur_sectors(fd_req);
+ if (n > blk_rq_cur_sectors(req))
+ n = blk_rq_cur_sectors(req);
}
+
+ swim3_dbg(" setup xfer at sect %d (of %d) head %d for %d\n",
+ fs->req_sector, fs->secpertrack, fs->head, n);
+
fs->scount = n;
swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
out_8(&sw->sector, fs->req_sector);
out_8(&sw->nsect, n);
out_8(&sw->gap3, 0);
out_le32(&dr->cmdptr, virt_to_bus(cp));
- if (rq_data_dir(fd_req) == WRITE) {
+ if (rq_data_dir(req) == WRITE) {
/* Set up 3 dma commands: write preamble, data, postamble */
init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
++cp;
- init_dma(cp, OUTPUT_MORE, fd_req->buffer, 512);
+ init_dma(cp, OUTPUT_MORE, req->buffer, 512);
++cp;
init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
} else {
- init_dma(cp, INPUT_LAST, fd_req->buffer, n * 512);
+ init_dma(cp, INPUT_LAST, req->buffer, n * 512);
}
++cp;
out_le16(&cp->command, DBDMA_STOP);
out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
in_8(&sw->error);
out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
- if (rq_data_dir(fd_req) == WRITE)
+ if (rq_data_dir(req) == WRITE)
out_8(&sw->control_bis, WRITE_SECTORS);
in_8(&sw->intr);
out_le32(&dr->control, (RUN << 16) | RUN);
@@ -488,12 +502,16 @@ static inline void setup_transfer(struct floppy_state *fs)
static void act(struct floppy_state *fs)
{
for (;;) {
+ swim3_dbg(" act loop, state=%d, req_cyl=%d, cur_cyl=%d\n",
+ fs->state, fs->req_cyl, fs->cur_cyl);
+
switch (fs->state) {
case idle:
return; /* XXX shouldn't get here */
case locating:
if (swim3_readbit(fs, TRACK_ZERO)) {
+ swim3_dbg("%s", " locate track 0\n");
fs->cur_cyl = 0;
if (fs->req_cyl == 0)
fs->state = do_transfer;
@@ -511,7 +529,7 @@ static void act(struct floppy_state *fs)
break;
}
if (fs->req_cyl == fs->cur_cyl) {
- printk("whoops, seeking 0\n");
+ swim3_warn("%s", "Whoops, seeking 0\n");
fs->state = do_transfer;
break;
}
@@ -527,7 +545,9 @@ static void act(struct floppy_state *fs)
case do_transfer:
if (fs->cur_cyl != fs->req_cyl) {
if (fs->retries > 5) {
- swim3_end_request_cur(-EIO);
+ swim3_err("Wrong cylinder in transfer, want: %d got %d\n",
+ fs->req_cyl, fs->cur_cyl);
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
return;
}
@@ -542,7 +562,7 @@ static void act(struct floppy_state *fs)
return;
default:
- printk(KERN_ERR"swim3: unknown state %d\n", fs->state);
+ swim3_err("Unknown state %d\n", fs->state);
return;
}
}
@@ -552,59 +572,75 @@ static void scan_timeout(unsigned long data)
{
struct floppy_state *fs = (struct floppy_state *) data;
struct swim3 __iomem *sw = fs->swim3;
+ unsigned long flags;
+
+ swim3_dbg("* scan timeout, state=%d\n", fs->state);
+ spin_lock_irqsave(&swim3_lock, flags);
fs->timeout_pending = 0;
out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
out_8(&sw->select, RELAX);
out_8(&sw->intr_enable, 0);
fs->cur_cyl = -1;
if (fs->retries > 5) {
- swim3_end_request_cur(-EIO);
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
start_request(fs);
} else {
fs->state = jogging;
act(fs);
}
+ spin_unlock_irqrestore(&swim3_lock, flags);
}
static void seek_timeout(unsigned long data)
{
struct floppy_state *fs = (struct floppy_state *) data;
struct swim3 __iomem *sw = fs->swim3;
+ unsigned long flags;
+
+ swim3_dbg("* seek timeout, state=%d\n", fs->state);
+ spin_lock_irqsave(&swim3_lock, flags);
fs->timeout_pending = 0;
out_8(&sw->control_bic, DO_SEEK);
out_8(&sw->select, RELAX);
out_8(&sw->intr_enable, 0);
- printk(KERN_ERR "swim3: seek timeout\n");
- swim3_end_request_cur(-EIO);
+ swim3_err("%s", "Seek timeout\n");
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
start_request(fs);
+ spin_unlock_irqrestore(&swim3_lock, flags);
}
static void settle_timeout(unsigned long data)
{
struct floppy_state *fs = (struct floppy_state *) data;
struct swim3 __iomem *sw = fs->swim3;
+ unsigned long flags;
+
+ swim3_dbg("* settle timeout, state=%d\n", fs->state);
+ spin_lock_irqsave(&swim3_lock, flags);
fs->timeout_pending = 0;
if (swim3_readbit(fs, SEEK_COMPLETE)) {
out_8(&sw->select, RELAX);
fs->state = locating;
act(fs);
- return;
+ goto unlock;
}
out_8(&sw->select, RELAX);
if (fs->settle_time < 2*HZ) {
++fs->settle_time;
set_timeout(fs, 1, settle_timeout);
- return;
+ goto unlock;
}
- printk(KERN_ERR "swim3: seek settle timeout\n");
- swim3_end_request_cur(-EIO);
+ swim3_err("%s", "Seek settle timeout\n");
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
start_request(fs);
+ unlock:
+ spin_unlock_irqrestore(&swim3_lock, flags);
}
static void xfer_timeout(unsigned long data)
@@ -612,8 +648,12 @@ static void xfer_timeout(unsigned long data)
struct floppy_state *fs = (struct floppy_state *) data;
struct swim3 __iomem *sw = fs->swim3;
struct dbdma_regs __iomem *dr = fs->dma;
+ unsigned long flags;
int n;
+ swim3_dbg("* xfer timeout, state=%d\n", fs->state);
+
+ spin_lock_irqsave(&swim3_lock, flags);
fs->timeout_pending = 0;
out_le32(&dr->control, RUN << 16);
/* We must wait a bit for dbdma to stop */
@@ -622,12 +662,13 @@ static void xfer_timeout(unsigned long data)
out_8(&sw->intr_enable, 0);
out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
out_8(&sw->select, RELAX);
- printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
- (rq_data_dir(fd_req)==WRITE? "writ": "read"),
- (long)blk_rq_pos(fd_req));
- swim3_end_request_cur(-EIO);
+ swim3_err("Timeout %sing sector %ld\n",
+ (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"),
+ (long)blk_rq_pos(fs->cur_req));
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
start_request(fs);
+ spin_unlock_irqrestore(&swim3_lock, flags);
}
static irqreturn_t swim3_interrupt(int irq, void *dev_id)
@@ -638,12 +679,17 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
int stat, resid;
struct dbdma_regs __iomem *dr;
struct dbdma_cmd *cp;
+ unsigned long flags;
+ struct request *req = fs->cur_req;
+
+ swim3_dbg("* interrupt, state=%d\n", fs->state);
+ spin_lock_irqsave(&swim3_lock, flags);
intr = in_8(&sw->intr);
err = (intr & ERROR_INTR)? in_8(&sw->error): 0;
if ((intr & ERROR_INTR) && fs->state != do_transfer)
- printk(KERN_ERR "swim3_interrupt, state=%d, dir=%x, intr=%x, err=%x\n",
- fs->state, rq_data_dir(fd_req), intr, err);
+ swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n",
+ fs->state, rq_data_dir(req), intr, err);
switch (fs->state) {
case locating:
if (intr & SEEN_SECTOR) {
@@ -653,10 +699,10 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
del_timer(&fs->timeout);
fs->timeout_pending = 0;
if (sw->ctrack == 0xff) {
- printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
+ swim3_err("%s", "Seen sector but cyl=ff?\n");
fs->cur_cyl = -1;
if (fs->retries > 5) {
- swim3_end_request_cur(-EIO);
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
start_request(fs);
} else {
@@ -668,8 +714,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
fs->cur_cyl = sw->ctrack;
fs->cur_sector = sw->csect;
if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl)
- printk(KERN_ERR "swim3: expected cyl %d, got %d\n",
- fs->expect_cyl, fs->cur_cyl);
+ swim3_err("Expected cyl %d, got %d\n",
+ fs->expect_cyl, fs->cur_cyl);
fs->state = do_transfer;
act(fs);
}
@@ -704,7 +750,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
fs->timeout_pending = 0;
dr = fs->dma;
cp = fs->dma_cmd;
- if (rq_data_dir(fd_req) == WRITE)
+ if (rq_data_dir(req) == WRITE)
++cp;
/*
* Check that the main data transfer has finished.
@@ -729,31 +775,32 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
if (intr & ERROR_INTR) {
n = fs->scount - 1 - resid / 512;
if (n > 0) {
- blk_update_request(fd_req, 0, n << 9);
+ blk_update_request(req, 0, n << 9);
fs->req_sector += n;
}
if (fs->retries < 5) {
++fs->retries;
act(fs);
} else {
- printk("swim3: error %sing block %ld (err=%x)\n",
- rq_data_dir(fd_req) == WRITE? "writ": "read",
- (long)blk_rq_pos(fd_req), err);
- swim3_end_request_cur(-EIO);
+ swim3_err("Error %sing block %ld (err=%x)\n",
+ rq_data_dir(req) == WRITE? "writ": "read",
+ (long)blk_rq_pos(req), err);
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
}
} else {
if ((stat & ACTIVE) == 0 || resid != 0) {
/* musta been an error */
- printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
- printk(KERN_ERR " state=%d, dir=%x, intr=%x, err=%x\n",
- fs->state, rq_data_dir(fd_req), intr, err);
- swim3_end_request_cur(-EIO);
+ swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid);
+ swim3_err(" state=%d, dir=%x, intr=%x, err=%x\n",
+ fs->state, rq_data_dir(req), intr, err);
+ swim3_end_request(fs, -EIO, 0);
fs->state = idle;
start_request(fs);
break;
}
- if (swim3_end_request(0, fs->scount << 9)) {
+ fs->retries = 0;
+ if (swim3_end_request(fs, 0, fs->scount << 9)) {
fs->req_sector += fs->scount;
if (fs->req_sector > fs->secpertrack) {
fs->req_sector -= fs->secpertrack;
@@ -770,8 +817,9 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
start_request(fs);
break;
default:
- printk(KERN_ERR "swim3: don't know what to do in state %d\n", fs->state);
+ swim3_err("Don't know what to do in state %d\n", fs->state);
}
+ spin_unlock_irqrestore(&swim3_lock, flags);
return IRQ_HANDLED;
}
@@ -781,26 +829,31 @@ static void fd_dma_interrupt(int irq, void *dev_id)
}
*/
+/* Called under the mutex to grab exclusive access to a drive */
static int grab_drive(struct floppy_state *fs, enum swim_state state,
int interruptible)
{
unsigned long flags;
- spin_lock_irqsave(&fs->lock, flags);
- if (fs->state != idle) {
+ swim3_dbg("%s", "-> grab drive\n");
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ if (fs->state != idle && fs->state != available) {
++fs->wanted;
while (fs->state != available) {
+ spin_unlock_irqrestore(&swim3_lock, flags);
if (interruptible && signal_pending(current)) {
--fs->wanted;
- spin_unlock_irqrestore(&fs->lock, flags);
return -EINTR;
}
interruptible_sleep_on(&fs->wait);
+ spin_lock_irqsave(&swim3_lock, flags);
}
--fs->wanted;
}
fs->state = state;
- spin_unlock_irqrestore(&fs->lock, flags);
+ spin_unlock_irqrestore(&swim3_lock, flags);
+
return 0;
}
@@ -808,10 +861,12 @@ static void release_drive(struct floppy_state *fs)
{
unsigned long flags;
- spin_lock_irqsave(&fs->lock, flags);
+ swim3_dbg("%s", "-> release drive\n");
+
+ spin_lock_irqsave(&swim3_lock, flags);
fs->state = idle;
start_request(fs);
- spin_unlock_irqrestore(&fs->lock, flags);
+ spin_unlock_irqrestore(&swim3_lock, flags);
}
static int fd_eject(struct floppy_state *fs)
@@ -966,6 +1021,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
{
struct floppy_state *fs = disk->private_data;
struct swim3 __iomem *sw = fs->swim3;
+
mutex_lock(&swim3_mutex);
if (fs->ref_count > 0 && --fs->ref_count == 0) {
swim3_action(fs, MOTOR_OFF);
@@ -1031,30 +1087,48 @@ static const struct block_device_operations floppy_fops = {
.revalidate_disk= floppy_revalidate,
};
+static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
+{
+ struct floppy_state *fs = macio_get_drvdata(mdev);
+ struct swim3 __iomem *sw = fs->swim3;
+
+ if (!fs)
+ return;
+ if (mb_state != MB_FD)
+ return;
+
+ /* Clear state */
+ out_8(&sw->intr_enable, 0);
+ in_8(&sw->intr);
+ in_8(&sw->error);
+}
+
static int swim3_add_device(struct macio_dev *mdev, int index)
{
struct device_node *swim = mdev->ofdev.dev.of_node;
struct floppy_state *fs = &floppy_states[index];
int rc = -EBUSY;
+ /* Do this first for message macros */
+ memset(fs, 0, sizeof(*fs));
+ fs->mdev = mdev;
+ fs->index = index;
+
/* Check & Request resources */
if (macio_resource_count(mdev) < 2) {
- printk(KERN_WARNING "ifd%d: no address for %s\n",
- index, swim->full_name);
+ swim3_err("%s", "No address in device-tree\n");
return -ENXIO;
}
- if (macio_irq_count(mdev) < 2) {
- printk(KERN_WARNING "fd%d: no intrs for device %s\n",
- index, swim->full_name);
+ if (macio_irq_count(mdev) < 1) {
+ swim3_err("%s", "No interrupt in device-tree\n");
+ return -ENXIO;
}
if (macio_request_resource(mdev, 0, "swim3 (mmio)")) {
- printk(KERN_ERR "fd%d: can't request mmio resource for %s\n",
- index, swim->full_name);
+ swim3_err("%s", "Can't request mmio resource\n");
return -EBUSY;
}
if (macio_request_resource(mdev, 1, "swim3 (dma)")) {
- printk(KERN_ERR "fd%d: can't request dma resource for %s\n",
- index, swim->full_name);
+ swim3_err("%s", "Can't request dma resource\n");
macio_release_resource(mdev, 0);
return -EBUSY;
}
@@ -1063,22 +1137,18 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
if (mdev->media_bay == NULL)
pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
- memset(fs, 0, sizeof(*fs));
- spin_lock_init(&fs->lock);
fs->state = idle;
fs->swim3 = (struct swim3 __iomem *)
ioremap(macio_resource_start(mdev, 0), 0x200);
if (fs->swim3 == NULL) {
- printk("fd%d: couldn't map registers for %s\n",
- index, swim->full_name);
+ swim3_err("%s", "Couldn't map mmio registers\n");
rc = -ENOMEM;
goto out_release;
}
fs->dma = (struct dbdma_regs __iomem *)
ioremap(macio_resource_start(mdev, 1), 0x200);
if (fs->dma == NULL) {
- printk("fd%d: couldn't map DMA for %s\n",
- index, swim->full_name);
+ swim3_err("%s", "Couldn't map dma registers\n");
iounmap(fs->swim3);
rc = -ENOMEM;
goto out_release;
@@ -1090,31 +1160,25 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
fs->secpercyl = 36;
fs->secpertrack = 18;
fs->total_secs = 2880;
- fs->mdev = mdev;
init_waitqueue_head(&fs->wait);
fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd));
st_le16(&fs->dma_cmd[1].command, DBDMA_STOP);
+ if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD)
+ swim3_mb_event(mdev, MB_FD);
+
if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) {
- printk(KERN_ERR "fd%d: couldn't request irq %d for %s\n",
- index, fs->swim3_intr, swim->full_name);
+ swim3_err("%s", "Couldn't request interrupt\n");
pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0);
goto out_unmap;
return -EBUSY;
}
-/*
- if (request_irq(fs->dma_intr, fd_dma_interrupt, 0, "SWIM3-dma", fs)) {
- printk(KERN_ERR "Couldn't get irq %d for SWIM3 DMA",
- fs->dma_intr);
- return -EBUSY;
- }
-*/
init_timer(&fs->timeout);
- printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count,
+ swim3_info("SWIM3 floppy controller %s\n",
mdev->media_bay ? "in media bay" : "");
return 0;
@@ -1132,41 +1196,42 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device_id *match)
{
- int i, rc;
struct gendisk *disk;
+ int index, rc;
+
+ index = floppy_count++;
+ if (index >= MAX_FLOPPIES)
+ return -ENXIO;
/* Add the drive */
- rc = swim3_add_device(mdev, floppy_count);
+ rc = swim3_add_device(mdev, index);
if (rc)
return rc;
+ /* Now register that disk. Same comment about failure handling */
+ disk = disks[index] = alloc_disk(1);
+ if (disk == NULL)
+ return -ENOMEM;
+ disk->queue = blk_init_queue(do_fd_request, &swim3_lock);
+ if (disk->queue == NULL) {
+ put_disk(disk);
+ return -ENOMEM;
+ }
+ disk->queue->queuedata = &floppy_states[index];
- /* Now create the queue if not there yet */
- if (swim3_queue == NULL) {
+ if (index == 0) {
/* If we failed, there isn't much we can do as the driver is still
* too dumb to remove the device, just bail out
*/
if (register_blkdev(FLOPPY_MAJOR, "fd"))
return 0;
- swim3_queue = blk_init_queue(do_fd_request, &swim3_lock);
- if (swim3_queue == NULL) {
- unregister_blkdev(FLOPPY_MAJOR, "fd");
- return 0;
- }
}
- /* Now register that disk. Same comment about failure handling */
- i = floppy_count++;
- disk = disks[i] = alloc_disk(1);
- if (disk == NULL)
- return 0;
-
disk->major = FLOPPY_MAJOR;
- disk->first_minor = i;
+ disk->first_minor = index;
disk->fops = &floppy_fops;
- disk->private_data = &floppy_states[i];
- disk->queue = swim3_queue;
+ disk->private_data = &floppy_states[index];
disk->flags |= GENHD_FL_REMOVABLE;
- sprintf(disk->disk_name, "fd%d", i);
+ sprintf(disk->disk_name, "fd%d", index);
set_capacity(disk, 2880);
add_disk(disk);
@@ -1194,6 +1259,9 @@ static struct macio_driver swim3_driver =
.of_match_table = swim3_match,
},
.probe = swim3_attach,
+#ifdef CONFIG_PMAC_MEDIABAY
+ .mediabay_event = swim3_mb_event,
+#endif
#if 0
.suspend = swim3_suspend,
.resume = swim3_resume,
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index b70f0fca9a42..3fb6ab4c8b4e 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
host->state == HST_DEV_SCAN);
spin_unlock_irq(&host->lock);
- DPRINTK("blk_insert_request, tag == %u\n", idx);
- blk_insert_request(host->oob_q, crq->rq, 1, crq);
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+ crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+ crq->rq->special = crq;
+ blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
return 0;
@@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
BUG_ON(rc < 0);
crq->msg_bucket = (u32) rc;
- DPRINTK("blk_insert_request, tag == %u\n", idx);
- blk_insert_request(host->oob_q, crq->rq, 1, crq);
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+ crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+ crq->rq->special = crq;
+ blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
return 0;
}
@@ -1116,7 +1120,7 @@ static inline void carm_handle_resp(struct carm_host *host,
break;
case MISC_GET_FW_VER: {
struct carm_fw_ver *ver = (struct carm_fw_ver *)
- mem + sizeof(struct carm_msg_get_fw_ver);
+ (mem + sizeof(struct carm_msg_get_fw_ver));
if (!error) {
host->fw_ver = le32_to_cpu(ver->version);
host->flags |= (ver->features & FL_FW_VER_MASK);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 0e376d46bdd1..fcec0225ac76 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -119,43 +119,6 @@
/*
*/
-
-/* command block wrapper */
-struct bulk_cb_wrap {
- __le32 Signature; /* contains 'USBC' */
- u32 Tag; /* unique per command id */
- __le32 DataTransferLength; /* size of data */
- u8 Flags; /* direction in bit 0 */
- u8 Lun; /* LUN */
- u8 Length; /* of of the CDB */
- u8 CDB[UB_MAX_CDB_SIZE]; /* max command */
-};
-
-#define US_BULK_CB_WRAP_LEN 31
-#define US_BULK_CB_SIGN 0x43425355 /*spells out USBC */
-#define US_BULK_FLAG_IN 1
-#define US_BULK_FLAG_OUT 0
-
-/* command status wrapper */
-struct bulk_cs_wrap {
- __le32 Signature; /* should = 'USBS' */
- u32 Tag; /* same as original command */
- __le32 Residue; /* amount not transferred */
- u8 Status; /* see below */
-};
-
-#define US_BULK_CS_WRAP_LEN 13
-#define US_BULK_CS_SIGN 0x53425355 /* spells out 'USBS' */
-#define US_BULK_STAT_OK 0
-#define US_BULK_STAT_FAIL 1
-#define US_BULK_STAT_PHASE 2
-
-/* bulk-only class specific requests */
-#define US_BULK_RESET_REQUEST 0xff
-#define US_BULK_GET_MAX_LUN 0xfe
-
-/*
- */
struct ub_dev;
#define UB_MAX_REQ_SG 9 /* cdrecord requires 32KB and maybe a header */
@@ -1744,12 +1707,11 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- struct gendisk *disk = bdev->bd_disk;
void __user *usermem = (void __user *) arg;
int ret;
mutex_lock(&ub_mutex);
- ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
+ ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem);
mutex_unlock(&ub_mutex);
return ret;
@@ -2478,6 +2440,8 @@ static int __init ub_init(void)
int rc;
int i;
+ pr_info("'Low Performance USB Block' driver is deprecated. "
+ "Please switch to usb-storage\n");
for (i = 0; i < UB_QLOCK_NUM; i++)
spin_lock_init(&ub_qlockv[i]);
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 031ca720d926..eb0d8216f557 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -513,7 +513,22 @@ static void process_page(unsigned long data)
}
}
-static int mm_make_request(struct request_queue *q, struct bio *bio)
+static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct cardinfo *card = cb->data;
+
+ spin_lock_irq(&card->lock);
+ activate(card);
+ spin_unlock_irq(&card->lock);
+ kfree(cb);
+}
+
+static int mm_check_plugged(struct cardinfo *card)
+{
+ return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb));
+}
+
+static void mm_make_request(struct request_queue *q, struct bio *bio)
{
struct cardinfo *card = q->queuedata;
pr_debug("mm_make_request %llu %u\n",
@@ -523,9 +538,11 @@ static int mm_make_request(struct request_queue *q, struct bio *bio)
*card->biotail = bio;
bio->bi_next = NULL;
card->biotail = &bio->bi_next;
+ if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card))
+ activate(card);
spin_unlock_irq(&card->lock);
- return 0;
+ return;
}
static irqreturn_t mm_interrupt(int irq, void *__card)
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
deleted file mode 100644
index 9a5b2a2d616d..000000000000
--- a/drivers/block/viodasd.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/* -*- linux-c -*-
- * viodasd.c
- * Authors: Dave Boutcher <boutcher@us.ibm.com>
- * Ryan Arnold <ryanarn@us.ibm.com>
- * Colin Devilbiss <devilbis@us.ibm.com>
- * Stephen Rothwell
- *
- * (C) Copyright 2000-2004 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * This routine provides access to disk space (termed "DASD" in historical
- * IBM terms) owned and managed by an OS/400 partition running on the
- * same box as this Linux partition.
- *
- * All disk operations are performed by sending messages back and forth to
- * the OS/400 partition.
- */
-
-#define pr_fmt(fmt) "viod: " fmt
-
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/device.h>
-#include <linux/scatterlist.h>
-
-#include <asm/uaccess.h>
-#include <asm/vio.h>
-#include <asm/iseries/hv_types.h>
-#include <asm/iseries/hv_lp_event.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/iseries/vio.h>
-#include <asm/firmware.h>
-
-MODULE_DESCRIPTION("iSeries Virtual DASD");
-MODULE_AUTHOR("Dave Boutcher");
-MODULE_LICENSE("GPL");
-
-/*
- * We only support 7 partitions per physical disk....so with minor
- * numbers 0-255 we get a maximum of 32 disks.
- */
-#define VIOD_GENHD_NAME "iseries/vd"
-
-#define VIOD_VERS "1.64"
-
-enum {
- PARTITION_SHIFT = 3,
- MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS,
- MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
-};
-
-static DEFINE_MUTEX(viodasd_mutex);
-static DEFINE_SPINLOCK(viodasd_spinlock);
-
-#define VIOMAXREQ 16
-
-#define DEVICE_NO(cell) ((struct viodasd_device *)(cell) - &viodasd_devices[0])
-
-struct viodasd_waitevent {
- struct completion com;
- int rc;
- u16 sub_result;
- int max_disk; /* open */
-};
-
-static const struct vio_error_entry viodasd_err_table[] = {
- { 0x0201, EINVAL, "Invalid Range" },
- { 0x0202, EINVAL, "Invalid Token" },
- { 0x0203, EIO, "DMA Error" },
- { 0x0204, EIO, "Use Error" },
- { 0x0205, EIO, "Release Error" },
- { 0x0206, EINVAL, "Invalid Disk" },
- { 0x0207, EBUSY, "Can't Lock" },
- { 0x0208, EIO, "Already Locked" },
- { 0x0209, EIO, "Already Unlocked" },
- { 0x020A, EIO, "Invalid Arg" },
- { 0x020B, EIO, "Bad IFS File" },
- { 0x020C, EROFS, "Read Only Device" },
- { 0x02FF, EIO, "Internal Error" },
- { 0x0000, 0, NULL },
-};
-
-/*
- * Figure out the biggest I/O request (in sectors) we can accept
- */
-#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA)
-
-/*
- * Number of disk I/O requests we've sent to OS/400
- */
-static int num_req_outstanding;
-
-/*
- * This is our internal structure for keeping track of disk devices
- */
-struct viodasd_device {
- u16 cylinders;
- u16 tracks;
- u16 sectors;
- u16 bytes_per_sector;
- u64 size;
- int read_only;
- spinlock_t q_lock;
- struct gendisk *disk;
- struct device *dev;
-} viodasd_devices[MAX_DISKNO];
-
-/*
- * External open entry point.
- */
-static int viodasd_open(struct block_device *bdev, fmode_t mode)
-{
- struct viodasd_device *d = bdev->bd_disk->private_data;
- HvLpEvent_Rc hvrc;
- struct viodasd_waitevent we;
- u16 flags = 0;
-
- if (d->read_only) {
- if (mode & FMODE_WRITE)
- return -EROFS;
- flags = vioblockflags_ro;
- }
-
- init_completion(&we.com);
-
- /* Send the open event to OS/400 */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockopen,
- HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- (u64)(unsigned long)&we, VIOVERSION << 16,
- ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32),
- 0, 0, 0);
- if (hvrc != 0) {
- pr_warning("HV open failed %d\n", (int)hvrc);
- return -EIO;
- }
-
- wait_for_completion(&we.com);
-
- /* Check the return code */
- if (we.rc != 0) {
- const struct vio_error_entry *err =
- vio_lookup_rc(viodasd_err_table, we.sub_result);
-
- pr_warning("bad rc opening disk: %d:0x%04x (%s)\n",
- (int)we.rc, we.sub_result, err->msg);
- return -EIO;
- }
-
- return 0;
-}
-
-static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
-{
- int ret;
-
- mutex_lock(&viodasd_mutex);
- ret = viodasd_open(bdev, mode);
- mutex_unlock(&viodasd_mutex);
-
- return ret;
-}
-
-
-/*
- * External release entry point.
- */
-static int viodasd_release(struct gendisk *disk, fmode_t mode)
-{
- struct viodasd_device *d = disk->private_data;
- HvLpEvent_Rc hvrc;
-
- mutex_lock(&viodasd_mutex);
- /* Send the event to OS/400. We DON'T expect a response */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockclose,
- HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- 0, VIOVERSION << 16,
- ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */,
- 0, 0, 0);
- if (hvrc != 0)
- pr_warning("HV close call failed %d\n", (int)hvrc);
-
- mutex_unlock(&viodasd_mutex);
-
- return 0;
-}
-
-
-/* External ioctl entry point.
- */
-static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct gendisk *disk = bdev->bd_disk;
- struct viodasd_device *d = disk->private_data;
-
- geo->sectors = d->sectors ? d->sectors : 32;
- geo->heads = d->tracks ? d->tracks : 64;
- geo->cylinders = d->cylinders ? d->cylinders :
- get_capacity(disk) / (geo->sectors * geo->heads);
-
- return 0;
-}
-
-/*
- * Our file operations table
- */
-static const struct block_device_operations viodasd_fops = {
- .owner = THIS_MODULE,
- .open = viodasd_unlocked_open,
- .release = viodasd_release,
- .getgeo = viodasd_getgeo,
-};
-
-/*
- * End a request
- */
-static void viodasd_end_request(struct request *req, int error,
- int num_sectors)
-{
- __blk_end_request(req, error, num_sectors << 9);
-}
-
-/*
- * Send an actual I/O request to OS/400
- */
-static int send_request(struct request *req)
-{
- u64 start;
- int direction;
- int nsg;
- u16 viocmd;
- HvLpEvent_Rc hvrc;
- struct vioblocklpevent *bevent;
- struct HvLpEvent *hev;
- struct scatterlist sg[VIOMAXBLOCKDMA];
- int sgindex;
- struct viodasd_device *d;
- unsigned long flags;
-
- start = (u64)blk_rq_pos(req) << 9;
-
- if (rq_data_dir(req) == READ) {
- direction = DMA_FROM_DEVICE;
- viocmd = viomajorsubtype_blockio | vioblockread;
- } else {
- direction = DMA_TO_DEVICE;
- viocmd = viomajorsubtype_blockio | vioblockwrite;
- }
-
- d = req->rq_disk->private_data;
-
- /* Now build the scatter-gather list */
- sg_init_table(sg, VIOMAXBLOCKDMA);
- nsg = blk_rq_map_sg(req->q, req, sg);
- nsg = dma_map_sg(d->dev, sg, nsg, direction);
-
- spin_lock_irqsave(&viodasd_spinlock, flags);
- num_req_outstanding++;
-
- /* This optimization handles a single DMA block */
- if (nsg == 1)
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo, viocmd,
- HvLpEvent_AckInd_DoAck,
- HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- (u64)(unsigned long)req, VIOVERSION << 16,
- ((u64)DEVICE_NO(d) << 48), start,
- ((u64)sg_dma_address(&sg[0])) << 32,
- sg_dma_len(&sg[0]));
- else {
- bevent = (struct vioblocklpevent *)
- vio_get_event_buffer(viomajorsubtype_blockio);
- if (bevent == NULL) {
- pr_warning("error allocating disk event buffer\n");
- goto error_ret;
- }
-
- /*
- * Now build up the actual request. Note that we store
- * the pointer to the request in the correlation
- * token so we can match the response up later
- */
- memset(bevent, 0, sizeof(struct vioblocklpevent));
- hev = &bevent->event;
- hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK |
- HV_LP_EVENT_INT;
- hev->xType = HvLpEvent_Type_VirtualIo;
- hev->xSubtype = viocmd;
- hev->xSourceLp = HvLpConfig_getLpIndex();
- hev->xTargetLp = viopath_hostLp;
- hev->xSizeMinus1 =
- offsetof(struct vioblocklpevent, u.rw_data.dma_info) +
- (sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1;
- hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp);
- hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp);
- hev->xCorrelationToken = (u64)req;
- bevent->version = VIOVERSION;
- bevent->disk = DEVICE_NO(d);
- bevent->u.rw_data.offset = start;
-
- /*
- * Copy just the dma information from the sg list
- * into the request
- */
- for (sgindex = 0; sgindex < nsg; sgindex++) {
- bevent->u.rw_data.dma_info[sgindex].token =
- sg_dma_address(&sg[sgindex]);
- bevent->u.rw_data.dma_info[sgindex].len =
- sg_dma_len(&sg[sgindex]);
- }
-
- /* Send the request */
- hvrc = HvCallEvent_signalLpEvent(&bevent->event);
- vio_free_event_buffer(viomajorsubtype_blockio, bevent);
- }
-
- if (hvrc != HvLpEvent_Rc_Good) {
- pr_warning("error sending disk event to OS/400 (rc %d)\n",
- (int)hvrc);
- goto error_ret;
- }
- spin_unlock_irqrestore(&viodasd_spinlock, flags);
- return 0;
-
-error_ret:
- num_req_outstanding--;
- spin_unlock_irqrestore(&viodasd_spinlock, flags);
- dma_unmap_sg(d->dev, sg, nsg, direction);
- return -1;
-}
-
-/*
- * This is the external request processing routine
- */
-static void do_viodasd_request(struct request_queue *q)
-{
- struct request *req;
-
- /*
- * If we already have the maximum number of requests
- * outstanding to OS/400 just bail out. We'll come
- * back later.
- */
- while (num_req_outstanding < VIOMAXREQ) {
- req = blk_fetch_request(q);
- if (req == NULL)
- return;
- /* check that request contains a valid command */
- if (req->cmd_type != REQ_TYPE_FS) {
- viodasd_end_request(req, -EIO, blk_rq_sectors(req));
- continue;
- }
- /* Try sending the request */
- if (send_request(req) != 0)
- viodasd_end_request(req, -EIO, blk_rq_sectors(req));
- }
-}
-
-/*
- * Probe a single disk and fill in the viodasd_device structure
- * for it.
- */
-static int probe_disk(struct viodasd_device *d)
-{
- HvLpEvent_Rc hvrc;
- struct viodasd_waitevent we;
- int dev_no = DEVICE_NO(d);
- struct gendisk *g;
- struct request_queue *q;
- u16 flags = 0;
-
-retry:
- init_completion(&we.com);
-
- /* Send the open event to OS/400 */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockopen,
- HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- (u64)(unsigned long)&we, VIOVERSION << 16,
- ((u64)dev_no << 48) | ((u64)flags<< 32),
- 0, 0, 0);
- if (hvrc != 0) {
- pr_warning("bad rc on HV open %d\n", (int)hvrc);
- return 0;
- }
-
- wait_for_completion(&we.com);
-
- if (we.rc != 0) {
- if (flags != 0)
- return 0;
- /* try again with read only flag set */
- flags = vioblockflags_ro;
- goto retry;
- }
- if (we.max_disk > (MAX_DISKNO - 1)) {
- printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"),
- MAX_DISKNO, we.max_disk + 1);
- }
-
- /* Send the close event to OS/400. We DON'T expect a response */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockclose,
- HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- 0, VIOVERSION << 16,
- ((u64)dev_no << 48) | ((u64)flags << 32),
- 0, 0, 0);
- if (hvrc != 0) {
- pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc);
- return 0;
- }
-
- if (d->dev == NULL) {
- /* this is when we reprobe for new disks */
- if (vio_create_viodasd(dev_no) == NULL) {
- pr_warning("cannot allocate virtual device for disk %d\n",
- dev_no);
- return 0;
- }
- /*
- * The vio_create_viodasd will have recursed into this
- * routine with d->dev set to the new vio device and
- * will finish the setup of the disk below.
- */
- return 1;
- }
-
- /* create the request queue for the disk */
- spin_lock_init(&d->q_lock);
- q = blk_init_queue(do_viodasd_request, &d->q_lock);
- if (q == NULL) {
- pr_warning("cannot allocate queue for disk %d\n", dev_no);
- return 0;
- }
- g = alloc_disk(1 << PARTITION_SHIFT);
- if (g == NULL) {
- pr_warning("cannot allocate disk structure for disk %d\n",
- dev_no);
- blk_cleanup_queue(q);
- return 0;
- }
-
- d->disk = g;
- blk_queue_max_segments(q, VIOMAXBLOCKDMA);
- blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS);
- g->major = VIODASD_MAJOR;
- g->first_minor = dev_no << PARTITION_SHIFT;
- if (dev_no >= 26)
- snprintf(g->disk_name, sizeof(g->disk_name),
- VIOD_GENHD_NAME "%c%c",
- 'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26));
- else
- snprintf(g->disk_name, sizeof(g->disk_name),
- VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26));
- g->fops = &viodasd_fops;
- g->queue = q;
- g->private_data = d;
- g->driverfs_dev = d->dev;
- set_capacity(g, d->size >> 9);
-
- pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n",
- dev_no, (unsigned long)(d->size >> 9),
- (unsigned long)(d->size >> 20),
- (int)d->cylinders, (int)d->tracks,
- (int)d->sectors, (int)d->bytes_per_sector,
- d->read_only ? " (RO)" : "");
-
- /* register us in the global list */
- add_disk(g);
- return 1;
-}
-
-/* returns the total number of scatterlist elements converted */
-static int block_event_to_scatterlist(const struct vioblocklpevent *bevent,
- struct scatterlist *sg, int *total_len)
-{
- int i, numsg;
- const struct rw_data *rw_data = &bevent->u.rw_data;
- static const int offset =
- offsetof(struct vioblocklpevent, u.rw_data.dma_info);
- static const int element_size = sizeof(rw_data->dma_info[0]);
-
- numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size;
- if (numsg > VIOMAXBLOCKDMA)
- numsg = VIOMAXBLOCKDMA;
-
- *total_len = 0;
- sg_init_table(sg, VIOMAXBLOCKDMA);
- for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) {
- sg_dma_address(&sg[i]) = rw_data->dma_info[i].token;
- sg_dma_len(&sg[i]) = rw_data->dma_info[i].len;
- *total_len += rw_data->dma_info[i].len;
- }
- return i;
-}
-
-/*
- * Restart all queues, starting with the one _after_ the disk given,
- * thus reducing the chance of starvation of higher numbered disks.
- */
-static void viodasd_restart_all_queues_starting_from(int first_index)
-{
- int i;
-
- for (i = first_index + 1; i < MAX_DISKNO; ++i)
- if (viodasd_devices[i].disk)
- blk_run_queue(viodasd_devices[i].disk->queue);
- for (i = 0; i <= first_index; ++i)
- if (viodasd_devices[i].disk)
- blk_run_queue(viodasd_devices[i].disk->queue);
-}
-
-/*
- * For read and write requests, decrement the number of outstanding requests,
- * Free the DMA buffers we allocated.
- */
-static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
-{
- int num_sg, num_sect, pci_direction, total_len;
- struct request *req;
- struct scatterlist sg[VIOMAXBLOCKDMA];
- struct HvLpEvent *event = &bevent->event;
- unsigned long irq_flags;
- struct viodasd_device *d;
- int error;
- spinlock_t *qlock;
-
- num_sg = block_event_to_scatterlist(bevent, sg, &total_len);
- num_sect = total_len >> 9;
- if (event->xSubtype == (viomajorsubtype_blockio | vioblockread))
- pci_direction = DMA_FROM_DEVICE;
- else
- pci_direction = DMA_TO_DEVICE;
- req = (struct request *)bevent->event.xCorrelationToken;
- d = req->rq_disk->private_data;
-
- dma_unmap_sg(d->dev, sg, num_sg, pci_direction);
-
- /*
- * Since this is running in interrupt mode, we need to make sure
- * we're not stepping on any global I/O operations
- */
- spin_lock_irqsave(&viodasd_spinlock, irq_flags);
- num_req_outstanding--;
- spin_unlock_irqrestore(&viodasd_spinlock, irq_flags);
-
- error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO;
- if (error) {
- const struct vio_error_entry *err;
- err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
- pr_warning("read/write error %d:0x%04x (%s)\n",
- event->xRc, bevent->sub_result, err->msg);
- num_sect = blk_rq_sectors(req);
- }
- qlock = req->q->queue_lock;
- spin_lock_irqsave(qlock, irq_flags);
- viodasd_end_request(req, error, num_sect);
- spin_unlock_irqrestore(qlock, irq_flags);
-
- /* Finally, try to get more requests off of this device's queue */
- viodasd_restart_all_queues_starting_from(DEVICE_NO(d));
-
- return 0;
-}
-
-/* This routine handles incoming block LP events */
-static void handle_block_event(struct HvLpEvent *event)
-{
- struct vioblocklpevent *bevent = (struct vioblocklpevent *)event;
- struct viodasd_waitevent *pwe;
-
- if (event == NULL)
- /* Notification that a partition went away! */
- return;
- /* First, we should NEVER get an int here...only acks */
- if (hvlpevent_is_int(event)) {
- pr_warning("Yikes! got an int in viodasd event handler!\n");
- if (hvlpevent_need_ack(event)) {
- event->xRc = HvLpEvent_Rc_InvalidSubtype;
- HvCallEvent_ackLpEvent(event);
- }
- }
-
- switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) {
- case vioblockopen:
- /*
- * Handle a response to an open request. We get all the
- * disk information in the response, so update it. The
- * correlation token contains a pointer to a waitevent
- * structure that has a completion in it. update the
- * return code in the waitevent structure and post the
- * completion to wake up the guy who sent the request
- */
- pwe = (struct viodasd_waitevent *)event->xCorrelationToken;
- pwe->rc = event->xRc;
- pwe->sub_result = bevent->sub_result;
- if (event->xRc == HvLpEvent_Rc_Good) {
- const struct open_data *data = &bevent->u.open_data;
- struct viodasd_device *device =
- &viodasd_devices[bevent->disk];
- device->read_only =
- bevent->flags & vioblockflags_ro;
- device->size = data->disk_size;
- device->cylinders = data->cylinders;
- device->tracks = data->tracks;
- device->sectors = data->sectors;
- device->bytes_per_sector = data->bytes_per_sector;
- pwe->max_disk = data->max_disk;
- }
- complete(&pwe->com);
- break;
- case vioblockclose:
- break;
- case vioblockread:
- case vioblockwrite:
- viodasd_handle_read_write(bevent);
- break;
-
- default:
- pr_warning("invalid subtype!");
- if (hvlpevent_need_ack(event)) {
- event->xRc = HvLpEvent_Rc_InvalidSubtype;
- HvCallEvent_ackLpEvent(event);
- }
- }
-}
-
-/*
- * Get the driver to reprobe for more disks.
- */
-static ssize_t probe_disks(struct device_driver *drv, const char *buf,
- size_t count)
-{
- struct viodasd_device *d;
-
- for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) {
- if (d->disk == NULL)
- probe_disk(d);
- }
- return count;
-}
-static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks);
-
-static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
-{
- struct viodasd_device *d = &viodasd_devices[vdev->unit_address];
-
- d->dev = &vdev->dev;
- if (!probe_disk(d))
- return -ENODEV;
- return 0;
-}
-
-static int viodasd_remove(struct vio_dev *vdev)
-{
- struct viodasd_device *d;
-
- d = &viodasd_devices[vdev->unit_address];
- if (d->disk) {
- del_gendisk(d->disk);
- blk_cleanup_queue(d->disk->queue);
- put_disk(d->disk);
- d->disk = NULL;
- }
- d->dev = NULL;
- return 0;
-}
-
-/**
- * viodasd_device_table: Used by vio.c to match devices that we
- * support.
- */
-static struct vio_device_id viodasd_device_table[] __devinitdata = {
- { "block", "IBM,iSeries-viodasd" },
- { "", "" }
-};
-MODULE_DEVICE_TABLE(vio, viodasd_device_table);
-
-static struct vio_driver viodasd_driver = {
- .id_table = viodasd_device_table,
- .probe = viodasd_probe,
- .remove = viodasd_remove,
- .driver = {
- .name = "viodasd",
- .owner = THIS_MODULE,
- }
-};
-
-static int need_delete_probe;
-
-/*
- * Initialize the whole device driver. Handle module and non-module
- * versions
- */
-static int __init viodasd_init(void)
-{
- int rc;
-
- if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
- rc = -ENODEV;
- goto early_fail;
- }
-
- /* Try to open to our host lp */
- if (viopath_hostLp == HvLpIndexInvalid)
- vio_set_hostlp();
-
- if (viopath_hostLp == HvLpIndexInvalid) {
- pr_warning("invalid hosting partition\n");
- rc = -EIO;
- goto early_fail;
- }
-
- pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp);
-
- /* register the block device */
- rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
- if (rc) {
- pr_warning("Unable to get major number %d for %s\n",
- VIODASD_MAJOR, VIOD_GENHD_NAME);
- goto early_fail;
- }
- /* Actually open the path to the hosting partition */
- rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio,
- VIOMAXREQ + 2);
- if (rc) {
- pr_warning("error opening path to host partition %d\n",
- viopath_hostLp);
- goto unregister_blk;
- }
-
- /* Initialize our request handler */
- vio_setHandler(viomajorsubtype_blockio, handle_block_event);
-
- rc = vio_register_driver(&viodasd_driver);
- if (rc) {
- pr_warning("vio_register_driver failed\n");
- goto unset_handler;
- }
-
- /*
- * If this call fails, it just means that we cannot dynamically
- * add virtual disks, but the driver will still work fine for
- * all existing disk, so ignore the failure.
- */
- if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe))
- need_delete_probe = 1;
-
- return 0;
-
-unset_handler:
- vio_clearHandler(viomajorsubtype_blockio);
- viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-unregister_blk:
- unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-early_fail:
- return rc;
-}
-module_init(viodasd_init);
-
-void __exit viodasd_exit(void)
-{
- if (need_delete_probe)
- driver_remove_file(&viodasd_driver.driver, &driver_attr_probe);
- vio_unregister_driver(&viodasd_driver);
- vio_clearHandler(viomajorsubtype_blockio);
- viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
- unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-}
-module_exit(viodasd_exit);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 079c08808d8a..c0bbeb470754 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -3,45 +3,53 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <scsi/scsi_cmnd.h>
+#include <linux/idr.h>
#define PART_BITS 4
-static int major, index;
+static int major;
+static DEFINE_IDA(vd_index_ida);
+
struct workqueue_struct *virtblk_wq;
struct virtio_blk
{
- spinlock_t lock;
-
struct virtio_device *vdev;
struct virtqueue *vq;
/* The disk structure for the kernel. */
struct gendisk *disk;
- /* Request tracking. */
- struct list_head reqs;
-
mempool_t *pool;
/* Process context for config space updates */
struct work_struct config_work;
+ /* Lock for config space updates */
+ struct mutex config_lock;
+
+ /* enable config space updates */
+ bool config_enable;
+
/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
+ /* Ida index - used to track minor number allocations. */
+ int index;
+
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[/*sg_elems*/];
};
struct virtblk_req
{
- struct list_head list;
struct request *req;
struct virtio_blk_outhdr out_hdr;
struct virtio_scsi_inhdr in_hdr;
@@ -55,7 +63,7 @@ static void blk_done(struct virtqueue *vq)
unsigned int len;
unsigned long flags;
- spin_lock_irqsave(&vblk->lock, flags);
+ spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
int error;
@@ -85,12 +93,11 @@ static void blk_done(struct virtqueue *vq)
}
__blk_end_request_all(vbr->req, error);
- list_del(&vbr->list);
mempool_free(vbr, vblk->pool);
}
/* In case queue is stopped waiting for more buffers. */
blk_start_queue(vblk->disk->queue);
- spin_unlock_irqrestore(&vblk->lock, flags);
+ spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
}
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -165,12 +172,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
+ if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
mempool_free(vbr, vblk->pool);
return false;
}
- list_add_tail(&vbr->list, &vblk->reqs);
return true;
}
@@ -236,8 +242,8 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
return -ENOTTY;
- return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
- (void __user *)data);
+ return scsi_cmd_blk_ioctl(bdev, mode, cmd,
+ (void __user *)data);
}
/* We provide getgeo only to please some old bootloader/partitioning tools */
@@ -276,6 +282,11 @@ static int index_to_minor(int index)
return index << PART_BITS;
}
+static int minor_to_index(int minor)
+{
+ return minor >> PART_BITS;
+}
+
static ssize_t virtblk_serial_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -306,6 +317,10 @@ static void virtblk_config_changed_work(struct work_struct *work)
char cap_str_2[10], cap_str_10[10];
u64 capacity, size;
+ mutex_lock(&vblk->config_lock);
+ if (!vblk->config_enable)
+ goto done;
+
/* Host must always specify the capacity. */
vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
&capacity, sizeof(capacity));
@@ -328,6 +343,9 @@ static void virtblk_config_changed_work(struct work_struct *work)
cap_str_10, cap_str_2);
set_capacity(vblk->disk, capacity);
+ revalidate_disk(vblk->disk);
+done:
+ mutex_unlock(&vblk->config_lock);
}
static void virtblk_config_changed(struct virtio_device *vdev)
@@ -337,18 +355,138 @@ static void virtblk_config_changed(struct virtio_device *vdev)
queue_work(virtblk_wq, &vblk->config_work);
}
+static int init_vq(struct virtio_blk *vblk)
+{
+ int err = 0;
+
+ /* We expect one virtqueue, for output. */
+ vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
+ if (IS_ERR(vblk->vq))
+ err = PTR_ERR(vblk->vq);
+
+ return err;
+}
+
+/*
+ * Legacy naming scheme used for virtio devices. We are stuck with it for
+ * virtio blk but don't ever use it for any new driver.
+ */
+static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
+{
+ const int base = 'z' - 'a' + 1;
+ char *begin = buf + strlen(prefix);
+ char *end = buf + buflen;
+ char *p;
+ int unit;
+
+ p = end - 1;
+ *p = '\0';
+ unit = base;
+ do {
+ if (p == begin)
+ return -EINVAL;
+ *--p = 'a' + (index % unit);
+ index = (index / unit) - 1;
+ } while (index >= 0);
+
+ memmove(begin, p, end - p);
+ memcpy(buf, prefix, strlen(prefix));
+
+ return 0;
+}
+
+static int virtblk_get_cache_mode(struct virtio_device *vdev)
+{
+ u8 writeback;
+ int err;
+
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE,
+ offsetof(struct virtio_blk_config, wce),
+ &writeback);
+ if (err)
+ writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
+
+ return writeback;
+}
+
+static void virtblk_update_cache_mode(struct virtio_device *vdev)
+{
+ u8 writeback = virtblk_get_cache_mode(vdev);
+ struct virtio_blk *vblk = vdev->priv;
+
+ if (writeback)
+ blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
+ else
+ blk_queue_flush(vblk->disk->queue, 0);
+
+ revalidate_disk(vblk->disk);
+}
+
+static const char *const virtblk_cache_types[] = {
+ "write through", "write back"
+};
+
+static ssize_t
+virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct virtio_blk *vblk = disk->private_data;
+ struct virtio_device *vdev = vblk->vdev;
+ int i;
+ u8 writeback;
+
+ BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
+ for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; )
+ if (sysfs_streq(buf, virtblk_cache_types[i]))
+ break;
+
+ if (i < 0)
+ return -EINVAL;
+
+ writeback = i;
+ vdev->config->set(vdev,
+ offsetof(struct virtio_blk_config, wce),
+ &writeback, sizeof(writeback));
+
+ virtblk_update_cache_mode(vdev);
+ return count;
+}
+
+static ssize_t
+virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct virtio_blk *vblk = disk->private_data;
+ u8 writeback = virtblk_get_cache_mode(vblk->vdev);
+
+ BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
+ return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
+}
+
+static const struct device_attribute dev_attr_cache_type_ro =
+ __ATTR(cache_type, S_IRUGO,
+ virtblk_cache_type_show, NULL);
+static const struct device_attribute dev_attr_cache_type_rw =
+ __ATTR(cache_type, S_IRUGO|S_IWUSR,
+ virtblk_cache_type_show, virtblk_cache_type_store);
+
static int __devinit virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
struct request_queue *q;
- int err;
+ int err, index;
u64 cap;
u32 v, blk_size, sg_elems, opt_io_size;
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
- if (index_to_minor(index) >= 1 << MINORBITS)
- return -ENOSPC;
+ err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
+ GFP_KERNEL);
+ if (err < 0)
+ goto out;
+ index = err;
/* We need to know how many segments before we allocate. */
err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
@@ -365,22 +503,19 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
if (!vblk) {
err = -ENOMEM;
- goto out;
+ goto out_free_index;
}
- INIT_LIST_HEAD(&vblk->reqs);
- spin_lock_init(&vblk->lock);
vblk->vdev = vdev;
vblk->sg_elems = sg_elems;
sg_init_table(vblk->sg, vblk->sg_elems);
+ mutex_init(&vblk->config_lock);
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+ vblk->config_enable = true;
- /* We expect one virtqueue, for output. */
- vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
- if (IS_ERR(vblk->vq)) {
- err = PTR_ERR(vblk->vq);
+ err = init_vq(vblk);
+ if (err)
goto out_free_vblk;
- }
vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
if (!vblk->pool) {
@@ -395,7 +530,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
goto out_mempool;
}
- q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
+ q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
if (!q) {
err = -ENOMEM;
goto out_put_disk;
@@ -403,29 +538,17 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
q->queuedata = vblk;
- if (index < 26) {
- sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
- } else if (index < (26 + 1) * 26) {
- sprintf(vblk->disk->disk_name, "vd%c%c",
- 'a' + index / 26 - 1, 'a' + index % 26);
- } else {
- const unsigned int m1 = (index / 26 - 1) / 26 - 1;
- const unsigned int m2 = (index / 26 - 1) % 26;
- const unsigned int m3 = index % 26;
- sprintf(vblk->disk->disk_name, "vd%c%c%c",
- 'a' + m1, 'a' + m2, 'a' + m3);
- }
+ virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
vblk->disk->major = major;
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
vblk->disk->driverfs_dev = &vdev->dev;
- index++;
+ vblk->index = index;
/* configure queue flush support */
- if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
- blk_queue_flush(q, REQ_FLUSH);
+ virtblk_update_cache_mode(vdev);
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -503,6 +626,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
if (err)
goto out_del_disk;
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
+ err = device_create_file(disk_to_dev(vblk->disk),
+ &dev_attr_cache_type_rw);
+ else
+ err = device_create_file(disk_to_dev(vblk->disk),
+ &dev_attr_cache_type_ro);
+ if (err)
+ goto out_del_disk;
return 0;
out_del_disk:
@@ -516,6 +647,8 @@ out_free_vq:
vdev->config->del_vqs(vdev);
out_free_vblk:
kfree(vblk);
+out_free_index:
+ ida_simple_remove(&vd_index_ida, index);
out:
return err;
}
@@ -523,22 +656,67 @@ out:
static void __devexit virtblk_remove(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
+ int index = vblk->index;
- flush_work(&vblk->config_work);
+ /* Prevent config work handler from accessing the device. */
+ mutex_lock(&vblk->config_lock);
+ vblk->config_enable = false;
+ mutex_unlock(&vblk->config_lock);
- /* Nothing should be pending. */
- BUG_ON(!list_empty(&vblk->reqs));
+ del_gendisk(vblk->disk);
+ blk_cleanup_queue(vblk->disk->queue);
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
- del_gendisk(vblk->disk);
- blk_cleanup_queue(vblk->disk->queue);
+ flush_work(&vblk->config_work);
+
put_disk(vblk->disk);
mempool_destroy(vblk->pool);
vdev->config->del_vqs(vdev);
kfree(vblk);
+ ida_simple_remove(&vd_index_ida, index);
+}
+
+#ifdef CONFIG_PM
+static int virtblk_freeze(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+
+ /* Ensure we don't receive any more interrupts */
+ vdev->config->reset(vdev);
+
+ /* Prevent config work handler from accessing the device. */
+ mutex_lock(&vblk->config_lock);
+ vblk->config_enable = false;
+ mutex_unlock(&vblk->config_lock);
+
+ flush_work(&vblk->config_work);
+
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+ blk_stop_queue(vblk->disk->queue);
+ spin_unlock_irq(vblk->disk->queue->queue_lock);
+ blk_sync_queue(vblk->disk->queue);
+
+ vdev->config->del_vqs(vdev);
+ return 0;
+}
+
+static int virtblk_restore(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+ int ret;
+
+ vblk->config_enable = true;
+ ret = init_vq(vdev->priv);
+ if (!ret) {
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+ blk_start_queue(vblk->disk->queue);
+ spin_unlock_irq(vblk->disk->queue->queue_lock);
+ }
+ return ret;
}
+#endif
static const struct virtio_device_id id_table[] = {
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
@@ -548,7 +726,7 @@ static const struct virtio_device_id id_table[] = {
static unsigned int features[] = {
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
- VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+ VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
};
/*
@@ -565,6 +743,10 @@ static struct virtio_driver __refdata virtio_blk = {
.probe = virtblk_probe,
.remove = __devexit_p(virtblk_remove),
.config_changed = virtblk_config_changed,
+#ifdef CONFIG_PM
+ .freeze = virtblk_freeze,
+ .restore = virtblk_restore,
+#endif
};
static int __init init(void)
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 4abd2bcd20fb..ff540520bada 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -52,7 +52,6 @@
#include <linux/io.h>
#include <linux/gfp.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/dma.h>
@@ -148,7 +147,7 @@ static volatile int xdc_busy;
static struct timer_list xd_watchdog_int;
static volatile u_char xd_error;
-static int nodma = XD_DONT_USE_DMA;
+static bool nodma = XD_DONT_USE_DMA;
static struct request_queue *xd_queue;
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 2330a9ad5e95..c6decb901e5e 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -258,13 +258,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
static void print_stats(struct xen_blkif *blkif)
{
- pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n",
+ pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d"
+ " | ds %4d\n",
current->comm, blkif->st_oo_req,
- blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req);
+ blkif->st_rd_req, blkif->st_wr_req,
+ blkif->st_f_req, blkif->st_ds_req);
blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
blkif->st_rd_req = 0;
blkif->st_wr_req = 0;
blkif->st_oo_req = 0;
+ blkif->st_ds_req = 0;
}
int xen_blkif_schedule(void *arg)
@@ -318,6 +321,7 @@ struct seg_buf {
static void xen_blkbk_unmap(struct pending_req *req)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
grant_handle_t handle;
int ret;
@@ -329,25 +333,12 @@ static void xen_blkbk_unmap(struct pending_req *req)
gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
GNTMAP_host_map, handle);
pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+ pages[invcount] = virt_to_page(vaddr(req, i));
invcount++;
}
- ret = HYPERVISOR_grant_table_op(
- GNTTABOP_unmap_grant_ref, unmap, invcount);
+ ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
BUG_ON(ret);
- /*
- * Note, we use invcount, so nr->pages, so we can't index
- * using vaddr(req, i).
- */
- for (i = 0; i < invcount; i++) {
- ret = m2p_remove_override(
- virt_to_page(unmap[i].host_addr), false);
- if (ret) {
- pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
- (unsigned long)unmap[i].host_addr);
- continue;
- }
- }
}
static int xen_blkbk_map(struct blkif_request *req,
@@ -356,7 +347,7 @@ static int xen_blkbk_map(struct blkif_request *req,
{
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
int i;
- int nseg = req->nr_segments;
+ int nseg = req->u.rw.nr_segments;
int ret = 0;
/*
@@ -375,7 +366,7 @@ static int xen_blkbk_map(struct blkif_request *req,
pending_req->blkif->domid);
}
- ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+ ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg);
BUG_ON(ret);
/*
@@ -395,21 +386,59 @@ static int xen_blkbk_map(struct blkif_request *req,
if (ret)
continue;
- ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
- blkbk->pending_page(pending_req, i), false);
- if (ret) {
- pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
- (unsigned long)map[i].dev_bus_addr, ret);
- /* We could switch over to GNTTABOP_copy */
- continue;
- }
-
seg[i].buf = map[i].dev_bus_addr |
(req->u.rw.seg[i].first_sect << 9);
}
return ret;
}
+static int dispatch_discard_io(struct xen_blkif *blkif,
+ struct blkif_request *req)
+{
+ int err = 0;
+ int status = BLKIF_RSP_OKAY;
+ struct block_device *bdev = blkif->vbd.bdev;
+ unsigned long secure;
+
+ blkif->st_ds_req++;
+
+ xen_blkif_get(blkif);
+ secure = (blkif->vbd.discard_secure &&
+ (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+ BLKDEV_DISCARD_SECURE : 0;
+
+ err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+ req->u.discard.nr_sectors,
+ GFP_KERNEL, secure);
+
+ if (err == -EOPNOTSUPP) {
+ pr_debug(DRV_PFX "discard op failed, not supported\n");
+ status = BLKIF_RSP_EOPNOTSUPP;
+ } else if (err)
+ status = BLKIF_RSP_ERROR;
+
+ make_response(blkif, req->u.discard.id, req->operation, status);
+ xen_blkif_put(blkif);
+ return err;
+}
+
+static void xen_blk_drain_io(struct xen_blkif *blkif)
+{
+ atomic_set(&blkif->drain, 1);
+ do {
+ /* The initial value is one, and one refcnt taken at the
+ * start of the xen_blkif_schedule thread. */
+ if (atomic_read(&blkif->refcnt) <= 2)
+ break;
+ wait_for_completion_interruptible_timeout(
+ &blkif->drain_complete, HZ);
+
+ if (!atomic_read(&blkif->drain))
+ break;
+ } while (!kthread_should_stop());
+ atomic_set(&blkif->drain, 0);
+}
+
/*
* Completion callback on the bio's. Called as bh->b_end_io()
*/
@@ -422,6 +451,11 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+ } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+ (error == -EOPNOTSUPP)) {
+ pr_debug(DRV_PFX "write barrier op failed, not supported\n");
+ xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
} else if (error) {
pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
" error=%d\n", error);
@@ -438,6 +472,10 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
make_response(pending_req->blkif, pending_req->id,
pending_req->operation, pending_req->status);
xen_blkif_put(pending_req->blkif);
+ if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
+ if (atomic_read(&pending_req->blkif->drain))
+ complete(&pending_req->blkif->drain_complete);
+ }
free_req(pending_req);
}
}
@@ -505,8 +543,11 @@ __do_block_io_op(struct xen_blkif *blkif)
/* Apply all sanity checks to /private copy/ of request. */
barrier();
-
- if (dispatch_rw_block_io(blkif, &req, pending_req))
+ if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
+ free_req(pending_req);
+ if (dispatch_discard_io(blkif, &req))
+ break;
+ } else if (dispatch_rw_block_io(blkif, &req, pending_req))
break;
/* Yield point for this unbounded loop. */
@@ -532,7 +573,6 @@ do_block_io_op(struct xen_blkif *blkif)
return more_to_do;
}
-
/*
* Transmutation of the 'struct blkif_request' to a proper 'struct bio'
* and call the 'submit_bio' to pass it to the underlying storage.
@@ -549,6 +589,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
int i, nbio = 0;
int operation;
struct blk_plug plug;
+ bool drain = false;
switch (req->operation) {
case BLKIF_OP_READ:
@@ -559,11 +600,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
blkif->st_wr_req++;
operation = WRITE_ODIRECT;
break;
+ case BLKIF_OP_WRITE_BARRIER:
+ drain = true;
case BLKIF_OP_FLUSH_DISKCACHE:
blkif->st_f_req++;
operation = WRITE_FLUSH;
break;
- case BLKIF_OP_WRITE_BARRIER:
default:
operation = 0; /* make gcc happy */
goto fail_response;
@@ -571,7 +613,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
}
/* Check that the number of segments is sane. */
- nseg = req->nr_segments;
+ nseg = req->u.rw.nr_segments;
+
if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
@@ -580,12 +623,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
goto fail_response;
}
- preq.dev = req->handle;
+ preq.dev = req->u.rw.handle;
preq.sector_number = req->u.rw.sector_number;
preq.nr_sects = 0;
pending_req->blkif = blkif;
- pending_req->id = req->id;
+ pending_req->id = req->u.rw.id;
pending_req->operation = req->operation;
pending_req->status = BLKIF_RSP_OKAY;
pending_req->nr_pages = nseg;
@@ -621,6 +664,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
}
}
+ /* Wait on all outstanding I/O's and once that has been completed
+ * issue the WRITE_FLUSH.
+ */
+ if (drain)
+ xen_blk_drain_io(pending_req->blkif);
+
/*
* If we have failed at this point, we need to undo the M2P override,
* set gnttab_set_unmap_op on all of the grant references and perform
@@ -630,7 +679,10 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
if (xen_blkbk_map(req, pending_req, seg))
goto fail_flush;
- /* This corresponding xen_blkif_put is done in __end_block_io_op */
+ /*
+ * This corresponding xen_blkif_put is done in __end_block_io_op, or
+ * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
+ */
xen_blkif_get(blkif);
for (i = 0; i < nseg; i++) {
@@ -654,7 +706,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
preq.sector_number += seg[i].nsec;
}
- /* This will be hit if the operation was a flush. */
+ /* This will be hit if the operation was a flush or discard. */
if (!bio) {
BUG_ON(operation != WRITE_FLUSH);
@@ -685,7 +737,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
if (operation == READ)
blkif->st_rd_sect += preq.nr_sects;
- else if (operation == WRITE || operation == WRITE_FLUSH)
+ else if (operation & WRITE)
blkif->st_wr_sect += preq.nr_sects;
return 0;
@@ -694,7 +746,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
xen_blkbk_unmap(pending_req);
fail_response:
/* Haven't submitted any bio's yet. */
- make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
free_req(pending_req);
msleep(1); /* back off a bit */
return -EIO;
@@ -754,7 +806,7 @@ static int __init xen_blkif_init(void)
int i, mmap_pages;
int rc = 0;
- if (!xen_pv_domain())
+ if (!xen_domain())
return -ENODEV;
blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
@@ -765,9 +817,9 @@ static int __init xen_blkif_init(void)
mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
- blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) *
+ blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
xen_blkif_reqs, GFP_KERNEL);
- blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) *
+ blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
mmap_pages, GFP_KERNEL);
blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
mmap_pages, GFP_KERNEL);
@@ -790,8 +842,6 @@ static int __init xen_blkif_init(void)
if (rc)
goto failed_init;
- memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
-
INIT_LIST_HEAD(&blkbk->pending_free);
spin_lock_init(&blkbk->pending_free_lock);
init_waitqueue_head(&blkbk->pending_free_wq);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 00c57c90e2d6..9ad3b5ec1dc1 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -27,7 +27,6 @@
#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
#define __XEN_BLKIF__BACKEND__COMMON_H__
-#include <linux/version.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
@@ -61,32 +60,66 @@ struct blkif_common_response {
char dummy;
};
-/* i386 protocol version */
-#pragma pack(push, 4)
-struct blkif_x86_32_request {
- uint8_t operation; /* BLKIF_OP_??? */
+struct blkif_x86_32_request_rw {
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+ blkif_vdev_t _pad1; /* was "handle" for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ uint64_t nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ union {
+ struct blkif_x86_32_request_rw rw;
+ struct blkif_x86_32_request_discard discard;
+ } u;
+} __attribute__((__packed__));
+
+/* i386 protocol version */
+#pragma pack(push, 4)
struct blkif_x86_32_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
int16_t status; /* BLKIF_RSP_??? */
};
#pragma pack(pop)
-
/* x86_64 protocol version */
-struct blkif_x86_64_request {
- uint8_t operation; /* BLKIF_OP_??? */
+
+struct blkif_x86_64_request_rw {
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
- uint64_t __attribute__((__aligned__(8))) id;
+ uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */
+ uint64_t id;
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+ blkif_vdev_t _pad1; /* was "handle" for read/write requests */
+ uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */
+ uint64_t id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ uint64_t nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ union {
+ struct blkif_x86_64_request_rw rw;
+ struct blkif_x86_64_request_discard discard;
+ } u;
+} __attribute__((__packed__));
+
struct blkif_x86_64_response {
uint64_t __attribute__((__aligned__(8))) id;
uint8_t operation; /* copied from request */
@@ -126,6 +159,7 @@ struct xen_vbd {
/* Cached size parameter. */
sector_t size;
bool flush_support;
+ bool discard_secure;
};
struct backend_info;
@@ -139,7 +173,7 @@ struct xen_blkif {
/* Comms information. */
enum blkif_protocol blk_protocol;
union blkif_back_rings blk_rings;
- struct vm_struct *blk_ring_area;
+ void *blk_ring;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
@@ -149,6 +183,9 @@ struct xen_blkif {
atomic_t refcnt;
wait_queue_head_t wq;
+ /* for barrier (drain) requests */
+ struct completion drain_complete;
+ atomic_t drain;
/* One thread per one blkif. */
struct task_struct *xenblkd;
unsigned int waiting_reqs;
@@ -159,13 +196,11 @@ struct xen_blkif {
int st_wr_req;
int st_oo_req;
int st_f_req;
+ int st_ds_req;
int st_rd_sect;
int st_wr_sect;
wait_queue_head_t waiting_to_free;
-
- grant_handle_t shmem_handle;
- grant_ref_t shmem_ref;
};
@@ -182,7 +217,7 @@ struct xen_blkif {
struct phys_req {
unsigned short dev;
- unsigned short nr_sects;
+ blkif_sector_t nr_sects;
struct block_device *bdev;
blkif_sector_t sector_number;
};
@@ -196,6 +231,8 @@ int xen_blkif_schedule(void *arg);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state);
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state);
struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
@@ -203,15 +240,30 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
dst->operation = src->operation;
- dst->nr_segments = src->nr_segments;
- dst->handle = src->handle;
- dst->id = src->id;
- dst->u.rw.sector_number = src->sector_number;
- barrier();
- if (n > dst->nr_segments)
- n = dst->nr_segments;
- for (i = 0; i < n; i++)
- dst->u.rw.seg[i] = src->seg[i];
+ switch (src->operation) {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_WRITE_BARRIER:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ dst->u.rw.nr_segments = src->u.rw.nr_segments;
+ dst->u.rw.handle = src->u.rw.handle;
+ dst->u.rw.id = src->u.rw.id;
+ dst->u.rw.sector_number = src->u.rw.sector_number;
+ barrier();
+ if (n > dst->u.rw.nr_segments)
+ n = dst->u.rw.nr_segments;
+ for (i = 0; i < n; i++)
+ dst->u.rw.seg[i] = src->u.rw.seg[i];
+ break;
+ case BLKIF_OP_DISCARD:
+ dst->u.discard.flag = src->u.discard.flag;
+ dst->u.discard.id = src->u.discard.id;
+ dst->u.discard.sector_number = src->u.discard.sector_number;
+ dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+ break;
+ default:
+ break;
+ }
}
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
@@ -219,15 +271,30 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
dst->operation = src->operation;
- dst->nr_segments = src->nr_segments;
- dst->handle = src->handle;
- dst->id = src->id;
- dst->u.rw.sector_number = src->sector_number;
- barrier();
- if (n > dst->nr_segments)
- n = dst->nr_segments;
- for (i = 0; i < n; i++)
- dst->u.rw.seg[i] = src->seg[i];
+ switch (src->operation) {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_WRITE_BARRIER:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ dst->u.rw.nr_segments = src->u.rw.nr_segments;
+ dst->u.rw.handle = src->u.rw.handle;
+ dst->u.rw.id = src->u.rw.id;
+ dst->u.rw.sector_number = src->u.rw.sector_number;
+ barrier();
+ if (n > dst->u.rw.nr_segments)
+ n = dst->u.rw.nr_segments;
+ for (i = 0; i < n; i++)
+ dst->u.rw.seg[i] = src->u.rw.seg[i];
+ break;
+ case BLKIF_OP_DISCARD:
+ dst->u.discard.flag = src->u.discard.flag;
+ dst->u.discard.id = src->u.discard.id;
+ dst->u.discard.sector_number = src->u.discard.sector_number;
+ dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+ break;
+ default:
+ break;
+ }
}
#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 5fd2010f7d2b..4f66171c6683 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -114,44 +114,14 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
spin_lock_init(&blkif->blk_ring_lock);
atomic_set(&blkif->refcnt, 1);
init_waitqueue_head(&blkif->wq);
+ init_completion(&blkif->drain_complete);
+ atomic_set(&blkif->drain, 0);
blkif->st_print = jiffies;
init_waitqueue_head(&blkif->waiting_to_free);
return blkif;
}
-static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page)
-{
- struct gnttab_map_grant_ref op;
-
- gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
- GNTMAP_host_map, shared_page, blkif->domid);
-
- if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
- BUG();
-
- if (op.status) {
- DPRINTK("Grant table operation failure !\n");
- return op.status;
- }
-
- blkif->shmem_ref = shared_page;
- blkif->shmem_handle = op.handle;
-
- return 0;
-}
-
-static void unmap_frontend_page(struct xen_blkif *blkif)
-{
- struct gnttab_unmap_grant_ref op;
-
- gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
- GNTMAP_host_map, blkif->shmem_handle);
-
- if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
- BUG();
-}
-
static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
unsigned int evtchn)
{
@@ -161,35 +131,29 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
if (blkif->irq)
return 0;
- blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE);
- if (!blkif->blk_ring_area)
- return -ENOMEM;
-
- err = map_frontend_page(blkif, shared_page);
- if (err) {
- free_vm_area(blkif->blk_ring_area);
+ err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring);
+ if (err < 0)
return err;
- }
switch (blkif->blk_protocol) {
case BLKIF_PROTOCOL_NATIVE:
{
struct blkif_sring *sring;
- sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
+ sring = (struct blkif_sring *)blkif->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
break;
}
case BLKIF_PROTOCOL_X86_32:
{
struct blkif_x86_32_sring *sring_x86_32;
- sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
+ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
break;
}
case BLKIF_PROTOCOL_X86_64:
{
struct blkif_x86_64_sring *sring_x86_64;
- sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
+ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
break;
}
@@ -201,8 +165,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
xen_blkif_be_int, 0,
"blkif-backend", blkif);
if (err < 0) {
- unmap_frontend_page(blkif);
- free_vm_area(blkif->blk_ring_area);
+ xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
blkif->blk_rings.common.sring = NULL;
return err;
}
@@ -228,8 +191,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
}
if (blkif->blk_rings.common.sring) {
- unmap_frontend_page(blkif);
- free_vm_area(blkif->blk_ring_area);
+ xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
blkif->blk_rings.common.sring = NULL;
}
}
@@ -272,6 +234,7 @@ VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req);
+VBD_SHOW(ds_req, "%d\n", be->blkif->st_ds_req);
VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
@@ -280,6 +243,7 @@ static struct attribute *xen_vbdstat_attrs[] = {
&dev_attr_rd_req.attr,
&dev_attr_wr_req.attr,
&dev_attr_f_req.attr,
+ &dev_attr_ds_req.attr,
&dev_attr_rd_sect.attr,
&dev_attr_wr_sect.attr,
NULL
@@ -374,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
if (q && q->flush_flags)
vbd->flush_support = true;
+ if (q && blk_queue_secdiscard(q))
+ vbd->discard_secure = true;
+
DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
handle, blkif->domid);
return 0;
@@ -414,7 +381,60 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
"%d", state);
if (err)
- xenbus_dev_fatal(dev, err, "writing feature-flush-cache");
+ dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err);
+
+ return err;
+}
+
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ struct xen_blkif *blkif = be->blkif;
+ int err;
+ int state = 0;
+ struct block_device *bdev = be->blkif->vbd.bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (blk_queue_discard(q)) {
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-granularity", "%u",
+ q->limits.discard_granularity);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+ return;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-alignment", "%u",
+ q->limits.discard_alignment);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+ return;
+ }
+ state = 1;
+ /* Optional. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-secure", "%d",
+ blkif->vbd.discard_secure);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-secure (%d)", err);
+ return;
+ }
+ }
+ err = xenbus_printf(xbt, dev->nodename, "feature-discard",
+ "%d", state);
+ if (err)
+ dev_warn(&dev->dev, "writing feature-discard (%d)", err);
+}
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+ "%d", state);
+ if (err)
+ dev_warn(&dev->dev, "writing feature-barrier (%d)", err);
return err;
}
@@ -582,7 +602,7 @@ static void frontend_changed(struct xenbus_device *dev,
case XenbusStateConnected:
/*
* Ensure we connect even when two watches fire in
- * close successsion and we miss the intermediate value
+ * close succession and we miss the intermediate value
* of frontend_state.
*/
if (dev->state == XenbusStateConnected)
@@ -646,9 +666,12 @@ again:
return;
}
- err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
- if (err)
- goto abort;
+ /* If we can't advertise it is OK. */
+ xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
+
+ xen_blkbk_discard(xbt, be);
+
+ xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -751,17 +774,14 @@ static const struct xenbus_device_id xen_blkbk_ids[] = {
};
-static struct xenbus_driver xen_blkbk = {
- .name = "vbd",
- .owner = THIS_MODULE,
- .ids = xen_blkbk_ids,
+static DEFINE_XENBUS_DRIVER(xen_blkbk, ,
.probe = xen_blkbk_probe,
.remove = xen_blkbk_remove,
.otherend_changed = frontend_changed
-};
+);
int xen_blkif_xenbus_init(void)
{
- return xenbus_register_backend(&xen_blkbk);
+ return xenbus_register_backend(&xen_blkbk_driver);
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9ea8c2576c70..2c2d2e5c1597 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -43,6 +43,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
+#include <linux/bitmap.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
@@ -81,6 +82,7 @@ static const struct block_device_operations xlvbd_block_fops;
*/
struct blkfront_info
{
+ spinlock_t io_lock;
struct mutex mutex;
struct xenbus_device *xbdev;
struct gendisk *gd;
@@ -98,11 +100,13 @@ struct blkfront_info
unsigned long shadow_free;
unsigned int feature_flush;
unsigned int flush_op;
+ unsigned int feature_discard:1;
+ unsigned int feature_secdiscard:1;
+ unsigned int discard_granularity;
+ unsigned int discard_alignment;
int is_ready;
};
-static DEFINE_SPINLOCK(blkif_io_lock);
-
static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);
@@ -132,19 +136,41 @@ static int get_id_from_freelist(struct blkfront_info *info)
{
unsigned long free = info->shadow_free;
BUG_ON(free >= BLK_RING_SIZE);
- info->shadow_free = info->shadow[free].req.id;
- info->shadow[free].req.id = 0x0fffffee; /* debug */
+ info->shadow_free = info->shadow[free].req.u.rw.id;
+ info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
return free;
}
-static void add_id_to_freelist(struct blkfront_info *info,
+static int add_id_to_freelist(struct blkfront_info *info,
unsigned long id)
{
- info->shadow[id].req.id = info->shadow_free;
+ if (info->shadow[id].req.u.rw.id != id)
+ return -EINVAL;
+ if (info->shadow[id].request == NULL)
+ return -EINVAL;
+ info->shadow[id].req.u.rw.id = info->shadow_free;
info->shadow[id].request = NULL;
info->shadow_free = id;
+ return 0;
}
+static const char *op_name(int op)
+{
+ static const char *const names[] = {
+ [BLKIF_OP_READ] = "read",
+ [BLKIF_OP_WRITE] = "write",
+ [BLKIF_OP_WRITE_BARRIER] = "barrier",
+ [BLKIF_OP_FLUSH_DISKCACHE] = "flush",
+ [BLKIF_OP_DISCARD] = "discard" };
+
+ if (op < 0 || op >= ARRAY_SIZE(names))
+ return "unknown";
+
+ if (!names[op])
+ return "reserved";
+
+ return names[op];
+}
static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
{
unsigned int end = minor + nr;
@@ -153,7 +179,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
if (end > nr_minors) {
unsigned long *bitmap, *old;
- bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+ bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
GFP_KERNEL);
if (bitmap == NULL)
return -ENOMEM;
@@ -173,8 +199,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
spin_lock(&minor_lock);
if (find_next_bit(minors, end, minor) >= end) {
- for (; minor < end; ++minor)
- __set_bit(minor, minors);
+ bitmap_set(minors, minor, nr);
rc = 0;
} else
rc = -EBUSY;
@@ -189,8 +214,7 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
BUG_ON(end > nr_minors);
spin_lock(&minor_lock);
- for (; minor < end; ++minor)
- __clear_bit(minor, minors);
+ bitmap_clear(minors, minor, nr);
spin_unlock(&minor_lock);
}
@@ -284,9 +308,9 @@ static int blkif_queue_request(struct request *req)
id = get_id_from_freelist(info);
info->shadow[id].request = req;
- ring_req->id = id;
+ ring_req->u.rw.id = id;
ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
- ring_req->handle = info->handle;
+ ring_req->u.rw.handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
@@ -302,29 +326,41 @@ static int blkif_queue_request(struct request *req)
ring_req->operation = info->flush_op;
}
- ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
- BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
- for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
- buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
- fsect = sg->offset >> 9;
- lsect = fsect + (sg->length >> 9) - 1;
- /* install a grant reference. */
- ref = gnttab_claim_grant_reference(&gref_head);
- BUG_ON(ref == -ENOSPC);
-
- gnttab_grant_foreign_access_ref(
- ref,
- info->xbdev->otherend_id,
- buffer_mfn,
- rq_data_dir(req) );
-
- info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
- ring_req->u.rw.seg[i] =
- (struct blkif_request_segment) {
- .gref = ref,
- .first_sect = fsect,
- .last_sect = lsect };
+ if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
+ /* id, sector_number and handle are set above. */
+ ring_req->operation = BLKIF_OP_DISCARD;
+ ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+ if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+ ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+ else
+ ring_req->u.discard.flag = 0;
+ } else {
+ ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
+ info->sg);
+ BUG_ON(ring_req->u.rw.nr_segments >
+ BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+ for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
+ buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
+ fsect = sg->offset >> 9;
+ lsect = fsect + (sg->length >> 9) - 1;
+ /* install a grant reference. */
+ ref = gnttab_claim_grant_reference(&gref_head);
+ BUG_ON(ref == -ENOSPC);
+
+ gnttab_grant_foreign_access_ref(
+ ref,
+ info->xbdev->otherend_id,
+ buffer_mfn,
+ rq_data_dir(req));
+
+ info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
+ ring_req->u.rw.seg[i] =
+ (struct blkif_request_segment) {
+ .gref = ref,
+ .first_sect = fsect,
+ .last_sect = lsect };
+ }
}
info->ring.req_prod_pvt++;
@@ -370,7 +406,9 @@ static void do_blkif_request(struct request_queue *rq)
blk_start_request(req);
- if (req->cmd_type != REQ_TYPE_FS) {
+ if ((req->cmd_type != REQ_TYPE_FS) ||
+ ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
+ !info->flush_op)) {
__blk_end_request_all(req, -EIO);
continue;
}
@@ -399,13 +437,23 @@ wait:
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
{
struct request_queue *rq;
+ struct blkfront_info *info = gd->private_data;
- rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+ rq = blk_init_queue(do_blkif_request, &info->io_lock);
if (rq == NULL)
return -1;
queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+ if (info->feature_discard) {
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
+ blk_queue_max_discard_sectors(rq, get_capacity(gd));
+ rq->limits.discard_granularity = info->discard_granularity;
+ rq->limits.discard_alignment = info->discard_alignment;
+ if (info->feature_secdiscard)
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+ }
+
/* Hard sector size and max sectors impersonate the equiv. hardware. */
blk_queue_logical_block_size(rq, sector_size);
blk_queue_max_hw_sectors(rq, 512);
@@ -500,6 +548,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
return 0;
}
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+ if (n >= 26)
+ ptr = encode_disk_name(ptr, n / 26 - 1);
+ *ptr = 'a' + n % 26;
+ return ptr + 1;
+}
+
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info,
u16 vdisk_info, u16 sector_size)
@@ -510,6 +566,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
unsigned int offset;
int minor;
int nr_parts;
+ char *ptr;
BUG_ON(info->gd != NULL);
BUG_ON(info->rq != NULL);
@@ -534,7 +591,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
"emulated IDE disks,\n\t choose an xvd device name"
"from xvde on\n", info->vdevice);
}
- err = -ENODEV;
+ if (minor >> MINORBITS) {
+ pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+ info->vdevice, minor);
+ return -ENODEV;
+ }
if ((minor % nr_parts) == 0)
nr_minors = nr_parts;
@@ -548,23 +609,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (gd == NULL)
goto release;
- if (nr_minors > 1) {
- if (offset < 26)
- sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
- else
- sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
- 'a' + ((offset / 26)-1), 'a' + (offset % 26));
- } else {
- if (offset < 26)
- sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
- 'a' + offset,
- minor & (nr_parts - 1));
- else
- sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
- 'a' + ((offset / 26) - 1),
- 'a' + (offset % 26),
- minor & (nr_parts - 1));
- }
+ strcpy(gd->disk_name, DEV_NAME);
+ ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+ BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+ if (nr_minors > 1)
+ *ptr = 0;
+ else
+ snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+ "%d", minor & (nr_parts - 1));
gd->major = XENVBD_MAJOR;
gd->first_minor = minor;
@@ -608,14 +660,14 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
if (info->rq == NULL)
return;
- spin_lock_irqsave(&blkif_io_lock, flags);
+ spin_lock_irqsave(&info->io_lock, flags);
/* No more blkif_request(). */
blk_stop_queue(info->rq);
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+ spin_unlock_irqrestore(&info->io_lock, flags);
/* Flush gnttab callback work. Must be done with no locks held. */
flush_work_sync(&info->work);
@@ -647,16 +699,16 @@ static void blkif_restart_queue(struct work_struct *work)
{
struct blkfront_info *info = container_of(work, struct blkfront_info, work);
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
if (info->connected == BLKIF_STATE_CONNECTED)
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
}
static void blkif_free(struct blkfront_info *info, int suspend)
{
/* Prevent new requests being issued until we fix things up. */
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
@@ -664,7 +716,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
blk_stop_queue(info->rq);
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
/* Flush gnttab callback work. Must be done with no locks held. */
flush_work_sync(&info->work);
@@ -685,7 +737,9 @@ static void blkif_free(struct blkfront_info *info, int suspend)
static void blkif_completion(struct blk_shadow *s)
{
int i;
- for (i = 0; i < s->req.nr_segments; i++)
+ /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
+ * flag. */
+ for (i = 0; i < s->req.u.rw.nr_segments; i++)
gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
}
@@ -698,10 +752,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
struct blkfront_info *info = (struct blkfront_info *)dev_id;
int error;
- spin_lock_irqsave(&blkif_io_lock, flags);
+ spin_lock_irqsave(&info->io_lock, flags);
if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+ spin_unlock_irqrestore(&info->io_lock, flags);
return IRQ_HANDLED;
}
@@ -714,29 +768,55 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
bret = RING_GET_RESPONSE(&info->ring, i);
id = bret->id;
+ /*
+ * The backend has messed up and given us an id that we would
+ * never have given to it (we stamp it up to BLK_RING_SIZE -
+ * look in get_id_from_freelist.
+ */
+ if (id >= BLK_RING_SIZE) {
+ WARN(1, "%s: response to %s has incorrect id (%ld)\n",
+ info->gd->disk_name, op_name(bret->operation), id);
+ /* We can't safely get the 'struct request' as
+ * the id is busted. */
+ continue;
+ }
req = info->shadow[id].request;
- blkif_completion(&info->shadow[id]);
+ if (bret->operation != BLKIF_OP_DISCARD)
+ blkif_completion(&info->shadow[id]);
- add_id_to_freelist(info, id);
+ if (add_id_to_freelist(info, id)) {
+ WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
+ info->gd->disk_name, op_name(bret->operation), id);
+ continue;
+ }
error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) {
+ case BLKIF_OP_DISCARD:
+ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+ struct request_queue *rq = info->rq;
+ printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+ info->gd->disk_name, op_name(bret->operation));
+ error = -EOPNOTSUPP;
+ info->feature_discard = 0;
+ info->feature_secdiscard = 0;
+ queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+ queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+ }
+ __blk_end_request_all(req, error);
+ break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
- printk(KERN_WARNING "blkfront: %s: write %s op failed\n",
- info->flush_op == BLKIF_OP_WRITE_BARRIER ?
- "barrier" : "flush disk cache",
- info->gd->disk_name);
+ printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+ info->gd->disk_name, op_name(bret->operation));
error = -EOPNOTSUPP;
}
if (unlikely(bret->status == BLKIF_RSP_ERROR &&
- info->shadow[id].req.nr_segments == 0)) {
- printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
- info->flush_op == BLKIF_OP_WRITE_BARRIER ?
- "barrier" : "flush disk cache",
- info->gd->disk_name);
+ info->shadow[id].req.u.rw.nr_segments == 0)) {
+ printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
+ info->gd->disk_name, op_name(bret->operation));
error = -EOPNOTSUPP;
}
if (unlikely(error)) {
@@ -772,7 +852,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
kick_pending_request_queues(info);
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+ spin_unlock_irqrestore(&info->io_lock, flags);
return IRQ_HANDLED;
}
@@ -808,9 +888,8 @@ static int setup_blkring(struct xenbus_device *dev,
if (err)
goto fail;
- err = bind_evtchn_to_irqhandler(info->evtchn,
- blkif_interrupt,
- IRQF_SAMPLE_RANDOM, "blkif", info);
+ err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
+ "blkif", info);
if (err <= 0) {
xenbus_dev_fatal(dev, err,
"bind_evtchn_to_irqhandler failed");
@@ -947,14 +1026,15 @@ static int blkfront_probe(struct xenbus_device *dev,
}
mutex_init(&info->mutex);
+ spin_lock_init(&info->io_lock);
info->xbdev = dev;
info->vdevice = vdevice;
info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
for (i = 0; i < BLK_RING_SIZE; i++)
- info->shadow[i].req.id = i+1;
- info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+ info->shadow[i].req.u.rw.id = i+1;
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
/* Front end dir is a number, which is used as the id. */
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -988,9 +1068,9 @@ static int blkif_recover(struct blkfront_info *info)
/* Stage 2: Set up free list. */
memset(&info->shadow, 0, sizeof(info->shadow));
for (i = 0; i < BLK_RING_SIZE; i++)
- info->shadow[i].req.id = i+1;
+ info->shadow[i].req.u.rw.id = i+1;
info->shadow_free = info->ring.req_prod_pvt;
- info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
/* Stage 3: Find pending requests and requeue them. */
for (i = 0; i < BLK_RING_SIZE; i++) {
@@ -1003,17 +1083,19 @@ static int blkif_recover(struct blkfront_info *info)
*req = copy[i].req;
/* We get a new request id, and must reset the shadow state. */
- req->id = get_id_from_freelist(info);
- memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+ req->u.rw.id = get_id_from_freelist(info);
+ memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
+ if (req->operation != BLKIF_OP_DISCARD) {
/* Rewrite any grant references invalidated by susp/resume. */
- for (j = 0; j < req->nr_segments; j++)
- gnttab_grant_foreign_access_ref(
- req->u.rw.seg[j].gref,
- info->xbdev->otherend_id,
- pfn_to_mfn(info->shadow[req->id].frame[j]),
- rq_data_dir(info->shadow[req->id].request));
- info->shadow[req->id].req = *req;
+ for (j = 0; j < req->u.rw.nr_segments; j++)
+ gnttab_grant_foreign_access_ref(
+ req->u.rw.seg[j].gref,
+ info->xbdev->otherend_id,
+ pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
+ rq_data_dir(info->shadow[req->u.rw.id].request));
+ }
+ info->shadow[req->u.rw.id].req = *req;
info->ring.req_prod_pvt++;
}
@@ -1022,7 +1104,7 @@ static int blkif_recover(struct blkfront_info *info)
xenbus_switch_state(info->xbdev, XenbusStateConnected);
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
@@ -1033,7 +1115,7 @@ static int blkif_recover(struct blkfront_info *info)
/* Kick any other new requests queued since we resumed */
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
return 0;
}
@@ -1098,6 +1180,41 @@ blkfront_closing(struct blkfront_info *info)
bdput(bdev);
}
+static void blkfront_setup_discard(struct blkfront_info *info)
+{
+ int err;
+ char *type;
+ unsigned int discard_granularity;
+ unsigned int discard_alignment;
+ unsigned int discard_secure;
+
+ type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
+ if (IS_ERR(type))
+ return;
+
+ info->feature_secdiscard = 0;
+ if (strncmp(type, "phy", 3) == 0) {
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "discard-granularity", "%u", &discard_granularity,
+ "discard-alignment", "%u", &discard_alignment,
+ NULL);
+ if (!err) {
+ info->feature_discard = 1;
+ info->discard_granularity = discard_granularity;
+ info->discard_alignment = discard_alignment;
+ }
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "discard-secure", "%d", &discard_secure,
+ NULL);
+ if (!err)
+ info->feature_secdiscard = discard_secure;
+
+ } else if (strncmp(type, "file", 4) == 0)
+ info->feature_discard = 1;
+
+ kfree(type);
+}
+
/*
* Invoked when the backend is finally 'ready' (and has told produced
* the details about the physical device - #sectors, size, etc).
@@ -1108,7 +1225,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size;
unsigned int binfo;
int err;
- int barrier, flush;
+ int barrier, flush, discard;
switch (info->connected) {
case BLKIF_STATE_CONNECTED:
@@ -1178,7 +1295,14 @@ static void blkfront_connect(struct blkfront_info *info)
info->feature_flush = REQ_FLUSH;
info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
}
-
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-discard", "%d", &discard,
+ NULL);
+
+ if (!err && discard)
+ blkfront_setup_discard(info);
+
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
@@ -1189,10 +1313,10 @@ static void blkfront_connect(struct blkfront_info *info)
xenbus_switch_state(info->xbdev, XenbusStateConnected);
/* Kick pending requests. */
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
info->connected = BLKIF_STATE_CONNECTED;
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
add_disk(info->gd);
@@ -1322,7 +1446,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
mutex_lock(&blkfront_mutex);
bdev = bdget_disk(disk, 0);
- bdput(bdev);
if (bdev->bd_openers)
goto out;
@@ -1353,6 +1476,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
}
out:
+ bdput(bdev);
mutex_unlock(&blkfront_mutex);
return 0;
}
@@ -1372,36 +1496,46 @@ static const struct xenbus_device_id blkfront_ids[] = {
{ "" }
};
-static struct xenbus_driver blkfront = {
- .name = "vbd",
- .owner = THIS_MODULE,
- .ids = blkfront_ids,
+static DEFINE_XENBUS_DRIVER(blkfront, ,
.probe = blkfront_probe,
.remove = blkfront_remove,
.resume = blkfront_resume,
.otherend_changed = blkback_changed,
.is_ready = blkfront_is_ready,
-};
+);
static int __init xlblk_init(void)
{
+ int ret;
+
if (!xen_domain())
return -ENODEV;
+ if (xen_hvm_domain() && !xen_platform_pci_unplug)
+ return -ENODEV;
+
if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
XENVBD_MAJOR, DEV_NAME);
return -ENODEV;
}
- return xenbus_register_frontend(&blkfront);
+ ret = xenbus_register_frontend(&blkfront_driver);
+ if (ret) {
+ unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+ return ret;
+ }
+
+ return 0;
}
module_init(xlblk_init);
static void __exit xlblk_exit(void)
{
- return xenbus_unregister_driver(&blkfront);
+ xenbus_unregister_driver(&blkfront_driver);
+ unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+ kfree(minors);
}
module_exit(xlblk_exit);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index fb1975d82a73..1a17e338735e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -456,7 +456,7 @@ static inline void ace_fsm_yieldirq(struct ace_device *ace)
{
dev_dbg(ace->dev, "ace_fsm_yieldirq()\n");
- if (ace->irq == NO_IRQ)
+ if (!ace->irq)
/* No IRQ assigned, so need to poll */
tasklet_schedule(&ace->fsm_tasklet);
ace->fsm_continue_flag = 0;
@@ -1034,12 +1034,12 @@ static int __devinit ace_setup(struct ace_device *ace)
ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ);
/* Now we can hook up the irq handler */
- if (ace->irq != NO_IRQ) {
+ if (ace->irq) {
rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace);
if (rc) {
/* Failure - fall back to polled mode */
dev_err(ace->dev, "request_irq failed\n");
- ace->irq = NO_IRQ;
+ ace->irq = 0;
}
}
@@ -1086,7 +1086,7 @@ static void __devexit ace_teardown(struct ace_device *ace)
tasklet_kill(&ace->fsm_tasklet);
- if (ace->irq != NO_IRQ)
+ if (ace->irq)
free_irq(ace->irq, ace);
iounmap(ace->baseaddr);
@@ -1156,7 +1156,7 @@ static int __devinit ace_probe(struct platform_device *dev)
resource_size_t physaddr = 0;
int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */
u32 id = dev->id;
- int irq = NO_IRQ;
+ int irq = 0;
int i;
dev_dbg(&dev->dev, "ace_probe(%p)\n", dev);