aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/DAC960.c3
-rw-r--r--drivers/block/Kconfig23
-rw-r--r--drivers/block/Makefile3
-rw-r--r--drivers/block/drbd/drbd_main.c29
-rw-r--r--drivers/block/loop.c87
-rw-r--r--drivers/block/mtip32xx/Kconfig2
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c431
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h48
-rw-r--r--drivers/block/nbd.c38
-rw-r--r--drivers/block/rbd.c1852
-rw-r--r--drivers/block/rsxx/Makefile2
-rw-r--r--drivers/block/rsxx/config.c213
-rw-r--r--drivers/block/rsxx/core.c649
-rw-r--r--drivers/block/rsxx/cregs.c758
-rw-r--r--drivers/block/rsxx/dev.c367
-rw-r--r--drivers/block/rsxx/dma.c998
-rw-r--r--drivers/block/rsxx/rsxx.h45
-rw-r--r--drivers/block/rsxx/rsxx_cfg.h72
-rw-r--r--drivers/block/rsxx/rsxx_priv.h399
-rw-r--r--drivers/block/swim3.c5
-rw-r--r--drivers/block/xd.c1123
-rw-r--r--drivers/block/xd.h134
-rw-r--r--drivers/block/xen-blkback/blkback.c7
-rw-r--r--drivers/block/xen-blkback/xenbus.c49
-rw-r--r--drivers/block/xen-blkfront.c13
25 files changed, 5200 insertions, 2150 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 9a13e889837e..5b5ee79ff236 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6547,7 +6547,7 @@ static ssize_t dac960_user_command_proc_write(struct file *file,
const char __user *Buffer,
size_t Count, loff_t *pos)
{
- DAC960_Controller_T *Controller = (DAC960_Controller_T *) PDE(file->f_path.dentry->d_inode)->data;
+ DAC960_Controller_T *Controller = (DAC960_Controller_T *) PDE(file_inode(file))->data;
unsigned char CommandBuffer[80];
int Length;
if (Count > sizeof(CommandBuffer)-1) return -EINVAL;
@@ -7054,6 +7054,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
else
ErrorCode = 0;
}
+ break;
default:
ErrorCode = -ENOTTY;
}
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 824e09c4d0d7..5dc0daed8fac 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -63,19 +63,6 @@ config AMIGA_Z2RAM
To compile this driver as a module, choose M here: the
module will be called z2ram.
-config BLK_DEV_XD
- tristate "XT hard disk support"
- depends on ISA && ISA_DMA_API
- select CHECK_SIGNATURE
- help
- Very old 8 bit hard disk controllers used in the IBM XT computer
- will be supported if you say Y here.
-
- To compile this driver as a module, choose M here: the
- module will be called xd.
-
- It's pretty unlikely that you have one of these: say N.
-
config GDROM
tristate "SEGA Dreamcast GD-ROM drive"
depends on SH_DREAMCAST
@@ -544,4 +531,14 @@ config BLK_DEV_RBD
If unsure, say N.
+config BLK_DEV_RSXX
+ tristate "RamSan PCIe Flash SSD Device Driver"
+ depends on PCI
+ help
+ Device driver for IBM's high speed PCIe SSD
+ storage devices: RamSan-70 and RamSan-80.
+
+ To compile this driver as a module, choose M here: the
+ module will be called rsxx.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 17e82df3df74..a3b40232c6ab 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
-obj-$(CONFIG_BLK_DEV_XD) += xd.o
obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o
@@ -41,4 +40,6 @@ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
+obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
+
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8c13eeb83c53..e98da675f0c1 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2660,25 +2660,24 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
mdev->read_requests = RB_ROOT;
mdev->write_requests = RB_ROOT;
- if (!idr_pre_get(&minors, GFP_KERNEL))
- goto out_no_minor_idr;
- if (idr_get_new_above(&minors, mdev, minor, &minor_got))
+ minor_got = idr_alloc(&minors, mdev, minor, minor + 1, GFP_KERNEL);
+ if (minor_got < 0) {
+ if (minor_got == -ENOSPC) {
+ err = ERR_MINOR_EXISTS;
+ drbd_msg_put_info("requested minor exists already");
+ }
goto out_no_minor_idr;
- if (minor_got != minor) {
- err = ERR_MINOR_EXISTS;
- drbd_msg_put_info("requested minor exists already");
- goto out_idr_remove_minor;
}
- if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
- goto out_idr_remove_minor;
- if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
+ vnr_got = idr_alloc(&tconn->volumes, mdev, vnr, vnr + 1, GFP_KERNEL);
+ if (vnr_got < 0) {
+ if (vnr_got == -ENOSPC) {
+ err = ERR_INVALID_REQUEST;
+ drbd_msg_put_info("requested volume exists already");
+ }
goto out_idr_remove_minor;
- if (vnr_got != vnr) {
- err = ERR_INVALID_REQUEST;
- drbd_msg_put_info("requested volume exists already");
- goto out_idr_remove_vol;
}
+
add_disk(disk);
kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */
@@ -2689,8 +2688,6 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
return NO_ERROR;
-out_idr_remove_vol:
- idr_remove(&tconn->volumes, vnr_got);
out_idr_remove_minor:
idr_remove(&minors, minor_got);
synchronize_rcu();
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ae1251270624..747bb2af69dc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -162,12 +162,13 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
{
- loff_t size, loopsize;
+ loff_t loopsize;
/* Compute loopsize in bytes */
- size = i_size_read(file->f_mapping->host);
- loopsize = size - offset;
- /* offset is beyond i_size, wierd but possible */
+ loopsize = i_size_read(file->f_mapping->host);
+ if (offset > 0)
+ loopsize -= offset;
+ /* offset is beyond i_size, weird but possible */
if (loopsize < 0)
return 0;
@@ -190,6 +191,7 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
{
loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
sector_t x = (sector_t)size;
+ struct block_device *bdev = lo->lo_device;
if (unlikely((loff_t)x != size))
return -EFBIG;
@@ -198,6 +200,9 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
if (lo->lo_sizelimit != sizelimit)
lo->lo_sizelimit = sizelimit;
set_capacity(lo->lo_disk, x);
+ bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
+ /* let user-space know about the new size */
+ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
return 0;
}
@@ -1091,10 +1096,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
return err;
if (lo->lo_offset != info->lo_offset ||
- lo->lo_sizelimit != info->lo_sizelimit) {
+ lo->lo_sizelimit != info->lo_sizelimit)
if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit))
return -EFBIG;
- }
+
loop_config_discard(lo);
memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
@@ -1139,7 +1144,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
if (lo->lo_state != Lo_bound)
return -ENXIO;
- error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat);
+ error = vfs_getattr(&file->f_path, &stat);
if (error)
return error;
memset(info, 0, sizeof(*info));
@@ -1271,28 +1276,10 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
{
- int err;
- sector_t sec;
- loff_t sz;
-
- err = -ENXIO;
if (unlikely(lo->lo_state != Lo_bound))
- goto out;
- err = figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
- if (unlikely(err))
- goto out;
- sec = get_capacity(lo->lo_disk);
- /* the width of sector_t may be narrow for bit-shift */
- sz = sec;
- sz <<= 9;
- mutex_lock(&bdev->bd_mutex);
- bd_set_size(bdev, sz);
- /* let user-space know about the new size */
- kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
- mutex_unlock(&bdev->bd_mutex);
+ return -ENXIO;
- out:
- return err;
+ return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
}
static int lo_ioctl(struct block_device *bdev, fmode_t mode,
@@ -1624,30 +1611,17 @@ static int loop_add(struct loop_device **l, int i)
if (!lo)
goto out;
- if (!idr_pre_get(&loop_index_idr, GFP_KERNEL))
- goto out_free_dev;
-
+ /* allocate id, if @id >= 0, we're requesting that specific id */
if (i >= 0) {
- int m;
-
- /* create specific i in the index */
- err = idr_get_new_above(&loop_index_idr, lo, i, &m);
- if (err >= 0 && i != m) {
- idr_remove(&loop_index_idr, m);
+ err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
+ if (err == -ENOSPC)
err = -EEXIST;
- }
- } else if (i == -1) {
- int m;
-
- /* get next free nr */
- err = idr_get_new(&loop_index_idr, lo, &m);
- if (err >= 0)
- i = m;
} else {
- err = -EINVAL;
+ err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
}
if (err < 0)
goto out_free_dev;
+ i = err;
lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
if (!lo->lo_queue)
@@ -1858,11 +1832,15 @@ static int __init loop_init(void)
max_part = (1UL << part_shift) - 1;
}
- if ((1UL << part_shift) > DISK_MAX_PARTS)
- return -EINVAL;
+ if ((1UL << part_shift) > DISK_MAX_PARTS) {
+ err = -EINVAL;
+ goto misc_out;
+ }
- if (max_loop > 1UL << (MINORBITS - part_shift))
- return -EINVAL;
+ if (max_loop > 1UL << (MINORBITS - part_shift)) {
+ err = -EINVAL;
+ goto misc_out;
+ }
/*
* If max_loop is specified, create that many devices upfront.
@@ -1880,8 +1858,10 @@ static int __init loop_init(void)
range = 1UL << MINORBITS;
}
- if (register_blkdev(LOOP_MAJOR, "loop"))
- return -EIO;
+ if (register_blkdev(LOOP_MAJOR, "loop")) {
+ err = -EIO;
+ goto misc_out;
+ }
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);
@@ -1894,6 +1874,10 @@ static int __init loop_init(void)
printk(KERN_INFO "loop: module loaded\n");
return 0;
+
+misc_out:
+ misc_deregister(&loop_misc);
+ return err;
}
static int loop_exit_cb(int id, void *ptr, void *data)
@@ -1911,7 +1895,6 @@ static void __exit loop_exit(void)
range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
- idr_remove_all(&loop_index_idr);
idr_destroy(&loop_index_idr);
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
index 0ba837fc62a8..1fca1f996b45 100644
--- a/drivers/block/mtip32xx/Kconfig
+++ b/drivers/block/mtip32xx/Kconfig
@@ -4,6 +4,6 @@
config BLK_DEV_PCIESSD_MTIP32XX
tristate "Block Device Driver for Micron PCIe SSDs"
- depends on PCI
+ depends on PCI && GENERIC_HARDIRQS
help
This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3fd100990453..11cc9522cdd4 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -88,6 +88,8 @@ static int instance;
static int mtip_major;
static struct dentry *dfs_parent;
+static u32 cpu_use[NR_CPUS];
+
static DEFINE_SPINLOCK(rssd_index_lock);
static DEFINE_IDA(rssd_index_ida);
@@ -296,16 +298,17 @@ static int hba_reset_nosleep(struct driver_data *dd)
*/
static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
{
- atomic_set(&port->commands[tag].active, 1);
+ int group = tag >> 5;
- spin_lock(&port->cmd_issue_lock);
+ atomic_set(&port->commands[tag].active, 1);
+ /* guard SACT and CI registers */
+ spin_lock(&port->cmd_issue_lock[group]);
writel((1 << MTIP_TAG_BIT(tag)),
port->s_active[MTIP_TAG_INDEX(tag)]);
writel((1 << MTIP_TAG_BIT(tag)),
port->cmd_issue[MTIP_TAG_INDEX(tag)]);
-
- spin_unlock(&port->cmd_issue_lock);
+ spin_unlock(&port->cmd_issue_lock[group]);
/* Set the command's timeout value.*/
port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
@@ -964,56 +967,56 @@ handle_tfe_exit:
/*
* Handle a set device bits interrupt
*/
-static inline void mtip_process_sdbf(struct driver_data *dd)
+static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
+ u32 completed)
{
- struct mtip_port *port = dd->port;
- int group, tag, bit;
- u32 completed;
+ struct driver_data *dd = port->dd;
+ int tag, bit;
struct mtip_cmd *command;
- /* walk all bits in all slot groups */
- for (group = 0; group < dd->slot_groups; group++) {
- completed = readl(port->completed[group]);
- if (!completed)
- continue;
+ if (!completed) {
+ WARN_ON_ONCE(!completed);
+ return;
+ }
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
- /* clear completed status register in the hardware.*/
- writel(completed, port->completed[group]);
+ /* Process completed commands. */
+ for (bit = 0; (bit < 32) && completed; bit++) {
+ if (completed & 0x01) {
+ tag = (group << 5) | bit;
- /* Process completed commands. */
- for (bit = 0;
- (bit < 32) && completed;
- bit++, completed >>= 1) {
- if (completed & 0x01) {
- tag = (group << 5) | bit;
+			/*
+			 * Skip the internal command slot, but still consume its
+			 * bit: "continue" bypasses the shift at the bottom of the
+			 * loop, which would otherwise leave bit and completed out
+			 * of step for every following tag in this group.
+			 */
+			if (unlikely(tag == MTIP_TAG_INTERNAL)) {
+				completed >>= 1;
+				continue;
+			}
- /* skip internal command slot. */
- if (unlikely(tag == MTIP_TAG_INTERNAL))
- continue;
+ command = &port->commands[tag];
+ /* make internal callback */
+ if (likely(command->comp_func)) {
+ command->comp_func(
+ port,
+ tag,
+ command->comp_data,
+ 0);
+ } else {
+ dev_warn(&dd->pdev->dev,
+ "Null completion "
+ "for tag %d",
+ tag);
- command = &port->commands[tag];
- /* make internal callback */
- if (likely(command->comp_func)) {
- command->comp_func(
- port,
- tag,
- command->comp_data,
- 0);
- } else {
- dev_warn(&dd->pdev->dev,
- "Null completion "
- "for tag %d",
- tag);
-
- if (mtip_check_surprise_removal(
- dd->pdev)) {
- mtip_command_cleanup(dd);
- return;
- }
+ if (mtip_check_surprise_removal(
+ dd->pdev)) {
+ mtip_command_cleanup(dd);
+ return;
}
}
}
+ completed >>= 1;
}
+
+ /* If last, re-enable interrupts */
+ if (atomic_dec_return(&dd->irq_workers_active) == 0)
+ writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);
}
/*
@@ -1072,6 +1075,8 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
struct mtip_port *port = dd->port;
u32 hba_stat, port_stat;
int rv = IRQ_NONE;
+ int do_irq_enable = 1, i, workers;
+ struct mtip_work *twork;
hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
if (hba_stat) {
@@ -1082,8 +1087,42 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
writel(port_stat, port->mmio + PORT_IRQ_STAT);
/* Demux port status */
- if (likely(port_stat & PORT_IRQ_SDB_FIS))
- mtip_process_sdbf(dd);
+ if (likely(port_stat & PORT_IRQ_SDB_FIS)) {
+ do_irq_enable = 0;
+ WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0);
+
+ /* Snapshot completions for every slot group; group zero is processed locally below, the others are queued */
+ for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS;
+ i++) {
+ twork = &dd->work[i];
+ twork->completed = readl(port->completed[i]);
+ if (twork->completed)
+ workers++;
+ }
+
+ atomic_set(&dd->irq_workers_active, workers);
+ if (workers) {
+ for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) {
+ twork = &dd->work[i];
+ if (twork->completed)
+ queue_work_on(
+ twork->cpu_binding,
+ dd->isr_workq,
+ &twork->work);
+ }
+
+ if (likely(dd->work[0].completed))
+ mtip_workq_sdbfx(port, 0,
+ dd->work[0].completed);
+
+ } else {
+ /*
+ * Chip quirk: SDB interrupt but nothing
+ * to complete
+ */
+ do_irq_enable = 1;
+ }
+ }
if (unlikely(port_stat & PORT_IRQ_ERR)) {
if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
@@ -1103,21 +1142,13 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
}
/* acknowledge interrupt */
- writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+ if (unlikely(do_irq_enable))
+ writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
return rv;
}
/*
- * Wrapper for mtip_handle_irq
- * (ignores return code)
- */
-static void mtip_tasklet(unsigned long data)
-{
- mtip_handle_irq((struct driver_data *) data);
-}
-
-/*
* HBA interrupt subroutine.
*
* @irq IRQ number.
@@ -1130,8 +1161,8 @@ static void mtip_tasklet(unsigned long data)
static irqreturn_t mtip_irq_handler(int irq, void *instance)
{
struct driver_data *dd = instance;
- tasklet_schedule(&dd->tasklet);
- return IRQ_HANDLED;
+
+ return mtip_handle_irq(dd);
}
static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
@@ -1489,6 +1520,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
}
#endif
+ /* Demux ID.DRAT & ID.RZAT to determine trim support */
+ if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
+ port->dd->trim_supp = true;
+ else
+ port->dd->trim_supp = false;
+
/* Set the identify buffer as valid. */
port->identify_valid = 1;
@@ -1676,6 +1713,81 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
}
/*
+ * Trim unused sectors
+ *
+ * @dd pointer to driver_data structure
+ * @lba starting lba
+ * @len # of 512b sectors to trim
+ *
+ * return value (0 on success, otherwise one of the following)
+ * -ENOMEM Out of dma memory
+ * -EINVAL Invalid parameters passed in, trim not supported
+ * -EIO Error submitting trim request to hw
+ */
+static int mtip_send_trim(struct driver_data *dd, unsigned int lba, unsigned int len)
+{
+ int i, rv = 0;
+ u64 tlba, tlen, sect_left;
+ struct mtip_trim_entry *buf;
+ dma_addr_t dma_addr;
+ struct host_to_dev_fis fis;
+
+ if (!len || dd->trim_supp == false)
+ return -EINVAL;
+
+ /* Trim request too big */
+ WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
+
+ /* Trim request not aligned on 4k boundary */
+ WARN_ON(len % 8 != 0);
+
+ /* Warn if vu_trim structure is too big */
+ WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
+
+ /* Allocate a DMA buffer for the trim structure */
+ buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
+ GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memset(buf, 0, ATA_SECT_SIZE);
+
+ for (i = 0, sect_left = len, tlba = lba;
+ i < MTIP_MAX_TRIM_ENTRIES && sect_left;
+ i++) {
+ tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
+ MTIP_MAX_TRIM_ENTRY_LEN :
+ sect_left);
+ buf[i].lba = __force_bit2int cpu_to_le32(tlba);
+ buf[i].range = __force_bit2int cpu_to_le16(tlen);
+ tlba += tlen;
+ sect_left -= tlen;
+ }
+ WARN_ON(sect_left != 0);
+
+ /* Build the fis */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = 0xfb;
+ fis.features = 0x60;
+ fis.sect_count = 1;
+ fis.device = ATA_DEVICE_OBS;
+
+ if (mtip_exec_internal_command(dd->port,
+ &fis,
+ 5,
+ dma_addr,
+ ATA_SECT_SIZE,
+ 0,
+ GFP_KERNEL,
+ MTIP_TRIM_TIMEOUT_MS) < 0)
+ rv = -EIO;
+
+ dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
+ return rv;
+}
+
+/*
* Get the drive capacity.
*
* @dd Pointer to the device data structure.
@@ -3005,20 +3117,24 @@ static int mtip_hw_init(struct driver_data *dd)
hba_setup(dd);
- tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd);
-
- dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL);
+ dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL,
+ dd->numa_node);
if (!dd->port) {
dev_err(&dd->pdev->dev,
"Memory allocation: port structure\n");
return -ENOMEM;
}
+ /* Continue workqueue setup */
+ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+ dd->work[i].port = dd->port;
+
/* Counting semaphore to track command slot usage */
sema_init(&dd->port->cmd_slot, num_command_slots - 1);
/* Spinlock to prevent concurrent issue */
- spin_lock_init(&dd->port->cmd_issue_lock);
+ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+ spin_lock_init(&dd->port->cmd_issue_lock[i]);
/* Set the port mmio base address. */
dd->port->mmio = dd->mmio + PORT_OFFSET;
@@ -3165,6 +3281,7 @@ static int mtip_hw_init(struct driver_data *dd)
"Unable to allocate IRQ %d\n", dd->pdev->irq);
goto out2;
}
+ irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding));
/* Enable interrupts on the HBA. */
writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
@@ -3241,7 +3358,8 @@ out3:
writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
dd->mmio + HOST_CTL);
- /*Release the IRQ. */
+ /* Release the IRQ. */
+ irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
out2:
@@ -3291,11 +3409,9 @@ static int mtip_hw_exit(struct driver_data *dd)
del_timer_sync(&dd->port->cmd_timer);
/* Release the IRQ. */
+ irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
- /* Stop the bottom half tasklet. */
- tasklet_kill(&dd->tasklet);
-
/* Free the command/command header memory. */
dmam_free_coherent(&dd->pdev->dev,
HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
@@ -3641,6 +3757,12 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
}
}
+ if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+ bio_endio(bio, mtip_send_trim(dd, bio->bi_sector,
+ bio_sectors(bio)));
+ return;
+ }
+
if (unlikely(!bio_has_data(bio))) {
blk_queue_flush(queue, 0);
bio_endio(bio, 0);
@@ -3711,7 +3833,7 @@ static int mtip_block_initialize(struct driver_data *dd)
goto protocol_init_error;
}
- dd->disk = alloc_disk(MTIP_MAX_MINORS);
+ dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node);
if (dd->disk == NULL) {
dev_err(&dd->pdev->dev,
"Unable to allocate gendisk structure\n");
@@ -3755,7 +3877,7 @@ static int mtip_block_initialize(struct driver_data *dd)
skip_create_disk:
/* Allocate the request queue. */
- dd->queue = blk_alloc_queue(GFP_KERNEL);
+ dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node);
if (dd->queue == NULL) {
dev_err(&dd->pdev->dev,
"Unable to allocate request queue\n");
@@ -3783,6 +3905,15 @@ skip_create_disk:
*/
blk_queue_flush(dd->queue, 0);
+ /* Signal trim support */
+ if (dd->trim_supp == true) {
+ set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
+ dd->queue->limits.discard_granularity = 4096;
+ blk_queue_max_discard_sectors(dd->queue,
+ MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
+ dd->queue->limits.discard_zeroes_data = 0;
+ }
+
/* Set the capacity of the device in 512 byte sectors. */
if (!(mtip_hw_get_capacity(dd, &capacity))) {
dev_warn(&dd->pdev->dev,
@@ -3813,9 +3944,8 @@ skip_create_disk:
start_service_thread:
sprintf(thd_name, "mtip_svc_thd_%02d", index);
-
- dd->mtip_svc_handler = kthread_run(mtip_service_thread,
- dd, thd_name);
+ dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
+ dd, dd->numa_node, thd_name);
if (IS_ERR(dd->mtip_svc_handler)) {
dev_err(&dd->pdev->dev, "service thread failed to start\n");
@@ -3823,7 +3953,7 @@ start_service_thread:
rv = -EFAULT;
goto kthread_run_error;
}
-
+ wake_up_process(dd->mtip_svc_handler);
if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
rv = wait_for_rebuild;
@@ -3963,6 +4093,56 @@ static int mtip_block_resume(struct driver_data *dd)
return 0;
}
+static void drop_cpu(int cpu)
+{
+ cpu_use[cpu]--;
+}
+
+/*
+ * Pick the CPU on a NUMA node with the lowest cpu_use[] count and charge
+ * one more use to it.
+ *
+ * @node NUMA node whose CPUs are considered.
+ *
+ * return value
+ *	The selected CPU number.
+ */
+static int get_least_used_cpu_on_node(int node)
+{
+	int cpu, least_used_cpu, least_cnt;
+	const struct cpumask *node_mask;
+
+	node_mask = cpumask_of_node(node);
+	/*
+	 * A node may have no CPUs at all; cpumask_first() on an empty mask
+	 * does not return a valid CPU, so cpu_use[] must not be indexed with
+	 * it. Fall back to the online CPUs in that case.
+	 */
+	if (cpumask_empty(node_mask))
+		node_mask = cpu_online_mask;
+
+	least_used_cpu = cpumask_first(node_mask);
+	least_cnt = cpu_use[least_used_cpu];
+
+	for_each_cpu(cpu, node_mask) {
+		if (cpu_use[cpu] < least_cnt) {
+			least_used_cpu = cpu;
+			least_cnt = cpu_use[cpu];
+		}
+	}
+	cpu_use[least_used_cpu]++;
+	return least_used_cpu;
+}
+
+/* Helper for selecting a node in round robin mode */
+static inline int mtip_get_next_rr_node(void)
+{
+ static int next_node = -1;
+
+ if (next_node == -1) {
+ next_node = first_online_node;
+ return next_node;
+ }
+
+ next_node = next_online_node(next_node);
+ if (next_node == MAX_NUMNODES)
+ next_node = first_online_node;
+ return next_node;
+}
+
+static DEFINE_HANDLER(0);
+static DEFINE_HANDLER(1);
+static DEFINE_HANDLER(2);
+static DEFINE_HANDLER(3);
+static DEFINE_HANDLER(4);
+static DEFINE_HANDLER(5);
+static DEFINE_HANDLER(6);
+static DEFINE_HANDLER(7);
+
/*
* Called for each supported PCI device detected.
*
@@ -3977,9 +4157,25 @@ static int mtip_pci_probe(struct pci_dev *pdev,
{
int rv = 0;
struct driver_data *dd = NULL;
+ char cpu_list[256];
+ const struct cpumask *node_mask;
+ int cpu, i = 0, j = 0;
+ int my_node = NUMA_NO_NODE;
/* Allocate memory for this devices private data. */
- dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL);
+ my_node = pcibus_to_node(pdev->bus);
+ if (my_node != NUMA_NO_NODE) {
+ if (!node_online(my_node))
+ my_node = mtip_get_next_rr_node();
+ } else {
+ dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n");
+ my_node = mtip_get_next_rr_node();
+ }
+ dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n",
+ my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev),
+ cpu_to_node(smp_processor_id()), smp_processor_id());
+
+ dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node);
if (dd == NULL) {
dev_err(&pdev->dev,
"Unable to allocate memory for driver data\n");
@@ -4016,19 +4212,82 @@ static int mtip_pci_probe(struct pci_dev *pdev,
}
}
- pci_set_master(pdev);
+ /* Copy the info we may need later into the private data structure. */
+ dd->major = mtip_major;
+ dd->instance = instance;
+ dd->pdev = pdev;
+ dd->numa_node = my_node;
+ memset(dd->workq_name, 0, 32);
+ snprintf(dd->workq_name, 31, "mtipq%d", dd->instance);
+
+	dd->isr_workq = create_workqueue(dd->workq_name);
+	if (!dd->isr_workq) {
+		dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
+		/*
+		 * NOTE(review): nothing visible on this path sets rv to an
+		 * error, so set it explicitly; otherwise probe could report
+		 * success after jumping to the cleanup label.
+		 */
+		rv = -ENOMEM;
+		goto block_initialize_err;
+	}
+
+ memset(cpu_list, 0, sizeof(cpu_list));
+
+ node_mask = cpumask_of_node(dd->numa_node);
+ if (!cpumask_empty(node_mask)) {
+ for_each_cpu(cpu, node_mask)
+ {
+ snprintf(&cpu_list[j], 256 - j, "%d ", cpu);
+ j = strlen(cpu_list);
+ }
+
+ dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n",
+ dd->numa_node,
+ topology_physical_package_id(cpumask_first(node_mask)),
+ nr_cpus_node(dd->numa_node),
+ cpu_list);
+ } else
+ dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n");
+
+ dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node);
+ dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n",
+ cpu_to_node(dd->isr_binding), dd->isr_binding);
+
+ /* first worker context always runs in ISR */
+ dd->work[0].cpu_binding = dd->isr_binding;
+ dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+ dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+ dd->work[3].cpu_binding = dd->work[0].cpu_binding;
+ dd->work[4].cpu_binding = dd->work[1].cpu_binding;
+ dd->work[5].cpu_binding = dd->work[2].cpu_binding;
+ dd->work[6].cpu_binding = dd->work[2].cpu_binding;
+ dd->work[7].cpu_binding = dd->work[1].cpu_binding;
+
+ /* Log the bindings */
+ for_each_present_cpu(cpu) {
+ memset(cpu_list, 0, sizeof(cpu_list));
+ for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) {
+ if (dd->work[i].cpu_binding == cpu) {
+ snprintf(&cpu_list[j], 256 - j, "%d ", i);
+ j = strlen(cpu_list);
+ }
+ }
+ if (j)
+ dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list);
+ }
+
+ INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0);
+ INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1);
+ INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2);
+ INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3);
+ INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4);
+ INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5);
+ INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6);
+ INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7);
+
+ pci_set_master(pdev);
if (pci_enable_msi(pdev)) {
dev_warn(&pdev->dev,
"Unable to enable MSI interrupt.\n");
goto block_initialize_err;
}
- /* Copy the info we may need later into the private data structure. */
- dd->major = mtip_major;
- dd->instance = instance;
- dd->pdev = pdev;
-
/* Initialize the block layer. */
rv = mtip_block_initialize(dd);
if (rv < 0) {
@@ -4048,7 +4307,13 @@ static int mtip_pci_probe(struct pci_dev *pdev,
block_initialize_err:
pci_disable_msi(pdev);
-
+ if (dd->isr_workq) {
+ flush_workqueue(dd->isr_workq);
+ destroy_workqueue(dd->isr_workq);
+ drop_cpu(dd->work[0].cpu_binding);
+ drop_cpu(dd->work[1].cpu_binding);
+ drop_cpu(dd->work[2].cpu_binding);
+ }
setmask_err:
pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
@@ -4089,6 +4354,14 @@ static void mtip_pci_remove(struct pci_dev *pdev)
/* Clean up the block layer. */
mtip_block_remove(dd);
+ if (dd->isr_workq) {
+ flush_workqueue(dd->isr_workq);
+ destroy_workqueue(dd->isr_workq);
+ drop_cpu(dd->work[0].cpu_binding);
+ drop_cpu(dd->work[1].cpu_binding);
+ drop_cpu(dd->work[2].cpu_binding);
+ }
+
pci_disable_msi(pdev);
kfree(dd);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index b1742640556a..3bffff5f670c 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -164,6 +164,35 @@ struct smart_attr {
u8 res[3];
} __packed;
+struct mtip_work {
+ struct work_struct work;
+ void *port;
+ int cpu_binding;
+ u32 completed;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Generate the workqueue handler for one slot group. The handler recovers
+ * the enclosing struct mtip_work from its embedded work_struct (via
+ * container_of, so it does not depend on "work" being the first member)
+ * and passes the recorded port and completion bits to mtip_workq_sdbfx().
+ */
+#define DEFINE_HANDLER(group)                                  \
+	void mtip_workq_sdbf##group(struct work_struct *work)       \
+	{                                                      \
+		struct mtip_work *w =                               \
+			container_of(work, struct mtip_work, work);     \
+		mtip_workq_sdbfx(w->port, group, w->completed);     \
+	}
+
+#define MTIP_TRIM_TIMEOUT_MS 240000
+#define MTIP_MAX_TRIM_ENTRIES 8
+#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8
+
+struct mtip_trim_entry {
+ u32 lba; /* starting lba of region */
+ u16 rsvd; /* unused */
+ u16 range; /* # of 512b blocks to trim */
+} __packed;
+
+struct mtip_trim {
+ /* Array of regions to trim */
+ struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES];
+} __packed;
+
/* Register Frame Information Structure (FIS), host to device. */
struct host_to_dev_fis {
/*
@@ -424,7 +453,7 @@ struct mtip_port {
*/
struct semaphore cmd_slot;
/* Spinlock for working around command-issue bug. */
- spinlock_t cmd_issue_lock;
+ spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
};
/*
@@ -447,9 +476,6 @@ struct driver_data {
struct mtip_port *port; /* Pointer to the port data structure. */
- /* Tasklet used to process the bottom half of the ISR. */
- struct tasklet_struct tasklet;
-
unsigned product_type; /* magic value declaring the product type */
unsigned slot_groups; /* number of slot groups the product supports */
@@ -461,6 +487,20 @@ struct driver_data {
struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
struct dentry *dfs_node;
+
+ bool trim_supp; /* flag indicating trim support */
+
+ int numa_node; /* NUMA support */
+
+ char workq_name[32];
+
+ struct workqueue_struct *isr_workq;
+
+ struct mtip_work work[MTIP_MAX_SLOT_GROUPS];
+
+ atomic_t irq_workers_active;
+
+ int isr_binding;
};
#endif
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 043ddcca4abf..7fecc784be01 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -98,6 +98,7 @@ static const char *nbdcmd_to_ascii(int cmd)
case NBD_CMD_READ: return "read";
case NBD_CMD_WRITE: return "write";
case NBD_CMD_DISC: return "disconnect";
+ case NBD_CMD_FLUSH: return "flush";
case NBD_CMD_TRIM: return "trim/discard";
}
return "invalid";
@@ -244,8 +245,15 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
request.magic = htonl(NBD_REQUEST_MAGIC);
request.type = htonl(nbd_cmd(req));
- request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
- request.len = htonl(size);
+
+ if (nbd_cmd(req) == NBD_CMD_FLUSH) {
+ /* Other values are reserved for FLUSH requests. */
+ request.from = 0;
+ request.len = 0;
+ } else {
+ request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
+ request.len = htonl(size);
+ }
memcpy(request.handle, &req, sizeof(req));
dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
@@ -482,6 +490,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
}
}
+ if (req->cmd_flags & REQ_FLUSH) {
+ BUG_ON(unlikely(blk_rq_sectors(req)));
+ nbd_cmd(req) = NBD_CMD_FLUSH;
+ }
+
req->errors = 0;
mutex_lock(&nbd->tx_lock);
@@ -551,6 +564,7 @@ static int nbd_thread(void *data)
*/
static void do_nbd_request(struct request_queue *q)
+ __releases(q->queue_lock) __acquires(q->queue_lock)
{
struct request *req;
@@ -595,12 +609,20 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
struct request sreq;
dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
+ if (!nbd->sock)
+ return -EINVAL;
+ mutex_unlock(&nbd->tx_lock);
+ fsync_bdev(bdev);
+ mutex_lock(&nbd->tx_lock);
blk_rq_init(NULL, &sreq);
sreq.cmd_type = REQ_TYPE_SPECIAL;
nbd_cmd(&sreq) = NBD_CMD_DISC;
+
+ /* Check again after getting mutex back. */
if (!nbd->sock)
return -EINVAL;
+
nbd_send_req(nbd, &sreq);
return 0;
}
@@ -614,6 +636,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
nbd_clear_que(nbd);
BUG_ON(!list_empty(&nbd->queue_head));
BUG_ON(!list_empty(&nbd->waiting_queue));
+ kill_bdev(bdev);
if (file)
fput(file);
return 0;
@@ -625,7 +648,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return -EBUSY;
file = fget(arg);
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
if (S_ISSOCK(inode->i_mode)) {
nbd->file = file;
nbd->sock = SOCKET_I(inode);
@@ -681,9 +704,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
mutex_unlock(&nbd->tx_lock);
+ if (nbd->flags & NBD_FLAG_READ_ONLY)
+ set_device_ro(bdev, true);
if (nbd->flags & NBD_FLAG_SEND_TRIM)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
nbd->disk->queue);
+ if (nbd->flags & NBD_FLAG_SEND_FLUSH)
+ blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
+ else
+ blk_queue_flush(nbd->disk->queue, 0);
thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
if (IS_ERR(thread)) {
@@ -702,9 +731,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
nbd->file = NULL;
nbd_clear_que(nbd);
dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
+ kill_bdev(bdev);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
+ set_device_ro(bdev, false);
if (file)
fput(file);
+ nbd->flags = 0;
nbd->bytesize = 0;
bdev->bd_inode->i_size = 0;
set_capacity(nbd->disk, 0);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89576a0b3f2e..6c81a4c040b9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,9 +52,12 @@
#define SECTOR_SHIFT 9
#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
-/* It might be useful to have this defined elsewhere too */
+/* It might be useful to have these defined elsewhere */
-#define U64_MAX ((u64) (~0ULL))
+#define U8_MAX ((u8) (~0U))
+#define U16_MAX ((u16) (~0U))
+#define U32_MAX ((u32) (~0U))
+#define U64_MAX ((u64) (~0ULL))
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -66,7 +69,6 @@
(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
-#define RBD_MAX_OPT_LEN 1024
#define RBD_SNAP_HEAD_NAME "-"
@@ -93,8 +95,6 @@
#define DEV_NAME_LEN 32
#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
-#define RBD_READ_ONLY_DEFAULT false
-
/*
* block device image metadata (in-memory version)
*/
@@ -119,16 +119,33 @@ struct rbd_image_header {
* An rbd image specification.
*
* The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
- * identify an image.
+ * identify an image. Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name. For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up. For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image. This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
*/
struct rbd_spec {
u64 pool_id;
char *pool_name;
char *image_id;
- size_t image_id_len;
char *image_name;
- size_t image_name_len;
u64 snap_id;
char *snap_name;
@@ -136,10 +153,6 @@ struct rbd_spec {
struct kref kref;
};
-struct rbd_options {
- bool read_only;
-};
-
/*
* an instance of the client. multiple devices may share an rbd client.
*/
@@ -149,37 +162,76 @@ struct rbd_client {
struct list_head node;
};
-/*
- * a request completion status
- */
-struct rbd_req_status {
- int done;
- int rc;
- u64 bytes;
+struct rbd_img_request;
+typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
+
+#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
+
+struct rbd_obj_request;
+typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
+
+enum obj_request_type {
+ OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
-/*
- * a collection of requests
- */
-struct rbd_req_coll {
- int total;
- int num_done;
+struct rbd_obj_request {
+ const char *object_name; /* NUL-terminated copy stored right after this struct */
+ u64 offset; /* object start byte */
+ u64 length; /* bytes from offset */
+
+ struct rbd_img_request *img_request; /* owning image request; NULL when standalone */
+ struct list_head links; /* img_request->obj_requests */
+ u32 which; /* position in the image request's list; BAD_WHICH if not on one */
+
+ enum obj_request_type type; /* selects which union member below is in use */
+ union {
+ struct bio *bio_list; /* OBJ_REQUEST_BIO */
+ struct { /* OBJ_REQUEST_PAGES */
+ struct page **pages;
+ u32 page_count;
+ };
+ };
+
+ struct ceph_osd_request *osd_req; /* put when this object request is destroyed */
+
+ u64 xferred; /* bytes transferred */
+ u64 version; /* taken from the osd request's r_reassert_version */
+ int result; /* set from the osd request's r_result when it is negative */
+ atomic_t done; /* non-zero once a completion callback has marked it done */
+
+ rbd_obj_callback_t callback; /* if NULL, "completion" is signalled instead */
+ struct completion completion;
+
struct kref kref;
- struct rbd_req_status status[0];
};
-/*
- * a single io request
- */
-struct rbd_request {
- struct request *rq; /* blk layer request */
- struct bio *bio; /* cloned bio */
- struct page **pages; /* list of used pages */
- u64 len;
- int coll_index;
- struct rbd_req_coll *coll;
+struct rbd_img_request {
+ struct request *rq;
+ struct rbd_device *rbd_dev;
+ u64 offset; /* starting image byte offset */
+ u64 length; /* byte count from offset */
+ bool write_request; /* false for read */
+ union {
+ struct ceph_snap_context *snapc; /* for writes */
+ u64 snap_id; /* for reads */
+ };
+ spinlock_t completion_lock;/* protects next_completion */
+ u32 next_completion;
+ rbd_img_callback_t callback;
+
+ u32 obj_request_count;
+ struct list_head obj_requests; /* rbd_obj_request structs */
+
+ struct kref kref;
};
+#define for_each_obj_request(ireq, oreq) \
+ list_for_each_entry(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_from(ireq, oreq) \
+ list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_safe(ireq, oreq, n) \
+ list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
+
struct rbd_snap {
struct device dev;
const char *name;
@@ -209,16 +261,18 @@ struct rbd_device {
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
- spinlock_t lock; /* queue lock */
+ spinlock_t lock; /* queue, flags, open_count */
struct rbd_image_header header;
- bool exists;
+ unsigned long flags; /* possibly lock protected */
struct rbd_spec *spec;
char *header_name;
+ struct ceph_file_layout layout;
+
struct ceph_osd_event *watch_event;
- struct ceph_osd_request *watch_request;
+ struct rbd_obj_request *watch_request;
struct rbd_spec *parent_spec;
u64 parent_overlap;
@@ -235,7 +289,19 @@ struct rbd_device {
/* sysfs related */
struct device dev;
- unsigned long open_count;
+ unsigned long open_count; /* protected by lock */
+};
+
+/*
+ * Flag bits for rbd_dev->flags. If atomicity is required,
+ * rbd_dev->lock is used to protect access.
+ *
+ * Currently, only the "removing" flag (which is coupled with the
+ * "open_count" field) requires atomic access.
+ */
+enum rbd_dev_flags {
+ RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
+ RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
};
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
@@ -277,6 +343,33 @@ static struct device rbd_root_dev = {
.release = rbd_root_dev_release,
};
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (!rbd_dev)
+ printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+ else if (rbd_dev->disk)
+ printk(KERN_WARNING "%s: %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+ else if (rbd_dev->spec && rbd_dev->spec->image_name)
+ printk(KERN_WARNING "%s: image %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+ else if (rbd_dev->spec && rbd_dev->spec->image_id)
+ printk(KERN_WARNING "%s: id %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+ else /* punt */
+ printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+ RBD_DRV_NAME, rbd_dev, &vaf);
+ va_end(args);
+}
+
#ifdef RBD_DEBUG
#define rbd_assert(expr) \
if (unlikely(!(expr))) { \
@@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ bool removing = false;
if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
return -EROFS;
+ spin_lock_irq(&rbd_dev->lock);
+ if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
+ removing = true;
+ else
+ rbd_dev->open_count++;
+ spin_unlock_irq(&rbd_dev->lock);
+ if (removing)
+ return -ENOENT;
+
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
(void) get_device(&rbd_dev->dev);
set_device_ro(bdev, rbd_dev->mapping.read_only);
- rbd_dev->open_count++;
mutex_unlock(&ctl_mutex);
return 0;
@@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
struct rbd_device *rbd_dev = disk->private_data;
+ unsigned long open_count_before;
+
+ spin_lock_irq(&rbd_dev->lock);
+ open_count_before = rbd_dev->open_count--;
+ spin_unlock_irq(&rbd_dev->lock);
+ rbd_assert(open_count_before > 0);
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- rbd_assert(rbd_dev->open_count > 0);
- rbd_dev->open_count--;
put_device(&rbd_dev->dev);
mutex_unlock(&ctl_mutex);
@@ -337,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
struct rbd_client *rbdc;
int ret = -ENOMEM;
- dout("rbd_client_create\n");
+ dout("%s:\n", __func__);
rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
if (!rbdc)
goto out_opt;
@@ -361,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
spin_unlock(&rbd_client_list_lock);
mutex_unlock(&ctl_mutex);
+ dout("%s: rbdc %p\n", __func__, rbdc);
- dout("rbd_client_create created %p\n", rbdc);
return rbdc;
out_err:
@@ -373,6 +479,8 @@ out_mutex:
out_opt:
if (ceph_opts)
ceph_destroy_options(ceph_opts);
+ dout("%s: error %d\n", __func__, ret);
+
return ERR_PTR(ret);
}
@@ -426,6 +534,12 @@ static match_table_t rbd_opts_tokens = {
{-1, NULL}
};
+struct rbd_options {
+ bool read_only;
+};
+
+#define RBD_READ_ONLY_DEFAULT false
+
static int parse_rbd_opts_token(char *c, void *private)
{
struct rbd_options *rbd_opts = private;
@@ -493,7 +607,7 @@ static void rbd_client_release(struct kref *kref)
{
struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
- dout("rbd_release_client %p\n", rbdc);
+ dout("%s: rbdc %p\n", __func__, rbdc);
spin_lock(&rbd_client_list_lock);
list_del(&rbdc->node);
spin_unlock(&rbd_client_list_lock);
@@ -512,18 +626,6 @@ static void rbd_put_client(struct rbd_client *rbdc)
kref_put(&rbdc->kref, rbd_client_release);
}
-/*
- * Destroy requests collection
- */
-static void rbd_coll_release(struct kref *kref)
-{
- struct rbd_req_coll *coll =
- container_of(kref, struct rbd_req_coll, kref);
-
- dout("rbd_coll_release %p\n", coll);
- kfree(coll);
-}
-
static bool rbd_image_format_valid(u32 image_format)
{
return image_format == 1 || image_format == 2;
@@ -707,7 +809,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
goto done;
rbd_dev->mapping.read_only = true;
}
- rbd_dev->exists = true;
+ set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+
done:
return ret;
}
@@ -724,7 +827,7 @@ static void rbd_header_free(struct rbd_image_header *header)
header->snapc = NULL;
}
-static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
+static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
char *name;
u64 segment;
@@ -767,23 +870,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
return length;
}
-static int rbd_get_num_segments(struct rbd_image_header *header,
- u64 ofs, u64 len)
-{
- u64 start_seg;
- u64 end_seg;
-
- if (!len)
- return 0;
- if (len - 1 > U64_MAX - ofs)
- return -ERANGE;
-
- start_seg = ofs >> header->obj_order;
- end_seg = (ofs + len - 1) >> header->obj_order;
-
- return end_seg - start_seg + 1;
-}
-
/*
* returns the size of an object in the image
*/
@@ -949,8 +1035,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
unsigned int bi_size;
struct bio *bio;
- if (!bi)
+ if (!bi) {
+ rbd_warn(NULL, "bio_chain exhausted with %u left", len);
goto out_err; /* EINVAL; ran out of bio's */
+ }
bi_size = min_t(unsigned int, bi->bi_size - off, len);
bio = bio_clone_range(bi, off, bi_size, gfpmask);
if (!bio)
@@ -976,399 +1064,721 @@ out_err:
return NULL;
}
-/*
- * helpers for osd request op vectors.
- */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
- int opcode, u32 payload_len)
+static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
- struct ceph_osd_req_op *ops;
+ dout("%s: obj %p (was %d)\n", __func__, obj_request,
+ atomic_read(&obj_request->kref.refcount));
+ kref_get(&obj_request->kref);
+}
+
+static void rbd_obj_request_destroy(struct kref *kref);
+static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request != NULL);
+ dout("%s: obj %p (was %d)\n", __func__, obj_request,
+ atomic_read(&obj_request->kref.refcount));
+ kref_put(&obj_request->kref, rbd_obj_request_destroy);
+}
+
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+ dout("%s: img %p (was %d)\n", __func__, img_request,
+ atomic_read(&img_request->kref.refcount));
+ kref_get(&img_request->kref);
+}
+
+static void rbd_img_request_destroy(struct kref *kref);
+static void rbd_img_request_put(struct rbd_img_request *img_request)
+{
+ rbd_assert(img_request != NULL);
+ dout("%s: img %p (was %d)\n", __func__, img_request,
+ atomic_read(&img_request->kref.refcount));
+ kref_put(&img_request->kref, rbd_img_request_destroy);
+}
+
+static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
+ struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request->img_request == NULL);
+
+ rbd_obj_request_get(obj_request);
+ obj_request->img_request = img_request;
+ obj_request->which = img_request->obj_request_count;
+ rbd_assert(obj_request->which != BAD_WHICH);
+ img_request->obj_request_count++;
+ list_add_tail(&obj_request->links, &img_request->obj_requests);
+ dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+ obj_request->which);
+}
- ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
- if (!ops)
+static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
+ struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request->which != BAD_WHICH);
+
+ dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+ obj_request->which);
+ list_del(&obj_request->links);
+ rbd_assert(img_request->obj_request_count > 0);
+ img_request->obj_request_count--;
+ rbd_assert(obj_request->which == img_request->obj_request_count);
+ obj_request->which = BAD_WHICH;
+ rbd_assert(obj_request->img_request == img_request);
+ obj_request->img_request = NULL;
+ obj_request->callback = NULL;
+ rbd_obj_request_put(obj_request);
+}
+
+static bool obj_request_type_valid(enum obj_request_type type)
+{
+ switch (type) {
+ case OBJ_REQUEST_NODATA:
+ case OBJ_REQUEST_BIO:
+ case OBJ_REQUEST_PAGES:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
+{
+ struct ceph_osd_req_op *op; /* zero-filled op returned to the caller, or NULL */
+ va_list args;
+ size_t size;
+
+ op = kzalloc(sizeof (*op), GFP_NOIO);
+ if (!op)
return NULL;
+ op->op = opcode;
+ va_start(args, opcode); /* remaining arguments depend on the opcode, see below */
+ switch (opcode) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_WRITE:
+ /* rbd_osd_req_op_create(READ, offset, length) */
+ /* rbd_osd_req_op_create(WRITE, offset, length) */
+ op->extent.offset = va_arg(args, u64);
+ op->extent.length = va_arg(args, u64);
+ if (opcode == CEPH_OSD_OP_WRITE)
+ op->payload_len = op->extent.length;
+ break;
+ case CEPH_OSD_OP_STAT:
+ break;
+ case CEPH_OSD_OP_CALL:
+ /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
+ op->cls.class_name = va_arg(args, char *);
+ size = strlen(op->cls.class_name);
+ rbd_assert(size <= (size_t) U8_MAX);
+ op->cls.class_len = size;
+ op->payload_len = size;
+
+ op->cls.method_name = va_arg(args, char *);
+ size = strlen(op->cls.method_name);
+ rbd_assert(size <= (size_t) U8_MAX);
+ op->cls.method_len = size;
+ op->payload_len += size;
+
+ op->cls.argc = 0;
+ op->cls.indata = va_arg(args, void *);
+ size = va_arg(args, size_t);
+ rbd_assert(size <= (size_t) U32_MAX);
+ op->cls.indata_len = (u32) size;
+ op->payload_len += size;
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_WATCH:
+ /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
+ /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
+ op->watch.cookie = va_arg(args, u64);
+ op->watch.ver = va_arg(args, u64);
+ op->watch.ver = cpu_to_le64(op->watch.ver);
+ if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
+ op->watch.flag = (u8) 1;
+ break;
+ default:
+ rbd_warn(NULL, "unsupported opcode %hu", opcode); /* rbd_warn() appends '\n' itself */
+ kfree(op);
+ op = NULL;
+ break;
+ }
+ va_end(args);
- ops[0].op = opcode;
+ return op;
+}
- /*
- * op extent offset and length will be set later on
- * in calc_raw_layout()
- */
- ops[0].payload_len = payload_len;
+static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
+{
+ kfree(op);
+}
+
+static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
+ struct rbd_obj_request *obj_request)
+{
+ dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
- return ops;
+ return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
-static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
- kfree(ops);
+ dout("%s: img %p\n", __func__, img_request);
+ if (img_request->callback)
+ img_request->callback(img_request);
+ else
+ rbd_img_request_put(img_request);
}
-static void rbd_coll_end_req_index(struct request *rq,
- struct rbd_req_coll *coll,
- int index,
- int ret, u64 len)
+/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
+
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
- struct request_queue *q;
- int min, max, i;
+ dout("%s: obj %p\n", __func__, obj_request);
- dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
- coll, index, ret, (unsigned long long) len);
+ return wait_for_completion_interruptible(&obj_request->completion);
+}
- if (!rq)
- return;
+static void obj_request_done_init(struct rbd_obj_request *obj_request)
+{
+ atomic_set(&obj_request->done, 0);
+ smp_wmb();
+}
- if (!coll) {
- blk_end_request(rq, ret, len);
- return;
+static void obj_request_done_set(struct rbd_obj_request *obj_request)
+{
+ int done;
+
+ done = atomic_inc_return(&obj_request->done); /* mark done; yields the new count */
+ if (done > 1) { /* marked done more than once: report it */
+ struct rbd_img_request *img_request = obj_request->img_request;
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = img_request ? img_request->rbd_dev : NULL;
+ rbd_warn(rbd_dev, "obj_request %p was already done",
+ obj_request); /* no trailing '\n': rbd_warn() appends one */
}
+}
- q = rq->q;
-
- spin_lock_irq(q->queue_lock);
- coll->status[index].done = 1;
- coll->status[index].rc = ret;
- coll->status[index].bytes = len;
- max = min = coll->num_done;
- while (max < coll->total && coll->status[max].done)
- max++;
-
- for (i = min; i<max; i++) {
- __blk_end_request(rq, coll->status[i].rc,
- coll->status[i].bytes);
- coll->num_done++;
- kref_put(&coll->kref, rbd_coll_release);
+static bool obj_request_done_test(struct rbd_obj_request *obj_request)
+{
+ smp_mb();
+ return atomic_read(&obj_request->done) != 0;
+}
+
+static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p cb %p\n", __func__, obj_request,
+ obj_request->callback);
+ if (obj_request->callback)
+ obj_request->callback(obj_request);
+ else
+ complete_all(&obj_request->completion);
+}
+
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p\n", __func__, obj_request);
+ obj_request_done_set(obj_request);
+}
+
+static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
+ obj_request->result, obj_request->xferred, obj_request->length);
+ /*
+ * ENOENT means a hole in the object. We zero-fill the
+ * entire length of the request. A short read also implies
+ * zero-fill to the end of the request. Either way we
+ * update the xferred count to indicate the whole request
+ * was satisfied.
+ */
+ if (obj_request->result == -ENOENT) {
+ zero_bio_chain(obj_request->bio_list, 0);
+ obj_request->result = 0;
+ obj_request->xferred = obj_request->length;
+ } else if (obj_request->xferred < obj_request->length &&
+ !obj_request->result) {
+ zero_bio_chain(obj_request->bio_list, obj_request->xferred);
+ obj_request->xferred = obj_request->length;
}
- spin_unlock_irq(q->queue_lock);
+ obj_request_done_set(obj_request);
}
-static void rbd_coll_end_req(struct rbd_request *req,
- int ret, u64 len)
+static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
- rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
+ dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+ obj_request->result, obj_request->length);
+ /*
+ * There is no such thing as a successful short write.
+ * Our xferred value is the number of bytes transferred
+ * back. Set it to our originally-requested length.
+ */
+ obj_request->xferred = obj_request->length;
+ obj_request_done_set(obj_request);
}
/*
- * Send ceph osd request
+ * For a simple stat call there's nothing to do. We'll do more if
+ * this is part of a write sequence for a layered image.
*/
-static int rbd_do_request(struct request *rq,
- struct rbd_device *rbd_dev,
- struct ceph_snap_context *snapc,
- u64 snapid,
- const char *object_name, u64 ofs, u64 len,
- struct bio *bio,
- struct page **pages,
- int num_pages,
- int flags,
- struct ceph_osd_req_op *ops,
- struct rbd_req_coll *coll,
- int coll_index,
- void (*rbd_cb)(struct ceph_osd_request *req,
- struct ceph_msg *msg),
- struct ceph_osd_request **linger_req,
- u64 *ver)
-{
- struct ceph_osd_request *req;
- struct ceph_file_layout *layout;
- int ret;
- u64 bno;
- struct timespec mtime = CURRENT_TIME;
- struct rbd_request *req_data;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_client *osdc;
+static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p\n", __func__, obj_request);
+ obj_request_done_set(obj_request);
+}
- req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
- if (!req_data) {
- if (coll)
- rbd_coll_end_req_index(rq, coll, coll_index,
- -ENOMEM, len);
- return -ENOMEM;
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
+ struct ceph_msg *msg)
+{
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
+ u16 opcode;
+
+ dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+ rbd_assert(osd_req == obj_request->osd_req);
+ rbd_assert(!!obj_request->img_request ^
+ (obj_request->which == BAD_WHICH));
+
+ if (osd_req->r_result < 0)
+ obj_request->result = osd_req->r_result;
+ obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
+
+ WARN_ON(osd_req->r_num_ops != 1); /* For now */
+
+ /*
+ * We support a 64-bit length, but ultimately it has to be
+ * passed to blk_end_request(), which takes an unsigned int.
+ */
+ obj_request->xferred = osd_req->r_reply_op_len[0];
+ rbd_assert(obj_request->xferred < (u64) UINT_MAX);
+ opcode = osd_req->r_request_ops[0].op;
+ switch (opcode) { /* dispatch on the single op carried by the request */
+ case CEPH_OSD_OP_READ:
+ rbd_osd_read_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_WRITE:
+ rbd_osd_write_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_STAT:
+ rbd_osd_stat_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_CALL:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_WATCH:
+ rbd_osd_trivial_callback(obj_request);
+ break;
+ default: /* no trailing '\n' in the format: rbd_warn() appends one */
+ rbd_warn(NULL, "%s: unsupported op %hu",
+ obj_request->object_name, (unsigned short) opcode);
+ break;
}
- if (coll) {
- req_data->coll = coll;
- req_data->coll_index = coll_index;
+ if (obj_request_done_test(obj_request))
+ rbd_obj_request_complete(obj_request);
+}
+
+static struct ceph_osd_request *rbd_osd_req_create(
+ struct rbd_device *rbd_dev,
+ bool write_request,
+ struct rbd_obj_request *obj_request,
+ struct ceph_osd_req_op *op)
+{
+ struct rbd_img_request *img_request = obj_request->img_request;
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *osd_req;
+ struct timespec now;
+ struct timespec *mtime;
+ u64 snap_id = CEPH_NOSNAP;
+ u64 offset = obj_request->offset;
+ u64 length = obj_request->length;
+
+ if (img_request) {
+ rbd_assert(img_request->write_request == write_request);
+ if (img_request->write_request)
+ snapc = img_request->snapc;
+ else
+ snap_id = img_request->snap_id;
}
- dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
- object_name, (unsigned long long) ofs,
- (unsigned long long) len, coll, coll_index);
+ /* Allocate and initialize the request, for the single op */
osdc = &rbd_dev->rbd_client->client->osdc;
- req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
- false, GFP_NOIO, pages, bio);
- if (!req) {
- ret = -ENOMEM;
- goto done_pages;
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+ if (!osd_req)
+ return NULL; /* ENOMEM */
+
+ rbd_assert(obj_request_type_valid(obj_request->type));
+ switch (obj_request->type) {
+ case OBJ_REQUEST_NODATA:
+ break; /* Nothing to do */
+ case OBJ_REQUEST_BIO:
+ rbd_assert(obj_request->bio_list != NULL);
+ osd_req->r_bio = obj_request->bio_list;
+ break;
+ case OBJ_REQUEST_PAGES:
+ osd_req->r_pages = obj_request->pages;
+ osd_req->r_num_pages = obj_request->page_count;
+ osd_req->r_page_alignment = offset & ~PAGE_MASK;
+ break;
}
- req->r_callback = rbd_cb;
+ if (write_request) {
+ osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ now = CURRENT_TIME;
+ mtime = &now;
+ } else {
+ osd_req->r_flags = CEPH_OSD_FLAG_READ;
+ mtime = NULL; /* not needed for reads */
+ offset = 0; /* These are not used... */
+ length = 0; /* ...for osd read requests */
+ }
- req_data->rq = rq;
- req_data->bio = bio;
- req_data->pages = pages;
- req_data->len = len;
+ osd_req->r_callback = rbd_osd_req_callback;
+ osd_req->r_priv = obj_request;
- req->r_priv = req_data;
+ osd_req->r_oid_len = strlen(obj_request->object_name);
+ rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
+ memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
- reqhead = req->r_request->front.iov_base;
- reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+ osd_req->r_file_layout = rbd_dev->layout; /* struct */
- strncpy(req->r_oid, object_name, sizeof(req->r_oid));
- req->r_oid_len = strlen(req->r_oid);
+ /* osd_req will get its own reference to snapc (if non-null) */
- layout = &req->r_file_layout;
- memset(layout, 0, sizeof(*layout));
- layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- layout->fl_stripe_count = cpu_to_le32(1);
- layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
- ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
- req, ops);
- rbd_assert(ret == 0);
+ ceph_osdc_build_request(osd_req, offset, length, 1, op,
+ snapc, snap_id, mtime);
- ceph_osdc_build_request(req, ofs, &len,
- ops,
- snapc,
- &mtime,
- req->r_oid, req->r_oid_len);
+ return osd_req;
+}
- if (linger_req) {
- ceph_osdc_set_request_linger(osdc, req);
- *linger_req = req;
- }
+static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
+{
+ ceph_osdc_put_request(osd_req);
+}
- ret = ceph_osdc_start_request(osdc, req, false);
- if (ret < 0)
- goto done_err;
-
- if (!rbd_cb) {
- ret = ceph_osdc_wait_request(osdc, req);
- if (ver)
- *ver = le64_to_cpu(req->r_reassert_version.version);
- dout("reassert_ver=%llu\n",
- (unsigned long long)
- le64_to_cpu(req->r_reassert_version.version));
- ceph_osdc_put_request(req);
+/* object_name is assumed to be a non-null pointer and NUL-terminated */
+
+static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
+ u64 offset, u64 length,
+ enum obj_request_type type)
+{
+ struct rbd_obj_request *obj_request;
+ size_t size;
+ char *name;
+
+ rbd_assert(obj_request_type_valid(type));
+
+ size = strlen(object_name) + 1;
+ obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
+ if (!obj_request)
+ return NULL;
+
+ name = (char *)(obj_request + 1);
+ obj_request->object_name = memcpy(name, object_name, size);
+ obj_request->offset = offset;
+ obj_request->length = length;
+ obj_request->which = BAD_WHICH;
+ obj_request->type = type;
+ INIT_LIST_HEAD(&obj_request->links);
+ obj_request_done_init(obj_request);
+ init_completion(&obj_request->completion);
+ kref_init(&obj_request->kref);
+
+ dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
+ offset, length, (int)type, obj_request);
+
+ return obj_request;
+}
+
+static void rbd_obj_request_destroy(struct kref *kref)
+{
+ struct rbd_obj_request *obj_request;
+
+ obj_request = container_of(kref, struct rbd_obj_request, kref);
+
+ dout("%s: obj %p\n", __func__, obj_request);
+
+ rbd_assert(obj_request->img_request == NULL);
+ rbd_assert(obj_request->which == BAD_WHICH);
+
+ if (obj_request->osd_req)
+ rbd_osd_req_destroy(obj_request->osd_req);
+
+ rbd_assert(obj_request_type_valid(obj_request->type));
+ switch (obj_request->type) {
+ case OBJ_REQUEST_NODATA:
+ break; /* Nothing to do */
+ case OBJ_REQUEST_BIO:
+ if (obj_request->bio_list)
+ bio_chain_put(obj_request->bio_list);
+ break;
+ case OBJ_REQUEST_PAGES:
+ if (obj_request->pages)
+ ceph_release_page_vector(obj_request->pages,
+ obj_request->page_count);
+ break;
}
- return ret;
-done_err:
- bio_chain_put(req_data->bio);
- ceph_osdc_put_request(req);
-done_pages:
- rbd_coll_end_req(req_data, ret, len);
- kfree(req_data);
- return ret;
+ kfree(obj_request);
}
/*
- * Ceph osd op callback
+ * Caller is responsible for filling in the list of object requests
+ * that comprises the image request, and the Linux request pointer
+ * (if there is one).
*/
-static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
-{
- struct rbd_request *req_data = req->r_priv;
- struct ceph_osd_reply_head *replyhead;
- struct ceph_osd_op *op;
- __s32 rc;
- u64 bytes;
- int read_op;
-
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- op = (void *)(replyhead + 1);
- rc = le32_to_cpu(replyhead->result);
- bytes = le64_to_cpu(op->extent.length);
- read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
-
- dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
- (unsigned long long) bytes, read_op, (int) rc);
-
- if (rc == -ENOENT && read_op) {
- zero_bio_chain(req_data->bio, 0);
- rc = 0;
- } else if (rc == 0 && read_op && bytes < req_data->len) {
- zero_bio_chain(req_data->bio, bytes);
- bytes = req_data->len;
- }
+static struct rbd_img_request *rbd_img_request_create(
+ struct rbd_device *rbd_dev,
+ u64 offset, u64 length,
+ bool write_request)
+{
+ struct rbd_img_request *img_request;
+ struct ceph_snap_context *snapc = NULL;
- rbd_coll_end_req(req_data, rc, bytes);
+ img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
+ if (!img_request)
+ return NULL;
- if (req_data->bio)
- bio_chain_put(req_data->bio);
+ if (write_request) {
+ down_read(&rbd_dev->header_rwsem);
+ snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+ up_read(&rbd_dev->header_rwsem);
+ if (WARN_ON(!snapc)) {
+ kfree(img_request);
+ return NULL; /* Shouldn't happen */
+ }
+ }
- ceph_osdc_put_request(req);
- kfree(req_data);
+ img_request->rq = NULL;
+ img_request->rbd_dev = rbd_dev;
+ img_request->offset = offset;
+ img_request->length = length;
+ img_request->write_request = write_request;
+ if (write_request)
+ img_request->snapc = snapc;
+ else
+ img_request->snap_id = rbd_dev->spec->snap_id;
+ spin_lock_init(&img_request->completion_lock);
+ img_request->next_completion = 0;
+ img_request->callback = NULL;
+ img_request->obj_request_count = 0;
+ INIT_LIST_HEAD(&img_request->obj_requests);
+ kref_init(&img_request->kref);
+
+ rbd_img_request_get(img_request); /* Avoid a warning */
+ rbd_img_request_put(img_request); /* TEMPORARY */
+
+ dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
+ write_request ? "write" : "read", offset, length,
+ img_request);
+
+ return img_request;
}
-static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void rbd_img_request_destroy(struct kref *kref)
{
- ceph_osdc_put_request(req);
+ struct rbd_img_request *img_request;
+ struct rbd_obj_request *obj_request;
+ struct rbd_obj_request *next_obj_request;
+
+ img_request = container_of(kref, struct rbd_img_request, kref);
+
+ dout("%s: img %p\n", __func__, img_request);
+
+ for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+ rbd_img_obj_request_del(img_request, obj_request);
+ rbd_assert(img_request->obj_request_count == 0);
+
+ if (img_request->write_request)
+ ceph_put_snap_context(img_request->snapc);
+
+ kfree(img_request);
}
-/*
- * Do a synchronous ceph osd operation
- */
-static int rbd_req_sync_op(struct rbd_device *rbd_dev,
- struct ceph_snap_context *snapc,
- u64 snapid,
- int flags,
- struct ceph_osd_req_op *ops,
- const char *object_name,
- u64 ofs, u64 inbound_size,
- char *inbound,
- struct ceph_osd_request **linger_req,
- u64 *ver)
+static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
+ struct bio *bio_list)
{
- int ret;
- struct page **pages;
- int num_pages;
-
- rbd_assert(ops != NULL);
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ struct rbd_obj_request *obj_request = NULL;
+ struct rbd_obj_request *next_obj_request;
+ unsigned int bio_offset;
+ u64 image_offset;
+ u64 resid;
+ u16 opcode;
+
+ dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
+
+ opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
+ : CEPH_OSD_OP_READ;
+ bio_offset = 0;
+ image_offset = img_request->offset;
+ rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
+ resid = img_request->length;
+ rbd_assert(resid > 0);
+ while (resid) {
+ const char *object_name;
+ unsigned int clone_size;
+ struct ceph_osd_req_op *op;
+ u64 offset;
+ u64 length;
+
+ object_name = rbd_segment_name(rbd_dev, image_offset);
+ if (!object_name)
+ goto out_unwind;
+ offset = rbd_segment_offset(rbd_dev, image_offset);
+ length = rbd_segment_length(rbd_dev, image_offset, resid);
+ obj_request = rbd_obj_request_create(object_name,
+ offset, length,
+ OBJ_REQUEST_BIO);
+ kfree(object_name); /* object request has its own copy */
+ if (!obj_request)
+ goto out_unwind;
+
+ rbd_assert(length <= (u64) UINT_MAX);
+ clone_size = (unsigned int) length;
+ obj_request->bio_list = bio_chain_clone_range(&bio_list,
+ &bio_offset, clone_size,
+ GFP_ATOMIC);
+ if (!obj_request->bio_list)
+ goto out_partial;
- num_pages = calc_pages_for(ofs, inbound_size);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ /*
+ * Build up the op to use in building the osd
+ * request. Note that the contents of the op are
+ * copied by rbd_osd_req_create().
+ */
+ op = rbd_osd_req_op_create(opcode, offset, length);
+ if (!op)
+ goto out_partial;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev,
+ img_request->write_request,
+ obj_request, op);
+ rbd_osd_req_op_destroy(op);
+ if (!obj_request->osd_req)
+ goto out_partial;
+ /* status and version are initially zero-filled */
+
+ rbd_img_obj_request_add(img_request, obj_request);
+
+ image_offset += length;
+ resid -= length;
+ }
- ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
- object_name, ofs, inbound_size, NULL,
- pages, num_pages,
- flags,
- ops,
- NULL, 0,
- NULL,
- linger_req, ver);
- if (ret < 0)
- goto done;
+ return 0;
- if ((flags & CEPH_OSD_FLAG_READ) && inbound)
- ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
+out_partial:
+ rbd_obj_request_put(obj_request);
+out_unwind:
+ for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+ rbd_obj_request_put(obj_request);
-done:
- ceph_release_page_vector(pages, num_pages);
- return ret;
+ return -ENOMEM;
}
-/*
- * Do an asynchronous ceph osd operation
- */
-static int rbd_do_op(struct request *rq,
- struct rbd_device *rbd_dev,
- struct ceph_snap_context *snapc,
- u64 ofs, u64 len,
- struct bio *bio,
- struct rbd_req_coll *coll,
- int coll_index)
-{
- char *seg_name;
- u64 seg_ofs;
- u64 seg_len;
- int ret;
- struct ceph_osd_req_op *ops;
- u32 payload_len;
- int opcode;
- int flags;
- u64 snapid;
-
- seg_name = rbd_segment_name(rbd_dev, ofs);
- if (!seg_name)
- return -ENOMEM;
- seg_len = rbd_segment_length(rbd_dev, ofs, len);
- seg_ofs = rbd_segment_offset(rbd_dev, ofs);
-
- if (rq_data_dir(rq) == WRITE) {
- opcode = CEPH_OSD_OP_WRITE;
- flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
- snapid = CEPH_NOSNAP;
- payload_len = seg_len;
- } else {
- opcode = CEPH_OSD_OP_READ;
- flags = CEPH_OSD_FLAG_READ;
- snapc = NULL;
- snapid = rbd_dev->spec->snap_id;
- payload_len = 0;
+static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ u32 which = obj_request->which;
+ bool more = true;
+
+ img_request = obj_request->img_request;
+
+ dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
+ rbd_assert(img_request != NULL);
+ rbd_assert(img_request->rq != NULL);
+ rbd_assert(img_request->obj_request_count > 0);
+ rbd_assert(which != BAD_WHICH);
+ rbd_assert(which < img_request->obj_request_count);
+ rbd_assert(which >= img_request->next_completion);
+
+ spin_lock_irq(&img_request->completion_lock);
+ if (which != img_request->next_completion)
+ goto out;
+
+ for_each_obj_request_from(img_request, obj_request) {
+ unsigned int xferred;
+ int result;
+
+ rbd_assert(more);
+ rbd_assert(which < img_request->obj_request_count);
+
+ if (!obj_request_done_test(obj_request))
+ break;
+
+ rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
+ xferred = (unsigned int) obj_request->xferred;
+ result = (int) obj_request->result;
+ if (result)
+ rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
+ img_request->write_request ? "write" : "read",
+ result, xferred);
+
+ more = blk_end_request(img_request->rq, result, xferred);
+ which++;
}
- ret = -ENOMEM;
- ops = rbd_create_rw_ops(1, opcode, payload_len);
- if (!ops)
- goto done;
+ rbd_assert(more ^ (which == img_request->obj_request_count));
+ img_request->next_completion = which;
+out:
+ spin_unlock_irq(&img_request->completion_lock);
- /* we've taken care of segment sizes earlier when we
- cloned the bios. We should never have a segment
- truncated at this point */
- rbd_assert(seg_len == len);
-
- ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
- seg_name, seg_ofs, seg_len,
- bio,
- NULL, 0,
- flags,
- ops,
- coll, coll_index,
- rbd_req_cb, 0, NULL);
-
- rbd_destroy_ops(ops);
-done:
- kfree(seg_name);
- return ret;
+ if (!more)
+ rbd_img_request_complete(img_request);
}
-/*
- * Request sync osd read
- */
-static int rbd_req_sync_read(struct rbd_device *rbd_dev,
- u64 snapid,
- const char *object_name,
- u64 ofs, u64 len,
- char *buf,
- u64 *ver)
-{
- struct ceph_osd_req_op *ops;
- int ret;
+static int rbd_img_request_submit(struct rbd_img_request *img_request)
+{
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
- ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
- if (!ops)
- return -ENOMEM;
+ dout("%s: img %p\n", __func__, img_request);
+ for_each_obj_request(img_request, obj_request) {
+ int ret;
- ret = rbd_req_sync_op(rbd_dev, NULL,
- snapid,
- CEPH_OSD_FLAG_READ,
- ops, object_name, ofs, len, buf, NULL, ver);
- rbd_destroy_ops(ops);
+ obj_request->callback = rbd_img_obj_callback;
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ return ret;
+ /*
+ * The image request has its own reference to each
+ * of its object requests, so we can safely drop the
+ * initial one here.
+ */
+ rbd_obj_request_put(obj_request);
+ }
- return ret;
+ return 0;
}
-/*
- * Request sync osd watch
- */
-static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
- u64 ver,
- u64 notify_id)
+static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
+ u64 ver, u64 notify_id)
{
- struct ceph_osd_req_op *ops;
+ struct rbd_obj_request *obj_request;
+ struct ceph_osd_req_op *op;
+ struct ceph_osd_client *osdc;
int ret;
- ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
- if (!ops)
+ obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+ OBJ_REQUEST_NODATA);
+ if (!obj_request)
return -ENOMEM;
- ops[0].watch.ver = cpu_to_le64(ver);
- ops[0].watch.cookie = notify_id;
- ops[0].watch.flag = 0;
+ ret = -ENOMEM;
+ op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
+ if (!op)
+ goto out;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
+ obj_request, op);
+ rbd_osd_req_op_destroy(op);
+ if (!obj_request->osd_req)
+ goto out;
- ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
- rbd_dev->header_name, 0, 0, NULL,
- NULL, 0,
- CEPH_OSD_FLAG_READ,
- ops,
- NULL, 0,
- rbd_simple_req_cb, 0, NULL);
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ obj_request->callback = rbd_obj_request_put;
+ ret = rbd_obj_request_submit(osdc, obj_request);
+out:
+ if (ret)
+ rbd_obj_request_put(obj_request);
- rbd_destroy_ops(ops);
return ret;
}
@@ -1381,95 +1791,103 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
if (!rbd_dev)
return;
- dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
+ dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
rbd_dev->header_name, (unsigned long long) notify_id,
(unsigned int) opcode);
rc = rbd_dev_refresh(rbd_dev, &hver);
if (rc)
- pr_warning(RBD_DRV_NAME "%d got notification but failed to "
- " update snaps: %d\n", rbd_dev->major, rc);
+ rbd_warn(rbd_dev, "got notification but failed to "
+ " update snaps: %d\n", rc);
- rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
+ rbd_obj_notify_ack(rbd_dev, hver, notify_id);
}
/*
- * Request sync osd watch
+ * Request sync osd watch/unwatch. The value of "start" determines
+ * whether a watch request is being initiated or torn down.
*/
-static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
- struct ceph_osd_req_op *ops;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ struct ceph_osd_req_op *op;
int ret;
- ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
- if (!ops)
- return -ENOMEM;
+ rbd_assert(start ^ !!rbd_dev->watch_event);
+ rbd_assert(start ^ !!rbd_dev->watch_request);
- ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
- (void *)rbd_dev, &rbd_dev->watch_event);
- if (ret < 0)
- goto fail;
+ if (start) {
+ ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+ &rbd_dev->watch_event);
+ if (ret < 0)
+ return ret;
+ rbd_assert(rbd_dev->watch_event != NULL);
+ }
- ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
- ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
- ops[0].watch.flag = 1;
+ ret = -ENOMEM;
+ obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+ OBJ_REQUEST_NODATA);
+ if (!obj_request)
+ goto out_cancel;
+
+ op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
+ rbd_dev->watch_event->cookie,
+ rbd_dev->header.obj_version, start);
+ if (!op)
+ goto out_cancel;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
+ obj_request, op);
+ rbd_osd_req_op_destroy(op);
+ if (!obj_request->osd_req)
+ goto out_cancel;
+
+ if (start)
+ ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
+ else
+ ceph_osdc_unregister_linger_request(osdc,
+ rbd_dev->watch_request->osd_req);
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out_cancel;
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out_cancel;
+ ret = obj_request->result;
+ if (ret)
+ goto out_cancel;
- ret = rbd_req_sync_op(rbd_dev, NULL,
- CEPH_NOSNAP,
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- ops,
- rbd_dev->header_name,
- 0, 0, NULL,
- &rbd_dev->watch_request, NULL);
+ /*
+ * A watch request is set to linger, so the underlying osd
+ * request won't go away until we unregister it. We retain
+ * a pointer to the object request during that time (in
+ * rbd_dev->watch_request), so we'll keep a reference to
+ * it. We'll drop that reference (below) after we've
+ * unregistered it.
+ */
+ if (start) {
+ rbd_dev->watch_request = obj_request;
- if (ret < 0)
- goto fail_event;
+ return 0;
+ }
- rbd_destroy_ops(ops);
- return 0;
+ /* We have successfully torn down the watch request */
-fail_event:
+ rbd_obj_request_put(rbd_dev->watch_request);
+ rbd_dev->watch_request = NULL;
+out_cancel:
+ /* Cancel the event if we're tearing down, or on error */
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
-fail:
- rbd_destroy_ops(ops);
- return ret;
-}
+ if (obj_request)
+ rbd_obj_request_put(obj_request);
-/*
- * Request sync osd unwatch
- */
-static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
-{
- struct ceph_osd_req_op *ops;
- int ret;
-
- ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
- if (!ops)
- return -ENOMEM;
-
- ops[0].watch.ver = 0;
- ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
- ops[0].watch.flag = 0;
-
- ret = rbd_req_sync_op(rbd_dev, NULL,
- CEPH_NOSNAP,
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- ops,
- rbd_dev->header_name,
- 0, 0, NULL, NULL, NULL);
-
-
- rbd_destroy_ops(ops);
- ceph_osdc_cancel_event(rbd_dev->watch_event);
- rbd_dev->watch_event = NULL;
return ret;
}
/*
* Synchronous osd object method call
*/
-static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
+static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
const char *object_name,
const char *class_name,
const char *method_name,
@@ -1477,169 +1895,154 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
size_t outbound_size,
char *inbound,
size_t inbound_size,
- int flags,
- u64 *ver)
+ u64 *version)
{
- struct ceph_osd_req_op *ops;
- int class_name_len = strlen(class_name);
- int method_name_len = strlen(method_name);
- int payload_size;
+ struct rbd_obj_request *obj_request;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_req_op *op;
+ struct page **pages;
+ u32 page_count;
int ret;
/*
- * Any input parameters required by the method we're calling
- * will be sent along with the class and method names as
- * part of the message payload. That data and its size are
- * supplied via the indata and indata_len fields (named from
- * the perspective of the server side) in the OSD request
- * operation.
+ * Method calls are ultimately read operations but they
+ * don't involve object data (so no offset or length).
+ * The result should be placed into the inbound buffer
+ * provided. They also supply outbound data--parameters for
+ * the object method. Currently if this is present it will
+ * be a snapshot id.
*/
- payload_size = class_name_len + method_name_len + outbound_size;
- ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
- if (!ops)
- return -ENOMEM;
+ page_count = (u32) calc_pages_for(0, inbound_size);
+ pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
- ops[0].cls.class_name = class_name;
- ops[0].cls.class_len = (__u8) class_name_len;
- ops[0].cls.method_name = method_name;
- ops[0].cls.method_len = (__u8) method_name_len;
- ops[0].cls.argc = 0;
- ops[0].cls.indata = outbound;
- ops[0].cls.indata_len = outbound_size;
+ ret = -ENOMEM;
+ obj_request = rbd_obj_request_create(object_name, 0, 0,
+ OBJ_REQUEST_PAGES);
+ if (!obj_request)
+ goto out;
- ret = rbd_req_sync_op(rbd_dev, NULL,
- CEPH_NOSNAP,
- flags, ops,
- object_name, 0, inbound_size, inbound,
- NULL, ver);
+ obj_request->pages = pages;
+ obj_request->page_count = page_count;
- rbd_destroy_ops(ops);
+ op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
+ method_name, outbound, outbound_size);
+ if (!op)
+ goto out;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
+ obj_request, op);
+ rbd_osd_req_op_destroy(op);
+ if (!obj_request->osd_req)
+ goto out;
- dout("cls_exec returned %d\n", ret);
- return ret;
-}
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out;
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out;
-static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
-{
- struct rbd_req_coll *coll =
- kzalloc(sizeof(struct rbd_req_coll) +
- sizeof(struct rbd_req_status) * num_reqs,
- GFP_ATOMIC);
+ ret = obj_request->result;
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
+ if (version)
+ *version = obj_request->version;
+out:
+ if (obj_request)
+ rbd_obj_request_put(obj_request);
+ else
+ ceph_release_page_vector(pages, page_count);
- if (!coll)
- return NULL;
- coll->total = num_reqs;
- kref_init(&coll->kref);
- return coll;
+ return ret;
}
-/*
- * block device queue callback
- */
-static void rbd_rq_fn(struct request_queue *q)
+static void rbd_request_fn(struct request_queue *q)
+ __releases(q->queue_lock) __acquires(q->queue_lock)
{
struct rbd_device *rbd_dev = q->queuedata;
+ bool read_only = rbd_dev->mapping.read_only;
struct request *rq;
+ int result;
while ((rq = blk_fetch_request(q))) {
- struct bio *bio;
- bool do_write;
- unsigned int size;
- u64 ofs;
- int num_segs, cur_seg = 0;
- struct rbd_req_coll *coll;
- struct ceph_snap_context *snapc;
- unsigned int bio_offset;
-
- dout("fetched request\n");
-
- /* filter out block requests we don't understand */
- if ((rq->cmd_type != REQ_TYPE_FS)) {
- __blk_end_request_all(rq, 0);
- continue;
- }
+ bool write_request = rq_data_dir(rq) == WRITE;
+ struct rbd_img_request *img_request;
+ u64 offset;
+ u64 length;
+
+ /* Ignore any non-FS requests that filter through. */
- /* deduce our operation (read, write) */
- do_write = (rq_data_dir(rq) == WRITE);
- if (do_write && rbd_dev->mapping.read_only) {
- __blk_end_request_all(rq, -EROFS);
+ if (rq->cmd_type != REQ_TYPE_FS) {
+ dout("%s: non-fs request type %d\n", __func__,
+ (int) rq->cmd_type);
+ __blk_end_request_all(rq, 0);
continue;
}
- spin_unlock_irq(q->queue_lock);
+ /* Ignore/skip any zero-length requests */
- down_read(&rbd_dev->header_rwsem);
+ offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
+ length = (u64) blk_rq_bytes(rq);
- if (!rbd_dev->exists) {
- rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
- up_read(&rbd_dev->header_rwsem);
- dout("request for non-existent snapshot");
- spin_lock_irq(q->queue_lock);
- __blk_end_request_all(rq, -ENXIO);
+ if (!length) {
+ dout("%s: zero-length request\n", __func__);
+ __blk_end_request_all(rq, 0);
continue;
}
- snapc = ceph_get_snap_context(rbd_dev->header.snapc);
-
- up_read(&rbd_dev->header_rwsem);
-
- size = blk_rq_bytes(rq);
- ofs = blk_rq_pos(rq) * SECTOR_SIZE;
- bio = rq->bio;
+ spin_unlock_irq(q->queue_lock);
- dout("%s 0x%x bytes at 0x%llx\n",
- do_write ? "write" : "read",
- size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
+ /* Disallow writes to a read-only device */
- num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
- if (num_segs <= 0) {
- spin_lock_irq(q->queue_lock);
- __blk_end_request_all(rq, num_segs);
- ceph_put_snap_context(snapc);
- continue;
+ if (write_request) {
+ result = -EROFS;
+ if (read_only)
+ goto end_request;
+ rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
}
- coll = rbd_alloc_coll(num_segs);
- if (!coll) {
- spin_lock_irq(q->queue_lock);
- __blk_end_request_all(rq, -ENOMEM);
- ceph_put_snap_context(snapc);
- continue;
- }
-
- bio_offset = 0;
- do {
- u64 limit = rbd_segment_length(rbd_dev, ofs, size);
- unsigned int chain_size;
- struct bio *bio_chain;
-
- BUG_ON(limit > (u64) UINT_MAX);
- chain_size = (unsigned int) limit;
- dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
- kref_get(&coll->kref);
+ /*
+ * Quit early if the mapped snapshot no longer
+ * exists. It's still possible the snapshot will
+ * have disappeared by the time our request arrives
+ * at the osd, but there's no sense in sending it if
+ * we already know.
+ */
+ if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+ dout("request for non-existent snapshot");
+ rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+ result = -ENXIO;
+ goto end_request;
+ }
- /* Pass a cloned bio chain via an osd request */
+ result = -EINVAL;
+ if (WARN_ON(offset && length > U64_MAX - offset + 1))
+ goto end_request; /* Shouldn't happen */
- bio_chain = bio_chain_clone_range(&bio,
- &bio_offset, chain_size,
- GFP_ATOMIC);
- if (bio_chain)
- (void) rbd_do_op(rq, rbd_dev, snapc,
- ofs, chain_size,
- bio_chain, coll, cur_seg);
- else
- rbd_coll_end_req_index(rq, coll, cur_seg,
- -ENOMEM, chain_size);
- size -= chain_size;
- ofs += chain_size;
+ result = -ENOMEM;
+ img_request = rbd_img_request_create(rbd_dev, offset, length,
+ write_request);
+ if (!img_request)
+ goto end_request;
- cur_seg++;
- } while (size > 0);
- kref_put(&coll->kref, rbd_coll_release);
+ img_request->rq = rq;
+ result = rbd_img_request_fill_bio(img_request, rq->bio);
+ if (!result)
+ result = rbd_img_request_submit(img_request);
+ if (result)
+ rbd_img_request_put(img_request);
+end_request:
spin_lock_irq(q->queue_lock);
-
- ceph_put_snap_context(snapc);
+ if (result < 0) {
+ rbd_warn(rbd_dev, "obj_request %s result %d\n",
+ write_request ? "write" : "read", result);
+ __blk_end_request_all(rq, result);
+ }
}
}
@@ -1703,6 +2106,71 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
put_disk(disk);
}
+/*
+ * Synchronously read "length" bytes starting at "offset" from the
+ * object named "object_name" and copy what was transferred into "buf".
+ *
+ * Returns the number of bytes copied on success, or an error code if
+ * allocation, submission, the wait, or the osd request itself fails.
+ * If "version" is non-null it is set to the object request's version
+ * on success only.
+ */
+static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
+				const char *object_name,
+				u64 offset, u64 length,
+				char *buf, u64 *version)
+{
+	struct ceph_osd_req_op *op;
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_client *osdc;
+	struct page **pages;
+	u32 page_count;
+	size_t size;
+	int ret;
+
+	page_count = (u32) calc_pages_for(offset, length);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	/*
+	 * Bail out right away on failure.  Nothing has been allocated
+	 * yet, and "pages" holds an encoded error that must never be
+	 * stored in the object request or handed to the copy/release
+	 * helpers below.
+	 */
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	ret = -ENOMEM;
+	obj_request = rbd_obj_request_create(object_name, offset, length,
+							OBJ_REQUEST_PAGES);
+	if (!obj_request)
+		goto out;
+
+	/*
+	 * From here on the page vector is attached to the object
+	 * request, so the error path drops the request rather than
+	 * releasing the pages directly (see "out" below).
+	 */
+	obj_request->pages = pages;
+	obj_request->page_count = page_count;
+
+	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
+	if (!op)
+		goto out;
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
+						obj_request, op);
+	rbd_osd_req_op_destroy(op);
+	if (!obj_request->osd_req)
+		goto out;
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out;
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out;
+
+	ret = obj_request->result;
+	if (ret < 0)
+		goto out;
+
+	/* The byte count is returned as an int, so it must fit in one */
+	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
+	size = (size_t) obj_request->xferred;
+	ceph_copy_from_page_vector(pages, buf, 0, size);
+	rbd_assert(size <= (size_t) INT_MAX);
+	ret = (int) size;
+	if (version)
+		*version = obj_request->version;
+out:
+	if (obj_request)
+		rbd_obj_request_put(obj_request);
+	else
+		ceph_release_page_vector(pages, page_count);
+
+	return ret;
+}
+
/*
* Read the complete header for the given rbd device.
*
@@ -1741,24 +2209,20 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
if (!ondisk)
return ERR_PTR(-ENOMEM);
- ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
- rbd_dev->header_name,
+ ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
0, size,
(char *) ondisk, version);
-
if (ret < 0)
goto out_err;
if (WARN_ON((size_t) ret < size)) {
ret = -ENXIO;
- pr_warning("short header read for image %s"
- " (want %zd got %d)\n",
- rbd_dev->spec->image_name, size, ret);
+ rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+ size, ret);
goto out_err;
}
if (!rbd_dev_ondisk_valid(ondisk)) {
ret = -ENXIO;
- pr_warning("invalid header for image %s\n",
- rbd_dev->spec->image_name);
+ rbd_warn(rbd_dev, "invalid header");
goto out_err;
}
@@ -1895,8 +2359,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
disk->fops = &rbd_bd_ops;
disk->private_data = rbd_dev;
- /* init rq */
- q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+ q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
if (!q)
goto out_disk;
@@ -2233,7 +2696,7 @@ static void rbd_spec_free(struct kref *kref)
kfree(spec);
}
-struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
struct rbd_spec *spec)
{
struct rbd_device *rbd_dev;
@@ -2243,6 +2706,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
return NULL;
spin_lock_init(&rbd_dev->lock);
+ rbd_dev->flags = 0;
INIT_LIST_HEAD(&rbd_dev->node);
INIT_LIST_HEAD(&rbd_dev->snaps);
init_rwsem(&rbd_dev->header_rwsem);
@@ -2250,6 +2714,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
rbd_dev->spec = spec;
rbd_dev->rbd_client = rbdc;
+ /* Initialize the layout used for all rbd requests */
+
+ rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
+ rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+
return rbd_dev;
}
@@ -2360,12 +2831,11 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
__le64 size;
} __attribute__ ((packed)) size_buf = { 0 };
- ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
"rbd", "get_size",
(char *) &snapid, sizeof (snapid),
- (char *) &size_buf, sizeof (size_buf),
- CEPH_OSD_FLAG_READ, NULL);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ (char *) &size_buf, sizeof (size_buf), NULL);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
return ret;
@@ -2396,15 +2866,13 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
if (!reply_buf)
return -ENOMEM;
- ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
"rbd", "get_object_prefix",
NULL, 0,
- reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
- CEPH_OSD_FLAG_READ, NULL);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
- ret = 0; /* rbd_req_sync_exec() can return positive */
p = reply_buf;
rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -2435,12 +2903,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
u64 incompat;
int ret;
- ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
"rbd", "get_features",
(char *) &snapid, sizeof (snapid),
(char *) &features_buf, sizeof (features_buf),
- CEPH_OSD_FLAG_READ, NULL);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ NULL);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
return ret;
@@ -2474,7 +2942,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
void *end;
char *image_id;
u64 overlap;
- size_t len = 0;
int ret;
parent_spec = rbd_spec_alloc();
@@ -2492,12 +2959,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
snapid = cpu_to_le64(CEPH_NOSNAP);
- ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
"rbd", "get_parent",
(char *) &snapid, sizeof (snapid),
- (char *) reply_buf, size,
- CEPH_OSD_FLAG_READ, NULL);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ (char *) reply_buf, size, NULL);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out_err;
@@ -2508,13 +2974,18 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
if (parent_spec->pool_id == CEPH_NOPOOL)
goto out; /* No parent? No problem. */
- image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+ /* The ceph file layout needs to fit pool id in 32 bits */
+
+ ret = -EIO;
+ if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
+ goto out;
+
+ image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
if (IS_ERR(image_id)) {
ret = PTR_ERR(image_id);
goto out_err;
}
parent_spec->image_id = image_id;
- parent_spec->image_id_len = len;
ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
ceph_decode_64_safe(&p, end, overlap, out_err);
@@ -2544,26 +3015,25 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
rbd_assert(!rbd_dev->spec->image_name);
- image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
+ len = strlen(rbd_dev->spec->image_id);
+ image_id_size = sizeof (__le32) + len;
image_id = kmalloc(image_id_size, GFP_KERNEL);
if (!image_id)
return NULL;
p = image_id;
end = (char *) image_id + image_id_size;
- ceph_encode_string(&p, end, rbd_dev->spec->image_id,
- (u32) rbd_dev->spec->image_id_len);
+ ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
reply_buf = kmalloc(size, GFP_KERNEL);
if (!reply_buf)
goto out;
- ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
+ ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
"rbd", "dir_get_name",
image_id, image_id_size,
- (char *) reply_buf, size,
- CEPH_OSD_FLAG_READ, NULL);
+ (char *) reply_buf, size, NULL);
if (ret < 0)
goto out;
p = reply_buf;
@@ -2602,8 +3072,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
osdc = &rbd_dev->rbd_client->client->osdc;
name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
- if (!name)
- return -EIO; /* pool id too large (>= 2^31) */
+ if (!name) {
+ rbd_warn(rbd_dev, "there is no pool with id %llu",
+ rbd_dev->spec->pool_id); /* Really a BUG() */
+ return -EIO;
+ }
rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
if (!rbd_dev->spec->pool_name)
@@ -2612,19 +3085,17 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
/* Fetch the image name; tolerate failure here */
name = rbd_dev_image_name(rbd_dev);
- if (name) {
- rbd_dev->spec->image_name_len = strlen(name);
+ if (name)
rbd_dev->spec->image_name = (char *) name;
- } else {
- pr_warning(RBD_DRV_NAME "%d "
- "unable to get image name for image id %s\n",
- rbd_dev->major, rbd_dev->spec->image_id);
- }
+ else
+ rbd_warn(rbd_dev, "unable to get image name");
/* Look up the snapshot name. */
name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
if (!name) {
+ rbd_warn(rbd_dev, "no snapshot with id %llu",
+ rbd_dev->spec->snap_id); /* Really a BUG() */
ret = -EIO;
goto out_err;
}
@@ -2665,12 +3136,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
if (!reply_buf)
return -ENOMEM;
- ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
"rbd", "get_snapcontext",
NULL, 0,
- reply_buf, size,
- CEPH_OSD_FLAG_READ, ver);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ reply_buf, size, ver);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
@@ -2735,12 +3205,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
return ERR_PTR(-ENOMEM);
snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
- ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
"rbd", "get_snapshot_name",
(char *) &snap_id, sizeof (snap_id),
- reply_buf, size,
- CEPH_OSD_FLAG_READ, NULL);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ reply_buf, size, NULL);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
@@ -2766,7 +3235,7 @@ out:
static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
u64 *snap_size, u64 *snap_features)
{
- __le64 snap_id;
+ u64 snap_id;
u8 order;
int ret;
@@ -2865,10 +3334,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
struct list_head *next = links->next;
- /* Existing snapshot not in the new snap context */
-
+ /*
+ * A previously-existing snapshot is not in
+ * the new snap context.
+ *
+ * If the now missing snapshot is the one the
+ * image is mapped to, clear its exists flag
+ * so we can avoid sending any more requests
+ * to it.
+ */
if (rbd_dev->spec->snap_id == snap->id)
- rbd_dev->exists = false;
+ clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
rbd_remove_snap_dev(snap);
dout("%ssnap id %llu has been removed\n",
rbd_dev->spec->snap_id == snap->id ?
@@ -2942,7 +3418,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
struct rbd_snap *snap;
int ret = 0;
- dout("%s called\n", __func__);
+ dout("%s:\n", __func__);
if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
return -EIO;
@@ -2983,22 +3459,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
device_unregister(&rbd_dev->dev);
}
-static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
-{
- int ret, rc;
-
- do {
- ret = rbd_req_sync_watch(rbd_dev);
- if (ret == -ERANGE) {
- rc = rbd_dev_refresh(rbd_dev, NULL);
- if (rc < 0)
- return rc;
- }
- } while (ret == -ERANGE);
-
- return ret;
-}
-
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
/*
@@ -3138,11 +3598,9 @@ static inline char *dup_token(const char **buf, size_t *lenp)
size_t len;
len = next_token(buf);
- dup = kmalloc(len + 1, GFP_KERNEL);
+ dup = kmemdup(*buf, len + 1, GFP_KERNEL);
if (!dup)
return NULL;
-
- memcpy(dup, *buf, len);
*(dup + len) = '\0';
*buf += len;
@@ -3210,8 +3668,10 @@ static int rbd_add_parse_args(const char *buf,
/* The first four tokens are required */
len = next_token(&buf);
- if (!len)
- return -EINVAL; /* Missing monitor address(es) */
+ if (!len) {
+ rbd_warn(NULL, "no monitor address(es) provided");
+ return -EINVAL;
+ }
mon_addrs = buf;
mon_addrs_size = len + 1;
buf += len;
@@ -3220,8 +3680,10 @@ static int rbd_add_parse_args(const char *buf,
options = dup_token(&buf, NULL);
if (!options)
return -ENOMEM;
- if (!*options)
- goto out_err; /* Missing options */
+ if (!*options) {
+ rbd_warn(NULL, "no options provided");
+ goto out_err;
+ }
spec = rbd_spec_alloc();
if (!spec)
@@ -3230,14 +3692,18 @@ static int rbd_add_parse_args(const char *buf,
spec->pool_name = dup_token(&buf, NULL);
if (!spec->pool_name)
goto out_mem;
- if (!*spec->pool_name)
- goto out_err; /* Missing pool name */
+ if (!*spec->pool_name) {
+ rbd_warn(NULL, "no pool name provided");
+ goto out_err;
+ }
- spec->image_name = dup_token(&buf, &spec->image_name_len);
+ spec->image_name = dup_token(&buf, NULL);
if (!spec->image_name)
goto out_mem;
- if (!*spec->image_name)
- goto out_err; /* Missing image name */
+ if (!*spec->image_name) {
+ rbd_warn(NULL, "no image name provided");
+ goto out_err;
+ }
/*
* Snapshot name is optional; default is to use "-"
@@ -3251,10 +3717,9 @@ static int rbd_add_parse_args(const char *buf,
ret = -ENAMETOOLONG;
goto out_err;
}
- spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
+ spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
if (!spec->snap_name)
goto out_mem;
- memcpy(spec->snap_name, buf, len);
*(spec->snap_name + len) = '\0';
/* Initialize all rbd options to the defaults */
@@ -3323,7 +3788,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
* First, see if the format 2 image id file exists, and if
* so, get the image's persistent id from it.
*/
- size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
+ size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
object_name = kmalloc(size, GFP_NOIO);
if (!object_name)
return -ENOMEM;
@@ -3339,21 +3804,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
goto out;
}
- ret = rbd_req_sync_exec(rbd_dev, object_name,
+ ret = rbd_obj_method_sync(rbd_dev, object_name,
"rbd", "get_id",
NULL, 0,
- response, RBD_IMAGE_ID_LEN_MAX,
- CEPH_OSD_FLAG_READ, NULL);
- dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+ response, RBD_IMAGE_ID_LEN_MAX, NULL);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
- ret = 0; /* rbd_req_sync_exec() can return positive */
p = response;
rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
p + RBD_IMAGE_ID_LEN_MAX,
- &rbd_dev->spec->image_id_len,
- GFP_NOIO);
+ NULL, GFP_NOIO);
if (IS_ERR(rbd_dev->spec->image_id)) {
ret = PTR_ERR(rbd_dev->spec->image_id);
rbd_dev->spec->image_id = NULL;
@@ -3377,11 +3839,10 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
if (!rbd_dev->spec->image_id)
return -ENOMEM;
- rbd_dev->spec->image_id_len = 0;
/* Record the header object name for this rbd image. */
- size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
+ size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
if (!rbd_dev->header_name) {
ret = -ENOMEM;
@@ -3427,7 +3888,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
* Image id was filled in by the caller. Record the header
* object name for this rbd image.
*/
- size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
+ size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
if (!rbd_dev->header_name)
return -ENOMEM;
@@ -3542,7 +4003,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
if (ret)
goto err_out_bus;
- ret = rbd_init_watch_dev(rbd_dev);
+ ret = rbd_dev_header_watch_sync(rbd_dev, 1);
if (ret)
goto err_out_bus;
@@ -3638,6 +4099,13 @@ static ssize_t rbd_add(struct bus_type *bus,
goto err_out_client;
spec->pool_id = (u64) rc;
+ /* The ceph file layout needs to fit pool id in 32 bits */
+
+ if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
+ rc = -EIO;
+ goto err_out_client;
+ }
+
rbd_dev = rbd_dev_create(rbdc, spec);
if (!rbd_dev)
goto err_out_client;
@@ -3691,15 +4159,8 @@ static void rbd_dev_release(struct device *dev)
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- if (rbd_dev->watch_request) {
- struct ceph_client *client = rbd_dev->rbd_client->client;
-
- ceph_osdc_unregister_linger_request(&client->osdc,
- rbd_dev->watch_request);
- }
if (rbd_dev->watch_event)
- rbd_req_sync_unwatch(rbd_dev);
-
+ rbd_dev_header_watch_sync(rbd_dev, 0);
/* clean up and free blkdev */
rbd_free_disk(rbd_dev);
@@ -3743,10 +4204,14 @@ static ssize_t rbd_remove(struct bus_type *bus,
goto done;
}
- if (rbd_dev->open_count) {
+ spin_lock_irq(&rbd_dev->lock);
+ if (rbd_dev->open_count)
ret = -EBUSY;
+ else
+ set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+ spin_unlock_irq(&rbd_dev->lock);
+ if (ret < 0)
goto done;
- }
rbd_remove_all_snaps(rbd_dev);
rbd_bus_del_dev(rbd_dev);
@@ -3782,10 +4247,15 @@ static void rbd_sysfs_cleanup(void)
device_unregister(&rbd_root_dev);
}
-int __init rbd_init(void)
+static int __init rbd_init(void)
{
int rc;
+ if (!libceph_compatible(NULL)) {
+ rbd_warn(NULL, "libceph incompatibility (quitting)");
+
+ return -EINVAL;
+ }
rc = rbd_sysfs_init();
if (rc)
return rc;
@@ -3793,7 +4263,7 @@ int __init rbd_init(void)
return 0;
}
-void __exit rbd_exit(void)
+static void __exit rbd_exit(void)
{
rbd_sysfs_cleanup();
}
diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile
new file mode 100644
index 000000000000..f35cd0b71f7b
--- /dev/null
+++ b/drivers/block/rsxx/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o
+rsxx-y := config.o core.o cregs.o dev.o dma.o
diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c
new file mode 100644
index 000000000000..a295e7e9ee41
--- /dev/null
+++ b/drivers/block/rsxx/config.c
@@ -0,0 +1,213 @@
+/*
+* Filename: config.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/swab.h>
+
+#include "rsxx_priv.h"
+#include "rsxx_cfg.h"
+
+/*
+ * Fill a card configuration with the driver's built-in defaults.
+ *
+ * @config is written through as a struct rsxx_card_cfg.
+ */
+static void initialize_config(void *config)
+{
+	struct rsxx_card_cfg *defaults = config;
+
+	/* Interrupt coalescing starts out disabled with zeroed tunables. */
+	defaults->data.intr_coal.latency = 0;
+	defaults->data.intr_coal.count = 0;
+	defaults->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED;
+
+	defaults->data.cache_order = (-1);
+	defaults->data.vendor_id = RSXX_VENDOR_ID_TMS_IBM;
+	defaults->data.stripe_size = RSXX_HW_BLK_SIZE;
+	defaults->data.block_size = RSXX_HW_BLK_SIZE;
+
+	defaults->hdr.version = RSXX_CFG_VERSION;
+}
+
+/*
+ * Compute the checksum kept in the config header: the bitwise
+ * complement of crc32() over the data section, seeded with ~0.
+ * The complement is returned to stay compatible with how early rsxx
+ * drivers did it.
+ */
+static u32 config_data_crc32(struct rsxx_card_cfg *cfg)
+{
+	u32 raw_crc = crc32(~0, &cfg->data, sizeof(cfg->data));
+
+	return ~raw_crc;
+}
+
+
+/*----------------- Config Byte Swap Functions -------------------*/
+/* Convert both header fields from big endian to CPU byte order, in place. */
+static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr)
+{
+	__be32 raw_version = (__force __be32) hdr->version;
+	__be32 raw_crc = (__force __be32) hdr->crc;
+
+	hdr->version = be32_to_cpu(raw_version);
+	hdr->crc = be32_to_cpu(raw_crc);
+}
+
+/* Convert both header fields from CPU byte order to big endian, in place. */
+static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr)
+{
+	__be32 be_version = cpu_to_be32(hdr->version);
+	__be32 be_crc = cpu_to_be32(hdr->crc);
+
+	hdr->version = (__force u32) be_version;
+	hdr->crc = (__force u32) be_crc;
+}
+
+/*
+ * Unconditionally byte swap the config data section in place, treating
+ * it as an array of 32-bit words.
+ */
+static void config_data_swab(struct rsxx_card_cfg *cfg)
+{
+	u32 *word = (u32 *) &cfg->data;
+	int remaining = sizeof(cfg->data) / 4;
+
+	while (remaining-- > 0) {
+		*word = swab32(*word);
+		word++;
+	}
+}
+
+/*
+ * Convert the config data section from little endian to CPU byte
+ * order in place, one 32-bit word at a time.
+ */
+static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg)
+{
+	u32 *cur = (u32 *) &cfg->data;
+	u32 *end = cur + (sizeof(cfg->data) / 4);
+
+	for (; cur < end; cur++)
+		*cur = le32_to_cpu((__force __le32) *cur);
+}
+
+/*
+ * Convert the config data section from CPU byte order to little
+ * endian in place, one 32-bit word at a time.
+ */
+static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg)
+{
+	u32 *cur = (u32 *) &cfg->data;
+	u32 *end = cur + (sizeof(cfg->data) / 4);
+
+	for (; cur < end; cur++)
+		*cur = (__force u32) cpu_to_le32(*cur);
+}
+
+
+/*----------------- Config Operations ------------------*/
+/*
+ * Write the in-memory card configuration to the card.
+ *
+ * Works on a private copy so card->config stays in CPU byte order.
+ * Returns -EINVAL when the header version is not RSXX_CFG_VERSION,
+ * otherwise whatever rsxx_creg_write() returns.
+ */
+static int rsxx_save_config(struct rsxx_cardinfo *card)
+{
+	struct rsxx_card_cfg image;
+
+	memcpy(&image, &card->config, sizeof(image));
+
+	if (unlikely(image.hdr.version != RSXX_CFG_VERSION)) {
+		dev_err(CARD_TO_DEV(card),
+			"Cannot save config with invalid version %d\n",
+			image.hdr.version);
+		return -EINVAL;
+	}
+
+	/* The CRC is always computed over the little endian form. */
+	config_data_cpu_to_le(&image);
+	image.hdr.crc = config_data_crc32(&image);
+
+	/* Storage format is big endian: swap the data, then the header. */
+	config_data_swab(&image);
+	config_hdr_cpu_to_be(&image.hdr);
+
+	return rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(image), &image, 1);
+}
+
+/*
+ * Read the card configuration into card->config and validate it.
+ *
+ * Three cases are handled, keyed on the header version:
+ *  - RSXX_CFG_VERSION: verify the CRC and convert the data to CPU
+ *    byte order.
+ *  - 0: no config is stored yet; write the defaults into card->config
+ *    and save them to the card.
+ *  - anything else: unsupported version.
+ *
+ * On success card->config_valid is set to 1.
+ *
+ * Returns 0 on success, -EIO on a CRC mismatch, -EINVAL on an
+ * unsupported version, or the error from the creg read/write.
+ */
+int rsxx_load_config(struct rsxx_cardinfo *card)
+{
+	int st;
+	u32 crc;
+
+	st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config),
+				&card->config, 1);
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"Failed reading card config.\n");
+		return st;
+	}
+
+	config_hdr_be_to_cpu(&card->config.hdr);
+
+	if (card->config.hdr.version == RSXX_CFG_VERSION) {
+		/*
+		 * We calculate the CRC with the data in little endian, because
+		 * early drivers did not take big endian CPUs into account.
+		 * The data is always stored in big endian, so we need to byte
+		 * swap it before calculating the CRC.
+		 */
+
+		config_data_swab(&card->config);
+
+		/* Check the CRC */
+		crc = config_data_crc32(&card->config);
+		if (crc != card->config.hdr.crc) {
+			dev_err(CARD_TO_DEV(card),
+				"Config corruption detected!\n");
+			dev_info(CARD_TO_DEV(card),
+				"CRC (sb x%08x is x%08x)\n",
+				card->config.hdr.crc, crc);
+			return -EIO;
+		}
+
+		/* Convert the data to CPU byteorder */
+		config_data_le_to_cpu(&card->config);
+
+	} else if (card->config.hdr.version != 0) {
+		dev_err(CARD_TO_DEV(card),
+			"Invalid config version %d.\n",
+			card->config.hdr.version);
+		/*
+		 * Config version changes require special handling from the
+		 * user
+		 */
+		return -EINVAL;
+	} else {
+		dev_info(CARD_TO_DEV(card),
+			"Initializing card configuration.\n");
+		/*
+		 * initialize_config() writes through its argument as a
+		 * struct rsxx_card_cfg, so it must be handed the embedded
+		 * config, not the card itself.
+		 */
+		initialize_config(&card->config);
+		st = rsxx_save_config(card);
+		if (st)
+			return st;
+	}
+
+	card->config_valid = 1;
+
+	dev_dbg(CARD_TO_DEV(card), "version:     x%08x\n",
+		card->config.hdr.version);
+	dev_dbg(CARD_TO_DEV(card), "crc:         x%08x\n",
+		card->config.hdr.crc);
+	dev_dbg(CARD_TO_DEV(card), "block_size:  x%08x\n",
+		card->config.data.block_size);
+	dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n",
+		card->config.data.stripe_size);
+	dev_dbg(CARD_TO_DEV(card), "vendor_id:   x%08x\n",
+		card->config.data.vendor_id);
+	dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n",
+		card->config.data.cache_order);
+	dev_dbg(CARD_TO_DEV(card), "mode:        x%08x\n",
+		card->config.data.intr_coal.mode);
+	dev_dbg(CARD_TO_DEV(card), "count:       x%08x\n",
+		card->config.data.intr_coal.count);
+	dev_dbg(CARD_TO_DEV(card), "latency:     x%08x\n",
+		card->config.data.intr_coal.latency);
+
+	return 0;
+}
+
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
new file mode 100644
index 000000000000..e5162487686a
--- /dev/null
+++ b/drivers/block/rsxx/core.c
@@ -0,0 +1,649 @@
+/*
+* Filename: core.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+
+#include <linux/genhd.h>
+#include <linux/idr.h>
+
+#include "rsxx_priv.h"
+#include "rsxx_cfg.h"
+
+/* Default for the force_legacy module parameter: do not force legacy IRQs. */
+#define NO_LEGACY 0
+
+MODULE_DESCRIPTION("IBM RamSan PCIe Flash SSD Device Driver");
+MODULE_AUTHOR("IBM <support@ramsan.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+/* When non-zero, probe skips pci_enable_msi(). Read-only (0444) in sysfs. */
+static unsigned int force_legacy = NO_LEGACY;
+module_param(force_legacy, uint, 0444);
+MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
+
+/* Allocator for card->disk_id; accesses are serialized by rsxx_ida_lock. */
+static DEFINE_IDA(rsxx_disk_ida);
+static DEFINE_SPINLOCK(rsxx_ida_lock);
+
+/*----------------- Interrupt Control & Handling -------------------*/
+/* Set the bits of @intr in the interrupt mask pointed to by @mask. */
+static void __enable_intr(unsigned int *mask, unsigned int intr)
+{
+	*mask = *mask | intr;
+}
+
+/* Clear the bits of @intr in the interrupt mask pointed to by @mask. */
+static void __disable_intr(unsigned int *mask, unsigned int intr)
+{
+	*mask = *mask & ~intr;
+}
+
+/*
+ * NOTE: Disabling the IER will disable the hardware interrupt.
+ * Disabling the ISR will disable the software handling of the ISR bit.
+ *
+ * Enable/Disable interrupt functions assume the card->irq_lock
+ * is held by the caller.
+ */
+/*
+ * Add @intr to the card's IER mask and write the mask to the IER
+ * register. Nothing is touched once card->halt is set.
+ */
+void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr)
+{
+	if (unlikely(card->halt))
+		return;
+
+	card->ier_mask |= intr;
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+/*
+ * Remove @intr from the card's IER mask and write the mask to the IER
+ * register.
+ */
+void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr)
+{
+	card->ier_mask &= ~intr;
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+/*
+ * Add @intr to both the ISR and IER masks, then write the IER mask to
+ * the IER register. Nothing is touched once card->halt is set.
+ */
+void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card,
+				 unsigned int intr)
+{
+	if (unlikely(card->halt))
+		return;
+
+	card->isr_mask |= intr;
+	card->ier_mask |= intr;
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+/*
+ * Remove @intr from both the ISR and IER masks, then write the IER
+ * mask to the IER register.
+ */
+void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card,
+				  unsigned int intr)
+{
+	card->isr_mask &= ~intr;
+	card->ier_mask &= ~intr;
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+/*
+ * Interrupt handler for the card.
+ *
+ * Reads the ISR register under card->irq_lock, masks it with
+ * card->isr_mask and hands each pending source to a work item. The
+ * register is read again whenever a DMA interrupt was masked off in
+ * the IER during this pass.
+ *
+ * Returns IRQ_HANDLED if at least one source was dispatched,
+ * IRQ_NONE otherwise.
+ */
+static irqreturn_t rsxx_isr(int irq, void *pdata)
+{
+ struct rsxx_cardinfo *card = pdata;
+ unsigned int isr;
+ int handled = 0;
+ int reread_isr;
+ int i;
+
+ spin_lock(&card->irq_lock);
+
+ do {
+ reread_isr = 0;
+
+ isr = ioread32(card->regmap + ISR);
+ if (isr == 0xffffffff) {
+ /*
+ * A few systems seem to have an intermittent issue
+ * where PCI reads return all Fs, but retrying the read
+ * a little later will return as expected.
+ */
+ dev_info(CARD_TO_DEV(card),
+ "ISR = 0xFFFFFFFF, retrying later\n");
+ break;
+ }
+
+ /* Ignore sources whose software handling is disabled. */
+ isr &= card->isr_mask;
+ if (!isr)
+ break;
+
+ /* One DMA interrupt bit per target. */
+ for (i = 0; i < card->n_targets; i++) {
+ if (isr & CR_INTR_DMA(i)) {
+ /* Mask the source in the IER, then re-read the ISR. */
+ if (card->ier_mask & CR_INTR_DMA(i)) {
+ rsxx_disable_ier(card, CR_INTR_DMA(i));
+ reread_isr = 1;
+ }
+ queue_work(card->ctrl[i].done_wq,
+ &card->ctrl[i].dma_done_work);
+ handled++;
+ }
+ }
+
+ if (isr & CR_INTR_CREG) {
+ schedule_work(&card->creg_ctrl.done_work);
+ handled++;
+ }
+
+ /* card_event_handler() re-enables the event interrupt. */
+ if (isr & CR_INTR_EVENT) {
+ schedule_work(&card->event_work);
+ rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
+ handled++;
+ }
+ } while (reread_isr);
+
+ spin_unlock(&card->irq_lock);
+
+ return handled ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/*----------------- Card Event Handler -------------------*/
+/*
+ * Map a card state value to a printable name.
+ *
+ * The table is indexed by ffs(state), i.e. by the 1-based position of
+ * the lowest set bit; 0 (no bit set) maps to "Unknown". Any value
+ * whose lowest set bit lies beyond the table is also reported as
+ * "Unknown" instead of indexing past the end of the array.
+ */
+static char *rsxx_card_state_to_str(unsigned int state)
+{
+	static char *state_strings[] = {
+		"Unknown", "Shutdown", "Starting", "Formatting",
+		"Uninitialized", "Good", "Shutting Down",
+		"Fault", "Read Only Fault", "dStroying"
+	};
+	unsigned int idx = ffs(state);
+
+	if (idx >= ARRAY_SIZE(state_strings))
+		idx = 0;
+
+	return state_strings[idx];
+}
+
+/*
+ * Record a new card state and adjust the disk capacity to match.
+ *
+ * Logs the transition, stores @new_state in card->state and, provided
+ * the card has a valid config, either publishes the card size as the
+ * gendisk capacity (good / read-only fault) or sets the capacity to 0
+ * (every other listed state).
+ */
+static void card_state_change(struct rsxx_cardinfo *card,
+ unsigned int new_state)
+{
+ int st;
+
+ dev_info(CARD_TO_DEV(card),
+ "card state change detected.(%s -> %s)\n",
+ rsxx_card_state_to_str(card->state),
+ rsxx_card_state_to_str(new_state));
+
+ card->state = new_state;
+
+ /* Don't attach DMA interfaces if the card has an invalid config */
+ if (!card->config_valid)
+ return;
+
+ switch (new_state) {
+ case CARD_STATE_RD_ONLY_FAULT:
+ dev_crit(CARD_TO_DEV(card),
+ "Hardware has entered read-only mode!\n");
+ /*
+ * Fall through so the DMA devices can be attached and
+ * the user can attempt to pull off their data.
+ */
+ case CARD_STATE_GOOD:
+ /*
+ * NOTE(review): a failed size read is only logged; the
+ * capacity below is still set from whatever card->size8
+ * holds at that point.
+ */
+ st = rsxx_get_card_size8(card, &card->size8);
+ if (st)
+ dev_err(CARD_TO_DEV(card),
+ "Failed attaching DMA devices\n");
+
+ /* size8 is shifted right by 9 to give set_capacity()'s unit. */
+ if (card->config_valid)
+ set_capacity(card->gendisk, card->size8 >> 9);
+ break;
+
+ case CARD_STATE_FAULT:
+ dev_crit(CARD_TO_DEV(card),
+ "Hardware Fault reported!\n");
+ /* Fall through. */
+
+ /* Everything else, detach DMA interface if it's attached. */
+ case CARD_STATE_SHUTDOWN:
+ case CARD_STATE_STARTING:
+ case CARD_STATE_FORMATTING:
+ case CARD_STATE_UNINITIALIZED:
+ case CARD_STATE_SHUTTING_DOWN:
+ /*
+ * dStroy is a term coined by marketing to represent the low level
+ * secure erase.
+ */
+ case CARD_STATE_DSTROYING:
+ set_capacity(card->gendisk, 0);
+ break;
+ }
+}
+
+/*
+ * Work handler for card event interrupts (queued from rsxx_isr()).
+ *
+ * Re-enables the event interrupt, reads the current card state,
+ * applies a state change if it differs from card->state, and reads
+ * the hardware log when the creg status reports one pending.
+ * Does nothing once card->halt is set.
+ */
+static void card_event_handler(struct work_struct *work)
+{
+ struct rsxx_cardinfo *card;
+ unsigned int state;
+ unsigned long flags;
+ int st;
+
+ card = container_of(work, struct rsxx_cardinfo, event_work);
+
+ if (unlikely(card->halt))
+ return;
+
+ /*
+ * Enable the interrupt now to avoid any weird race conditions where a
+ * state change might occur while rsxx_get_card_state() is
+ * processing a returned creg cmd.
+ */
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_enable_ier_and_isr(card, CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ st = rsxx_get_card_state(card, &state);
+ if (st) {
+ dev_info(CARD_TO_DEV(card),
+ "Failed reading state after event.\n");
+ return;
+ }
+
+ if (card->state != state)
+ card_state_change(card, state);
+
+ if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING)
+ rsxx_read_hw_log(card);
+}
+
+/*----------------- Card Operations -------------------*/
+/*
+ * Bring the card to CARD_STATE_SHUTDOWN.
+ *
+ * Polls the card state until it has left CARD_STATE_STARTING, issues
+ * CARD_CMD_SHUTDOWN unless the card is already shutting down or shut
+ * down, then polls until CARD_STATE_SHUTDOWN is reported. Each polling
+ * phase is bounded by a 120000 ms timeout.
+ *
+ * Returns 0 on success, -ETIMEDOUT if a phase times out, or the error
+ * from reading the state / issuing the command.
+ */
+static int card_shutdown(struct rsxx_cardinfo *card)
+{
+	unsigned int state;
+	signed long start;
+	const int timeout = msecs_to_jiffies(120000);
+	int st;
+
+	/* We can't issue a shutdown if the card is in a transition state */
+	start = jiffies;
+	for (;;) {
+		st = rsxx_get_card_state(card, &state);
+		if (st)
+			return st;
+		if (state != CARD_STATE_STARTING)
+			break;
+		if (!(jiffies - start < timeout))
+			break;
+	}
+
+	if (state == CARD_STATE_STARTING)
+		return -ETIMEDOUT;
+
+	/* Only issue a shutdown if we need to */
+	if (!(state == CARD_STATE_SHUTTING_DOWN ||
+	      state == CARD_STATE_SHUTDOWN)) {
+		st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN);
+		if (st)
+			return st;
+	}
+
+	/* Wait for the card to report that it has shut down. */
+	start = jiffies;
+	for (;;) {
+		st = rsxx_get_card_state(card, &state);
+		if (st)
+			return st;
+		if (state == CARD_STATE_SHUTDOWN)
+			break;
+		if (!(jiffies - start < timeout))
+			break;
+	}
+
+	return (state == CARD_STATE_SHUTDOWN) ? 0 : -ETIMEDOUT;
+}
+
+/*----------------- Driver Initialization & Setup -------------------*/
+/*
+ * Compare the card's PCI revision id with the newest revision this
+ * driver supports (RS70_PCI_REV_SUPPORTED).
+ *
+ * Returns:  0 if the driver is compatible with the device
+ *          -1 if the driver is NOT compatible with the device
+ */
+static int rsxx_compatibility_check(struct rsxx_cardinfo *card)
+{
+	unsigned char revision;
+
+	pci_read_config_byte(card->dev, PCI_REVISION_ID, &revision);
+
+	return (revision > RS70_PCI_REV_SUPPORTED) ? -1 : 0;
+}
+
+/*
+ * Probe one PCI-Flash SSD.
+ *
+ * Allocates the card structure and a disk id, enables the PCI device,
+ * maps BAR0, installs the interrupt handler (MSI unless force_legacy
+ * is set or MSI cannot be enabled), sets up the creg interface, loads
+ * the card config, sets up the DMA engine and block device, and
+ * finally attaches the device.
+ *
+ * Failures to load the config or to read the number of DMA targets
+ * are logged but do not abort the probe.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released through the error labels below.
+ */
+static int rsxx_pci_probe(struct pci_dev *dev,
+					const struct pci_device_id *id)
+{
+	struct rsxx_cardinfo *card;
+	int st;
+
+	dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
+
+	card = kzalloc(sizeof(*card), GFP_KERNEL);
+	if (!card)
+		return -ENOMEM;
+
+	card->dev = dev;
+	pci_set_drvdata(dev, card);
+
+	/* Reserve a disk id; retry for as long as ida_get_new() says -EAGAIN. */
+	do {
+		if (!ida_pre_get(&rsxx_disk_ida, GFP_KERNEL)) {
+			st = -ENOMEM;
+			goto failed_ida_get;
+		}
+
+		spin_lock(&rsxx_ida_lock);
+		st = ida_get_new(&rsxx_disk_ida, &card->disk_id);
+		spin_unlock(&rsxx_ida_lock);
+	} while (st == -EAGAIN);
+
+	if (st)
+		goto failed_ida_get;
+
+	st = pci_enable_device(dev);
+	if (st)
+		goto failed_enable;
+
+	pci_set_master(dev);
+	pci_set_dma_max_seg_size(dev, RSXX_HW_BLK_SIZE);
+
+	st = pci_set_dma_mask(dev, DMA_BIT_MASK(64));
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"No usable DMA configuration,aborting\n");
+		goto failed_dma_mask;
+	}
+
+	st = pci_request_regions(dev, DRIVER_NAME);
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"Failed to request memory region\n");
+		goto failed_request_regions;
+	}
+
+	if (pci_resource_len(dev, 0) == 0) {
+		dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n");
+		st = -ENOMEM;
+		goto failed_iomap;
+	}
+
+	card->regmap = pci_iomap(dev, 0, 0);
+	if (!card->regmap) {
+		dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n");
+		st = -ENOMEM;
+		goto failed_iomap;
+	}
+
+	spin_lock_init(&card->irq_lock);
+	card->halt = 0;
+
+	/* Start with every interrupt source masked. */
+	spin_lock_irq(&card->irq_lock);
+	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+	spin_unlock_irq(&card->irq_lock);
+
+	/* MSI is best effort: on failure the handler uses dev->irq as is. */
+	if (!force_legacy) {
+		st = pci_enable_msi(dev);
+		if (st)
+			dev_warn(CARD_TO_DEV(card),
+				"Failed to enable MSI\n");
+	}
+
+	st = request_irq(dev->irq, rsxx_isr, IRQF_DISABLED | IRQF_SHARED,
+			 DRIVER_NAME, card);
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"Failed requesting IRQ%d\n", dev->irq);
+		goto failed_irq;
+	}
+
+	/************* Setup Processor Command Interface *************/
+	rsxx_creg_setup(card);
+
+	spin_lock_irq(&card->irq_lock);
+	rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
+	spin_unlock_irq(&card->irq_lock);
+
+	st = rsxx_compatibility_check(card);
+	if (st) {
+		dev_warn(CARD_TO_DEV(card),
+			"Incompatible driver detected. Please update the driver.\n");
+		st = -EINVAL;
+		goto failed_compatibility_check;
+	}
+
+	/************* Load Card Config *************/
+	st = rsxx_load_config(card);
+	if (st)
+		dev_err(CARD_TO_DEV(card),
+			"Failed loading card config\n");
+
+	/************* Setup DMA Engine *************/
+	st = rsxx_get_num_targets(card, &card->n_targets);
+	if (st)
+		dev_info(CARD_TO_DEV(card),
+			"Failed reading the number of DMA targets\n");
+
+	/* kcalloc() guards the count * size multiplication against overflow. */
+	card->ctrl = kcalloc(card->n_targets, sizeof(*card->ctrl), GFP_KERNEL);
+	if (!card->ctrl) {
+		st = -ENOMEM;
+		goto failed_dma_setup;
+	}
+
+	/*
+	 * NOTE(review): card->ctrl is not freed on the error paths below;
+	 * confirm whether rsxx_dma_setup()/rsxx_dma_destroy() own it.
+	 */
+	st = rsxx_dma_setup(card);
+	if (st) {
+		dev_info(CARD_TO_DEV(card),
+			"Failed to setup DMA engine\n");
+		goto failed_dma_setup;
+	}
+
+	/************* Setup Card Event Handler *************/
+	INIT_WORK(&card->event_work, card_event_handler);
+
+	st = rsxx_setup_dev(card);
+	if (st)
+		goto failed_create_dev;
+
+	rsxx_get_card_state(card, &card->state);
+
+	dev_info(CARD_TO_DEV(card),
+		"card state: %s\n",
+		rsxx_card_state_to_str(card->state));
+
+	/*
+	 * Now that the DMA Engine and devices have been setup,
+	 * we can enable the event interrupt(it kicks off actions in
+	 * those layers so we couldn't enable it right away.)
+	 */
+	spin_lock_irq(&card->irq_lock);
+	rsxx_enable_ier_and_isr(card, CR_INTR_EVENT);
+	spin_unlock_irq(&card->irq_lock);
+
+	if (card->state == CARD_STATE_SHUTDOWN) {
+		st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP);
+		if (st)
+			dev_crit(CARD_TO_DEV(card),
+				"Failed issuing card startup\n");
+	} else if (card->state == CARD_STATE_GOOD ||
+		   card->state == CARD_STATE_RD_ONLY_FAULT) {
+		st = rsxx_get_card_size8(card, &card->size8);
+		if (st)
+			card->size8 = 0;
+	}
+
+	rsxx_attach_dev(card);
+
+	return 0;
+
+failed_create_dev:
+	rsxx_dma_destroy(card);
+failed_dma_setup:
+failed_compatibility_check:
+	spin_lock_irq(&card->irq_lock);
+	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+	spin_unlock_irq(&card->irq_lock);
+	free_irq(dev->irq, card);
+failed_irq:
+	/*
+	 * MSI may already be enabled when request_irq() fails, so it has
+	 * to be torn down on this path too, not only after free_irq().
+	 */
+	if (!force_legacy)
+		pci_disable_msi(dev);
+	pci_iounmap(dev, card->regmap);
+failed_iomap:
+	pci_release_regions(dev);
+failed_request_regions:
+failed_dma_mask:
+	pci_disable_device(dev);
+failed_enable:
+	spin_lock(&rsxx_ida_lock);
+	ida_remove(&rsxx_disk_ida, card->disk_id);
+	spin_unlock(&rsxx_ida_lock);
+failed_ida_get:
+	kfree(card);
+
+	return st;
+}
+
+/*
+ * Tear down one card when the PCI device is removed.
+ *
+ * Detaches the block device, masks the DMA interrupts, shuts the card
+ * down, stops the event handler, destroys the block device and DMA
+ * engine, releases the IRQ/MSI, creg interface and PCI resources, and
+ * finally gives the disk id back to the IDA and frees the card.
+ */
+static void rsxx_pci_remove(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	unsigned long flags;
+	int st;
+	int i;
+
+	if (!card)
+		return;
+
+	dev_info(CARD_TO_DEV(card),
+		"Removing PCI-Flash SSD.\n");
+
+	rsxx_detach_dev(card);
+
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_irqsave(&card->irq_lock, flags);
+		rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i));
+		spin_unlock_irqrestore(&card->irq_lock, flags);
+	}
+
+	st = card_shutdown(card);
+	if (st)
+		dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n");
+
+	/* Sync outstanding event handlers. */
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	/* Prevent work_structs from re-queuing themselves. */
+	card->halt = 1;
+
+	cancel_work_sync(&card->event_work);
+
+	rsxx_destroy_dev(card);
+	rsxx_dma_destroy(card);
+
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+	free_irq(dev->irq, card);
+
+	if (!force_legacy)
+		pci_disable_msi(dev);
+
+	rsxx_creg_destroy(card);
+
+	pci_iounmap(dev, card->regmap);
+
+	pci_disable_device(dev);
+	pci_release_regions(dev);
+
+	/*
+	 * Give back the disk id reserved in rsxx_pci_probe(); without this
+	 * every remove leaks one id from rsxx_disk_ida.
+	 */
+	spin_lock(&rsxx_ida_lock);
+	ida_remove(&rsxx_disk_ida, card->disk_id);
+	spin_unlock(&rsxx_ida_lock);
+
+	kfree(card);
+}
+
+/* PCI suspend hook: always refuses with -ENOSYS. */
+static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state)
+{
+ /* We don't support suspend at this time. */
+ return -ENOSYS;
+}
+
+/*
+ * PCI shutdown hook: detach the block device, mask every DMA
+ * interrupt and shut the card down. The result of card_shutdown() is
+ * not checked.
+ */
+static void rsxx_pci_shutdown(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	unsigned long irq_flags;
+	int target;
+
+	if (card == NULL)
+		return;
+
+	dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n");
+
+	rsxx_detach_dev(card);
+
+	target = 0;
+	while (target < card->n_targets) {
+		spin_lock_irqsave(&card->irq_lock, irq_flags);
+		rsxx_disable_ier_and_isr(card, CR_INTR_DMA(target));
+		spin_unlock_irqrestore(&card->irq_lock, irq_flags);
+		target++;
+	}
+
+	card_shutdown(card);
+}
+
+/* PCI vendor/device ids this driver binds to; the zero entry ends the table. */
+static DEFINE_PCI_DEVICE_TABLE(rsxx_pci_ids) = {
+ {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS70_FLASH)},
+ {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS70D_FLASH)},
+ {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS80_FLASH)},
+ {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS81_FLASH)},
+ {0,},
+};
+
+MODULE_DEVICE_TABLE(pci, rsxx_pci_ids);
+
+/* PCI driver registration: ties the id table to the callbacks in this file. */
+static struct pci_driver rsxx_pci_driver = {
+ .name = DRIVER_NAME,
+ .id_table = rsxx_pci_ids,
+ .probe = rsxx_pci_probe,
+ .remove = rsxx_pci_remove,
+ .suspend = rsxx_pci_suspend,
+ .shutdown = rsxx_pci_shutdown,
+};
+
+/*
+ * Module init: bring up the dev, dma and creg layers in that order,
+ * then register the PCI driver.
+ *
+ * Returns 0 on success. On any failure the layers already initialised
+ * are cleaned up in reverse order and the error is returned; this
+ * includes a failing pci_register_driver().
+ */
+static int __init rsxx_core_init(void)
+{
+	int st;
+
+	st = rsxx_dev_init();
+	if (st)
+		return st;
+
+	st = rsxx_dma_init();
+	if (st)
+		goto dma_init_failed;
+
+	st = rsxx_creg_init();
+	if (st)
+		goto creg_init_failed;
+
+	st = pci_register_driver(&rsxx_pci_driver);
+	if (st)
+		goto pci_register_failed;
+
+	return 0;
+
+pci_register_failed:
+	rsxx_creg_cleanup();
+creg_init_failed:
+	rsxx_dma_cleanup();
+dma_init_failed:
+	rsxx_dev_cleanup();
+
+	return st;
+}
+
+/*
+ * Module exit: unregister the PCI driver, then clean up the creg, dma
+ * and dev layers (the reverse of the order used in rsxx_core_init()).
+ */
+static void __exit rsxx_core_cleanup(void)
+{
+ pci_unregister_driver(&rsxx_pci_driver);
+ rsxx_creg_cleanup();
+ rsxx_dma_cleanup();
+ rsxx_dev_cleanup();
+}
+
+module_init(rsxx_core_init);
+module_exit(rsxx_core_cleanup);
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c
new file mode 100644
index 000000000000..80bbe639fccd
--- /dev/null
+++ b/drivers/block/rsxx/cregs.c
@@ -0,0 +1,758 @@
+/*
+* Filename: cregs.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/completion.h>
+#include <linux/slab.h>
+
+#include "rsxx_priv.h"
+
+/* Per-command timeout, in milliseconds, armed when a command is issued. */
+#define CREG_TIMEOUT_MSEC 10000
+
+/*
+ * Completion callback for a creg command. 'st' is 0 on success or a negative
+ * errno; this file passes -EIO, -ETIMEDOUT and -ECANCELED.
+ */
+typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card,
+ struct creg_cmd *cmd,
+ int st);
+
+/* One queued or in-flight creg (control register) command. */
+struct creg_cmd {
+ struct list_head list; /* Link in card->creg_ctrl.queue */
+ creg_cmd_cb cb; /* Completion callback, may be NULL */
+ void *cb_private; /* Opaque data for the callback */
+ unsigned int op; /* Value written to CREG_CMD */
+ unsigned int addr; /* Value written to CREG_ADD */
+ int cnt8; /* Transfer size in bytes (CREG_CNT) */
+ void *buf; /* Data source (write) or sink (read) */
+ unsigned int stream; /* Non-zero: byte-stream data, see copy helpers */
+ unsigned int status; /* CREG_STAT value captured on completion */
+};
+
+/* Slab cache that all struct creg_cmd objects are allocated from. */
+static struct kmem_cache *creg_cmd_pool;
+
+
+/*------------ Private Functions --------------*/
+
+/*
+ * LITTLE_ENDIAN is 1 on little endian builds and 0 on big endian builds.
+ * The build is aborted when neither byte-order macro is defined.
+ */
+#if defined(__LITTLE_ENDIAN)
+#define LITTLE_ENDIAN 1
+#elif defined(__BIG_ENDIAN)
+#define LITTLE_ENDIAN 0
+#else
+#error Unknown endianess!!! Aborting...
+#endif
+
+/*
+ * Write cnt8 bytes from buf into the CREG_DATA registers, one 32-bit word
+ * per register. A trailing partial word is still written as a full word.
+ */
+static void copy_to_creg_data(struct rsxx_cardinfo *card,
+ int cnt8,
+ void *buf,
+ unsigned int stream)
+{
+ const u32 *words = buf;
+ /*
+ * Firmware implementation makes it necessary to byte swap stream
+ * data on little endian processors.
+ */
+ int swap = LITTLE_ENDIAN && stream;
+ int remaining = cnt8;
+ int idx = 0;
+
+ while (remaining > 0) {
+ if (swap)
+ iowrite32be(words[idx], card->regmap + CREG_DATA(idx));
+ else
+ iowrite32(words[idx], card->regmap + CREG_DATA(idx));
+
+ idx++;
+ remaining -= 4;
+ }
+}
+
+
+/*
+ * Read cnt8 bytes from the CREG_DATA registers into buf, one 32-bit word
+ * per register. A trailing partial word is still stored as a full word.
+ */
+static void copy_from_creg_data(struct rsxx_cardinfo *card,
+ int cnt8,
+ void *buf,
+ unsigned int stream)
+{
+ u32 *words = buf;
+ /*
+ * Firmware implementation makes it necessary to byte swap stream
+ * data on little endian processors.
+ */
+ int swap = LITTLE_ENDIAN && stream;
+ int remaining = cnt8;
+ int idx = 0;
+
+ while (remaining > 0) {
+ if (swap)
+ words[idx] = ioread32be(card->regmap + CREG_DATA(idx));
+ else
+ words[idx] = ioread32(card->regmap + CREG_DATA(idx));
+
+ idx++;
+ remaining -= 4;
+ }
+}
+
+/*
+ * Atomically take ownership of the active command: returns the current
+ * active_cmd (possibly NULL) and clears the slot, so that only one of the
+ * racing completion paths gets to finish the command.
+ */
+static struct creg_cmd *pop_active_cmd(struct rsxx_cardinfo *card)
+{
+ struct creg_cmd *cmd;
+
+ /*
+ * Spin lock is needed because this can be called in atomic/interrupt
+ * context.
+ */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ return cmd;
+}
+
+/*
+ * Program one command into the hardware: address and byte count first, then
+ * the payload for writes, and the opcode last.
+ */
+static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd)
+{
+ iowrite32(cmd->addr, card->regmap + CREG_ADD);
+ iowrite32(cmd->cnt8, card->regmap + CREG_CNT);
+
+ /* Only writes carry data towards the card; a NULL buffer sends none. */
+ if (cmd->op == CREG_OP_WRITE) {
+ if (cmd->buf)
+ copy_to_creg_data(card, cmd->cnt8,
+ cmd->buf, cmd->stream);
+ }
+
+ /*
+ * Data copy must complete before initiating the command. This is
+ * needed for weakly ordered processors (i.e. PowerPC), so that all
+ * necessary registers are written before we kick the hardware.
+ */
+ wmb();
+
+ /* Setting the valid bit will kick off the command. */
+ iowrite32(cmd->op, card->regmap + CREG_CMD);
+}
+
+/*
+ * Start the next queued command unless one is already active or the queue
+ * is empty. Every caller in this file holds creg_ctrl.lock around the call.
+ */
+static void creg_kick_queue(struct rsxx_cardinfo *card)
+{
+ if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue))
+ return;
+
+ /* Move the head of the queue into the active slot. */
+ card->creg_ctrl.active = 1;
+ card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue,
+ struct creg_cmd, list);
+ list_del(&card->creg_ctrl.active_cmd->list);
+ card->creg_ctrl.q_depth--;
+
+ /*
+ * We have to set the timer before we push the new command. Otherwise,
+ * we could create a race condition that would occur if the timer
+ * was not canceled, and expired after the new command was pushed,
+ * but before the command was issued to hardware.
+ */
+ mod_timer(&card->creg_ctrl.cmd_timer,
+ jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC));
+
+ creg_issue_cmd(card, card->creg_ctrl.active_cmd);
+}
+
+/*
+ * Allocate a creg command, append it to the card's queue and start it right
+ * away if the interface is idle. The callback, if any, is invoked once when
+ * the command completes, times out or is canceled.
+ *
+ * May sleep (GFP_KERNEL allocation), so it must be called from process
+ * context.
+ *
+ * Returns 0 when the command was queued, -EINVAL if the card is halted or
+ * cnt8 exceeds MAX_CREG_DATA8, -EAGAIN while a creg reset is in progress and
+ * -ENOMEM if the command could not be allocated.
+ */
+static int creg_queue_cmd(struct rsxx_cardinfo *card,
+ unsigned int op,
+ unsigned int addr,
+ unsigned int cnt8,
+ void *buf,
+ int stream,
+ creg_cmd_cb callback,
+ void *cb_private)
+{
+ struct creg_cmd *cmd;
+
+ /* Don't queue stuff up if we're halted. */
+ if (unlikely(card->halt))
+ return -EINVAL;
+
+ if (card->creg_ctrl.reset)
+ return -EAGAIN;
+
+ if (cnt8 > MAX_CREG_DATA8)
+ return -EINVAL;
+
+ cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&cmd->list);
+
+ cmd->op = op;
+ cmd->addr = addr;
+ cmd->cnt8 = cnt8;
+ cmd->buf = buf;
+ cmd->stream = stream;
+ cmd->cb = callback;
+ cmd->cb_private = cb_private;
+ cmd->status = 0;
+
+ /*
+ * The command timeout timer takes this lock from softirq context.
+ * Bottom halves must be disabled while it is held here, otherwise
+ * the timer firing on this CPU would spin on the lock forever.
+ */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_add_tail(&cmd->list, &card->creg_ctrl.queue);
+ card->creg_ctrl.q_depth++;
+ creg_kick_queue(card);
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ return 0;
+}
+
+/*
+ * Timer callback for cmd_timer: fails the active command with -ETIMEDOUT,
+ * frees it and starts the next queued command. 'data' is the card pointer
+ * registered with setup_timer().
+ */
+static void creg_cmd_timed_out(unsigned long data)
+{
+ struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data;
+ struct creg_cmd *cmd;
+
+ cmd = pop_active_cmd(card);
+ if (cmd == NULL) {
+ /* Another path already claimed the command; only count it. */
+ card->creg_ctrl.creg_stats.creg_timeout++;
+ dev_warn(CARD_TO_DEV(card),
+ "No active command associated with timeout!\n");
+ return;
+ }
+
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ETIMEDOUT);
+
+ kmem_cache_free(creg_cmd_pool, cmd);
+
+
+ /* Mark the interface idle and issue the next command, if any. */
+ spin_lock(&card->creg_ctrl.lock);
+ card->creg_ctrl.active = 0;
+ creg_kick_queue(card);
+ spin_unlock(&card->creg_ctrl.lock);
+}
+
+
+/*
+ * Work handler (creg_ctrl.done_work) that finishes the active creg command:
+ * stops the timeout timer, reads the hardware status, copies read data back
+ * to the caller's buffer, runs the completion callback and starts the next
+ * queued command.
+ */
+static void creg_cmd_done(struct work_struct *work)
+{
+ struct rsxx_cardinfo *card;
+ struct creg_cmd *cmd;
+ int st = 0;
+
+ card = container_of(work, struct rsxx_cardinfo,
+ creg_ctrl.done_work);
+
+ /*
+ * The timer could not be cancelled for some reason,
+ * race to pop the active command.
+ */
+ if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0)
+ card->creg_ctrl.creg_stats.failed_cancel_timer++;
+
+ cmd = pop_active_cmd(card);
+ if (cmd == NULL) {
+ dev_err(CARD_TO_DEV(card),
+ "Spurious creg interrupt!\n");
+ return;
+ }
+
+ card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT);
+ cmd->status = card->creg_ctrl.creg_stats.stat;
+ if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) {
+ dev_err(CARD_TO_DEV(card),
+ "Invalid status on creg command\n");
+ /*
+ * At this point we're probably reading garbage from HW. Don't
+ * do anything else that could mess up the system and let
+ * the sync function return an error.
+ */
+ st = -EIO;
+ goto creg_done;
+ } else if (cmd->status & CREG_STAT_ERROR) {
+ st = -EIO;
+ }
+
+ if (cmd->op == CREG_OP_READ) {
+ unsigned int cnt8 = ioread32(card->regmap + CREG_CNT);
+
+ /* Paranoid Sanity Checks */
+ if (!cmd->buf) {
+ dev_err(CARD_TO_DEV(card),
+ "Buffer not given for read.\n");
+ st = -EIO;
+ goto creg_done;
+ }
+ if (cnt8 != cmd->cnt8) {
+ dev_err(CARD_TO_DEV(card),
+ "count mismatch\n");
+ st = -EIO;
+ goto creg_done;
+ }
+
+ copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream);
+ }
+
+creg_done:
+ if (cmd->cb)
+ cmd->cb(card, cmd, st);
+
+ kmem_cache_free(creg_cmd_pool, cmd);
+
+ /*
+ * The next command arms the timeout timer, whose handler takes this
+ * lock from softirq context. Bottom halves must be disabled while the
+ * lock is held in this work handler to avoid a same-CPU deadlock.
+ */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ card->creg_ctrl.active = 0;
+ creg_kick_queue(card);
+ spin_unlock_bh(&card->creg_ctrl.lock);
+}
+
+/*
+ * Recover the creg interface: block new submissions, mask the creg and event
+ * interrupts, cancel every queued command and the active one with -ECANCELED,
+ * then re-enable the interrupts. Returns immediately if another reset is
+ * already running.
+ */
+static void creg_reset(struct rsxx_cardinfo *card)
+{
+ struct creg_cmd *cmd = NULL;
+ struct creg_cmd *tmp;
+ unsigned long flags;
+
+ /*
+ * mutex_trylock is used here because if reset_lock is taken then a
+ * reset is already happening. So, we can just go ahead and return.
+ */
+ if (!mutex_trylock(&card->creg_ctrl.reset_lock))
+ return;
+
+ /* creg_queue_cmd() refuses new commands while this flag is set. */
+ card->creg_ctrl.reset = 1;
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ dev_warn(CARD_TO_DEV(card),
+ "Resetting creg interface for recovery\n");
+
+ /*
+ * Cancel outstanding commands. The timeout timer takes this lock
+ * from softirq context, so bottom halves are disabled while it is
+ * held to keep the timer from spinning on it on this CPU.
+ */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) {
+ list_del(&cmd->list);
+ card->creg_ctrl.q_depth--;
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ kmem_cache_free(creg_cmd_pool, cmd);
+ }
+
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ if (cmd) {
+ /*
+ * NOTE(review): del_timer_sync() waits for a running timer
+ * handler, and that handler takes creg_ctrl.lock, which is
+ * held here - confirm this cannot deadlock across CPUs.
+ */
+ if (timer_pending(&card->creg_ctrl.cmd_timer))
+ del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ kmem_cache_free(creg_cmd_pool, cmd);
+
+ card->creg_ctrl.active = 0;
+ }
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ card->creg_ctrl.reset = 0;
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ mutex_unlock(&card->creg_ctrl.reset_lock);
+}
+
+/*
+ * Used for synchronous accesses: passed as cb_private so that the completion
+ * callback can hand the result back to the waiting caller.
+ */
+struct creg_completion {
+ struct completion *cmd_done; /* Signalled when the command finishes */
+ int st; /* Status passed to the callback */
+ u32 creg_status; /* Copy of the command's hardware status */
+};
+
+/*
+ * Completion callback for synchronous commands: stores the status and the
+ * hardware status in the caller's creg_completion and wakes the waiter.
+ */
+static void creg_cmd_done_cb(struct rsxx_cardinfo *card,
+ struct creg_cmd *cmd,
+ int st)
+{
+ struct creg_completion *cmd_completion;
+
+ cmd_completion = cmd->cb_private;
+ /* Synchronous commands are always queued with a completion record. */
+ BUG_ON(!cmd_completion);
+
+ cmd_completion->st = st;
+ cmd_completion->creg_status = cmd->status;
+ complete(cmd_completion->cmd_done);
+}
+
+/*
+ * Queue a single creg command and sleep until it completes.
+ *
+ * Returns 0 on success, the error from creg_queue_cmd() if the command could
+ * not be queued, -EIO if the wait itself times out (the creg interface is
+ * reset in that case) or the status reported by the completion callback.
+ * *hw_stat receives the hardware status only when the command completed.
+ */
+static int __issue_creg_rw(struct rsxx_cardinfo *card,
+ unsigned int op,
+ unsigned int addr,
+ unsigned int cnt8,
+ void *buf,
+ int stream,
+ unsigned int *hw_stat)
+{
+ DECLARE_COMPLETION_ONSTACK(cmd_done);
+ struct creg_completion completion;
+ unsigned long timeout;
+ int st;
+
+ completion.cmd_done = &cmd_done;
+ completion.st = 0;
+ completion.creg_status = 0;
+
+ st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb,
+ &completion);
+ if (st)
+ return st;
+
+ /*
+ * This timeout is necessary for unresponsive hardware. The additional
+ * 20 seconds is used to guarantee that each creg request has time to
+ * complete.
+ */
+ timeout = msecs_to_jiffies((CREG_TIMEOUT_MSEC *
+ card->creg_ctrl.q_depth) + 20000);
+
+ /*
+ * The creg interface is guaranteed to complete. It has a timeout
+ * mechanism that will kick in if hardware does not respond.
+ */
+ st = wait_for_completion_timeout(completion.cmd_done, timeout);
+ if (st == 0) {
+ /*
+ * This is really bad, because the kernel timer did not
+ * expire and notify us of a timeout!
+ */
+ dev_crit(CARD_TO_DEV(card),
+ "cregs timer failed\n");
+ creg_reset(card);
+ return -EIO;
+ }
+
+ *hw_stat = completion.creg_status;
+
+ if (completion.st) {
+ dev_warn(CARD_TO_DEV(card),
+ "creg command failed(%d x%08x)\n",
+ completion.st, addr);
+ return completion.st;
+ }
+
+ return 0;
+}
+
+/*
+ * Transfer size8 bytes to or from creg address 'addr', split into commands
+ * of at most MAX_CREG_DATA8 bytes each. At least one command is always
+ * issued. Returns 0 on success or the error of the first failing command.
+ */
+static int issue_creg_rw(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int stream,
+ int read)
+{
+ const unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE;
+ unsigned int remaining = size8;
+ unsigned int hw_stat;
+ char *cursor = data;
+
+ for (;;) {
+ unsigned int chunk = min_t(unsigned int, remaining,
+ MAX_CREG_DATA8);
+ int st = __issue_creg_rw(card, op, addr, chunk,
+ cursor, stream, &hw_stat);
+
+ if (st)
+ return st;
+
+ /* Advance the buffer and the card address by what was moved. */
+ cursor += chunk;
+ addr += chunk;
+ remaining -= chunk;
+
+ if (!remaining)
+ break;
+ }
+
+ return 0;
+}
+
+/* ---------------------------- Public API ---------------------------------- */
+/*
+ * Synchronously write size8 bytes from 'data' to creg address 'addr'.
+ * A non-zero byte_stream marks the data as a byte stream (see the copy
+ * helpers). Returns 0 or a negative errno.
+ */
+int rsxx_creg_write(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream)
+{
+ return issue_creg_rw(card, addr, size8, data, byte_stream, 0);
+}
+
+/*
+ * Synchronously read size8 bytes from creg address 'addr' into 'data'.
+ * A non-zero byte_stream marks the data as a byte stream (see the copy
+ * helpers). Returns 0 or a negative errno.
+ */
+int rsxx_creg_read(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream)
+{
+ return issue_creg_rw(card, addr, size8, data, byte_stream, 1);
+}
+
+/* Read the CREG_ADD_CARD_STATE register into *state. Returns 0 or -errno. */
+int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state)
+{
+ return rsxx_creg_read(card, CREG_ADD_CARD_STATE,
+ sizeof(*state), state, 0);
+}
+
+/*
+ * Read the CREG_ADD_CARD_SIZE register and store it, multiplied by
+ * RSXX_HW_BLK_SIZE, in *size8. *size8 is left untouched on failure.
+ * Returns 0 or a negative errno.
+ */
+int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8)
+{
+ unsigned int size;
+ int st;
+
+ st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE,
+ sizeof(size), &size, 0);
+ if (st)
+ return st;
+
+ /* Widen before multiplying so large cards do not overflow 32 bits. */
+ *size8 = (u64)size * RSXX_HW_BLK_SIZE;
+ return 0;
+}
+
+/* Read the CREG_ADD_NUM_TARGETS register into *n_targets. Returns 0 or -errno. */
+int rsxx_get_num_targets(struct rsxx_cardinfo *card,
+ unsigned int *n_targets)
+{
+ return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS,
+ sizeof(*n_targets), n_targets, 0);
+}
+
+/* Read the CREG_ADD_CAPABILITIES register into *capabilities. Returns 0 or -errno. */
+int rsxx_get_card_capabilities(struct rsxx_cardinfo *card,
+ u32 *capabilities)
+{
+ return rsxx_creg_read(card, CREG_ADD_CAPABILITIES,
+ sizeof(*capabilities), capabilities, 0);
+}
+
+/* Write 'cmd' to the CREG_ADD_CARD_CMD register. Returns 0 or -errno. */
+int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd)
+{
+ return rsxx_creg_write(card, CREG_ADD_CARD_CMD,
+ sizeof(cmd), &cmd, 0);
+}
+
+
+/*----------------- HW Log Functions -------------------*/
+/*
+ * Print one hardware log message of 'len' bytes through the dev_*() helper
+ * matching its log level. Unknown levels are printed with dev_info().
+ */
+static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len)
+{
+ /*
+ * Remembered between calls so continuation chunks reuse the last
+ * level. Being static, it is shared by every card in the system.
+ */
+ static char level;
+
+ /*
+ * New messages start with "<#>", where # is the log level. Messages
+ * that extend past the log buffer will use the previous level
+ */
+ if ((len > 3) && (str[0] == '<') && (str[2] == '>')) {
+ level = str[1];
+ str += 3; /* Skip past the log level. */
+ len -= 3;
+ }
+
+ switch (level) {
+ case '0':
+ dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '1':
+ dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '2':
+ dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '3':
+ dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '4':
+ dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '5':
+ dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '6':
+ dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '7':
+ dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ default:
+ dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ }
+}
+
+/*
+ * Copy bytes from src to dest until either 'count' bytes have been copied
+ * or a terminating '\0' has been copied, whichever comes first. dest is
+ * not terminated when the limit is reached first.
+ * Returns the number of bytes stored in dest, the '\0' included.
+ */
+static int substrncpy(char *dest, const char *src, int count)
+{
+ int copied = 0;
+
+ while (copied != count) {
+ char c = src[copied];
+
+ dest[copied] = c;
+ copied++;
+ if (c == '\0')
+ break;
+ }
+
+ return copied;
+}
+
+
+/*
+ * Completion callback for a hardware log read. Appends the bytes just read
+ * to card->log.buf, prints each completed message and requests another read
+ * while the hardware reports that more log data is pending.
+ */
+static void read_hw_log_done(struct rsxx_cardinfo *card,
+ struct creg_cmd *cmd,
+ int st)
+{
+ char *buf;
+ char *log_str;
+ int cnt;
+ int len;
+ int off;
+
+ buf = cmd->buf;
+ off = 0;
+
+ /* Failed getting the log message */
+ if (st)
+ return;
+
+ while (off < cmd->cnt8) {
+ /* Copy up to the next '\0' or until the log buffer is full. */
+ log_str = &card->log.buf[card->log.buf_len];
+ cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len);
+ len = substrncpy(log_str, &buf[off], cnt);
+
+ off += len;
+ card->log.buf_len += len;
+
+ /*
+ * Flush the log if we've hit the end of a message or if we've
+ * run out of buffer space.
+ */
+ if ((log_str[len - 1] == '\0') ||
+ (card->log.buf_len == LOG_BUF_SIZE8)) {
+ if (card->log.buf_len != 1) /* Don't log blank lines. */
+ hw_log_msg(card, card->log.buf,
+ card->log.buf_len);
+ card->log.buf_len = 0;
+ }
+
+ }
+
+ /* The hardware still has log text queued: fetch the next chunk. */
+ if (cmd->status & CREG_STAT_LOG_PENDING)
+ rsxx_read_hw_log(card);
+}
+
+/*
+ * Queue an asynchronous byte-stream read of the hardware log (CREG_ADD_LOG)
+ * into card->log.tmp; read_hw_log_done() processes the result.
+ * Returns 0 if the read was queued or the error from creg_queue_cmd().
+ */
+int rsxx_read_hw_log(struct rsxx_cardinfo *card)
+{
+ int st;
+
+ st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG,
+ sizeof(card->log.tmp), card->log.tmp,
+ 1, read_hw_log_done, NULL);
+ if (st)
+ dev_err(CARD_TO_DEV(card),
+ "Failed getting log text\n");
+
+ return st;
+}
+
+/*-------------- IOCTL REG Access ------------------*/
+/*
+ * Execute one ioctl register access synchronously. The hardware status is
+ * stored in cmd->stat when the command completes.
+ */
+static int issue_reg_cmd(struct rsxx_cardinfo *card,
+ struct rsxx_reg_access *cmd,
+ int read)
+{
+ unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE;
+
+ return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data,
+ cmd->stream, &cmd->stat);
+}
+
+/*
+ * ioctl backend for register reads and writes requested from user space.
+ * Copies the request in, runs it and copies the hardware status (and, for
+ * reads, cmd.cnt bytes of data) back to the user structure.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed or the
+ * count exceeds RSXX_MAX_REG_CNT, or the error of the creg command.
+ */
+int rsxx_reg_access(struct rsxx_cardinfo *card,
+ struct rsxx_reg_access __user *ucmd,
+ int read)
+{
+ struct rsxx_reg_access cmd;
+ int st;
+
+ st = copy_from_user(&cmd, ucmd, sizeof(cmd));
+ if (st)
+ return -EFAULT;
+
+ /* Reject counts above the limit before touching the hardware. */
+ if (cmd.cnt > RSXX_MAX_REG_CNT)
+ return -EFAULT;
+
+ st = issue_reg_cmd(card, &cmd, read);
+ if (st)
+ return st;
+
+ st = put_user(cmd.stat, &ucmd->stat);
+ if (st)
+ return -EFAULT;
+
+ if (read) {
+ st = copy_to_user(ucmd->data, cmd.data, cmd.cnt);
+ if (st)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*------------ Initialization & Setup --------------*/
+/*
+ * Initialize the per-card creg state: completion work item, reset mutex,
+ * command queue, queue lock and timeout timer. Always returns 0.
+ * NOTE(review): active, q_depth and reset are not set here - presumably the
+ * card structure is zero-allocated by the caller; confirm.
+ */
+int rsxx_creg_setup(struct rsxx_cardinfo *card)
+{
+ card->creg_ctrl.active_cmd = NULL;
+
+ INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
+ mutex_init(&card->creg_ctrl.reset_lock);
+ INIT_LIST_HEAD(&card->creg_ctrl.queue);
+ spin_lock_init(&card->creg_ctrl.lock);
+ setup_timer(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out,
+ (unsigned long) card);
+
+ return 0;
+}
+
+/*
+ * Tear down the per-card creg state: cancel every queued command and the
+ * active one with -ECANCELED, stop the timeout timer and wait for the
+ * completion work item to finish.
+ */
+void rsxx_creg_destroy(struct rsxx_cardinfo *card)
+{
+ struct creg_cmd *cmd;
+ struct creg_cmd *tmp;
+ int cnt = 0;
+
+ /*
+ * Cancel outstanding commands. The timeout timer takes this lock
+ * from softirq context, so bottom halves are disabled while it is
+ * held to keep the timer from spinning on it on this CPU.
+ */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) {
+ list_del(&cmd->list);
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ kmem_cache_free(creg_cmd_pool, cmd);
+ cnt++;
+ }
+
+ if (cnt)
+ dev_info(CARD_TO_DEV(card),
+ "Canceled %d queue creg commands\n", cnt);
+
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ if (cmd) {
+ /*
+ * NOTE(review): del_timer_sync() waits for a running timer
+ * handler, and that handler takes creg_ctrl.lock, which is
+ * held here - confirm this cannot deadlock across CPUs.
+ */
+ if (timer_pending(&card->creg_ctrl.cmd_timer))
+ del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ dev_info(CARD_TO_DEV(card),
+ "Canceled active creg command\n");
+ kmem_cache_free(creg_cmd_pool, cmd);
+ }
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ cancel_work_sync(&card->creg_ctrl.done_work);
+}
+
+
+/*
+ * Module-wide creg setup: creates the slab cache for struct creg_cmd.
+ * Returns 0 on success or -ENOMEM.
+ */
+int rsxx_creg_init(void)
+{
+ creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN);
+ if (!creg_cmd_pool)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Module-wide creg teardown: destroys the struct creg_cmd slab cache. */
+void rsxx_creg_cleanup(void)
+{
+ kmem_cache_destroy(creg_cmd_pool);
+}
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
new file mode 100644
index 000000000000..4346d17d2949
--- /dev/null
+++ b/drivers/block/rsxx/dev.c
@@ -0,0 +1,367 @@
+/*
+* Filename: dev.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+
+#include <linux/fs.h>
+
+#include "rsxx_priv.h"
+
+/* Number of minors passed to alloc_disk() for each card; read-only (0444). */
+static unsigned int blkdev_minors = 64;
+module_param(blkdev_minors, uint, 0444);
+MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)");
+
+/*
+ * For now I'm making this tweakable in case any applications hit this limit.
+ * If you see a "bio too big" error in the log you will need to raise this
+ * value.
+ */
+static unsigned int blkdev_max_hw_sectors = 1024;
+module_param(blkdev_max_hw_sectors, uint, 0444);
+MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO");
+
+/* When 0, no block device is set up or attached for any card. */
+static unsigned int enable_blkdev = 1;
+module_param(enable_blkdev , uint, 0444);
+MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces");
+
+
+/* Per-bio bookkeeping that lives from submission until the last DMA ends. */
+struct rsxx_bio_meta {
+ struct bio *bio; /* The bio being serviced */
+ atomic_t pending_dmas; /* DMAs still outstanding for this bio */
+ atomic_t error; /* Set to 1 once any DMA reports an error */
+ unsigned long start_time; /* jiffies at submission, for I/O stats */
+};
+
+/* Slab cache that all struct rsxx_bio_meta objects are allocated from. */
+static struct kmem_cache *bio_meta_pool;
+
+/*----------------- Block Device Operations -----------------*/
+/*
+ * Block device ioctl handler. RSXX_GETREG and RSXX_SETREG are forwarded to
+ * rsxx_reg_access() as a read or a write respectively; every other command
+ * is rejected with -ENOTTY.
+ */
+static int rsxx_blkdev_ioctl(struct block_device *bdev,
+ fmode_t mode,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ struct rsxx_cardinfo *card = bdev->bd_disk->private_data;
+ void __user *uarg = (void __user *)arg;
+
+ if (cmd == RSXX_GETREG)
+ return rsxx_reg_access(card, uarg, 1);
+
+ if (cmd == RSXX_SETREG)
+ return rsxx_reg_access(card, uarg, 0);
+
+ return -ENOTTY;
+}
+
+/*
+ * Report a made-up disk geometry: 64 heads and 16 sectors per track, with
+ * the cylinder count derived from the card size. A card of size 0 reports
+ * an all-zero geometry. Always returns 0.
+ */
+static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct rsxx_cardinfo *card = bdev->bd_disk->private_data;
+ u64 blocks = card->size8 >> 9;
+
+ /*
+ * get geometry: Fake it. I haven't found any drivers that set
+ * geo->start, so we won't either.
+ */
+ if (card->size8) {
+ geo->heads = 64;
+ geo->sectors = 16;
+ /* do_div() divides 'blocks' in place. */
+ do_div(blocks, (geo->heads * geo->sectors));
+ geo->cylinders = blocks;
+ } else {
+ geo->heads = 0;
+ geo->sectors = 0;
+ geo->cylinders = 0;
+ }
+ return 0;
+}
+
+/* Block device operations installed on each card's gendisk. */
+static const struct block_device_operations rsxx_fops = {
+ .owner = THIS_MODULE,
+ .getgeo = rsxx_getgeo,
+ .ioctl = rsxx_blkdev_ioctl,
+};
+
+/*
+ * Account the start of a bio on the whole-disk partition (part0): rounds
+ * the statistics and bumps the in-flight count for the bio's direction.
+ */
+static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
+{
+ struct hd_struct *part0 = &card->gendisk->part0;
+ int rw = bio_data_dir(bio);
+ int cpu;
+
+ cpu = part_stat_lock();
+
+ part_round_stats(cpu, part0);
+ part_inc_in_flight(part0, rw);
+
+ part_stat_unlock();
+}
+
+/*
+ * Account the completion of a bio on part0: adds its sectors, one I/O and
+ * the elapsed jiffies since start_time, then drops the in-flight count that
+ * disk_stats_start() took.
+ */
+static void disk_stats_complete(struct rsxx_cardinfo *card,
+ struct bio *bio,
+ unsigned long start_time)
+{
+ struct hd_struct *part0 = &card->gendisk->part0;
+ unsigned long duration = jiffies - start_time;
+ int rw = bio_data_dir(bio);
+ int cpu;
+
+ cpu = part_stat_lock();
+
+ part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio));
+ part_stat_inc(cpu, part0, ios[rw]);
+ part_stat_add(cpu, part0, ticks[rw], duration);
+
+ part_round_stats(cpu, part0);
+ part_dec_in_flight(part0, rw);
+
+ part_stat_unlock();
+}
+
+/*
+ * Per-DMA completion callback for a bio. Records any error and, when the
+ * last outstanding DMA finishes, updates the disk statistics, ends the bio
+ * (with -EIO if any DMA failed) and frees the bookkeeping structure.
+ */
+static void bio_dma_done_cb(struct rsxx_cardinfo *card,
+ void *cb_data,
+ unsigned int error)
+{
+ struct rsxx_bio_meta *meta = cb_data;
+
+ if (error)
+ atomic_set(&meta->error, 1);
+
+ /* Only the completion that drops the count to zero ends the bio. */
+ if (atomic_dec_and_test(&meta->pending_dmas)) {
+ disk_stats_complete(card, meta->bio, meta->start_time);
+
+ bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
+ kmem_cache_free(bio_meta_pool, meta);
+ }
+}
+
+/*
+ * make_request entry point of the block queue. Allocates the per-bio
+ * bookkeeping, starts the disk statistics and hands the bio to the DMA
+ * layer; bio_dma_done_cb() ends the bio once all of its DMAs complete.
+ * On any failure the bio is ended right here with the error: -EFAULT when
+ * the card is halted or has a DMA fault, -EINVAL for an empty bio, -ENOMEM
+ * when the bookkeeping cannot be allocated, or the DMA layer's error.
+ */
+static void rsxx_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct rsxx_cardinfo *card = q->queuedata;
+ struct rsxx_bio_meta *bio_meta;
+ int st = -EINVAL;
+
+ /* The GFP_KERNEL allocation below may sleep. */
+ might_sleep();
+
+ if (unlikely(card->halt)) {
+ st = -EFAULT;
+ goto req_err;
+ }
+
+ if (unlikely(card->dma_fault)) {
+ st = (-EFAULT);
+ goto req_err;
+ }
+
+ /* st is still -EINVAL here. */
+ if (bio->bi_size == 0) {
+ dev_err(CARD_TO_DEV(card), "size zero BIO!\n");
+ goto req_err;
+ }
+
+ bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL);
+ if (!bio_meta) {
+ st = -ENOMEM;
+ goto req_err;
+ }
+
+ bio_meta->bio = bio;
+ atomic_set(&bio_meta->error, 0);
+ atomic_set(&bio_meta->pending_dmas, 0);
+ bio_meta->start_time = jiffies;
+
+ disk_stats_start(card, bio);
+
+ dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
+ bio_data_dir(bio) ? 'W' : 'R', bio_meta,
+ (u64)bio->bi_sector << 9, bio->bi_size);
+
+ st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas,
+ bio_dma_done_cb, bio_meta);
+ if (st)
+ goto queue_err;
+
+ return;
+
+queue_err:
+ /*
+ * NOTE(review): disk_stats_start() has already raised the in-flight
+ * count at this point and nothing on this path lowers it again -
+ * confirm whether that is intended.
+ */
+ kmem_cache_free(bio_meta_pool, bio_meta);
+req_err:
+ bio_endio(bio, st);
+}
+
+/*----------------- Device Setup -------------------*/
+/*
+ * Returns true when the card's PCI revision ID is at least
+ * RSXX_DISCARD_SUPPORT, i.e. the hardware revision handles discards.
+ */
+static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
+{
+ unsigned char pci_rev;
+
+ pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev);
+
+ return (pci_rev >= RSXX_DISCARD_SUPPORT);
+}
+
+/*
+ * Pick the logical block size for the queue: 512 bytes when the card
+ * advertises CARD_CAP_SUBPAGE_WRITES, RSXX_HW_BLK_SIZE otherwise. A failed
+ * capabilities read is only warned about and treated as "no capabilities".
+ */
+static unsigned short rsxx_get_logical_block_size(
+ struct rsxx_cardinfo *card)
+{
+ u32 caps = 0;
+
+ if (rsxx_get_card_capabilities(card, &caps))
+ dev_warn(CARD_TO_DEV(card),
+ "Failed reading card capabilities register\n");
+
+ /* Earlier firmware did not have support for 512 byte accesses */
+ return (caps & CARD_CAP_SUBPAGE_WRITES) ? 512 : RSXX_HW_BLK_SIZE;
+}
+
+/*
+ * Publish the card's gendisk, unless block devices are disabled. The disk
+ * capacity is the card size in 512-byte sectors, or 0 when the card has no
+ * valid configuration. Always returns 0.
+ */
+int rsxx_attach_dev(struct rsxx_cardinfo *card)
+{
+ mutex_lock(&card->dev_lock);
+
+ if (!enable_blkdev)
+ goto out_unlock;
+
+ /* The block device requires the stripe size from the config. */
+ if (!card->config_valid)
+ set_capacity(card->gendisk, 0);
+ else
+ set_capacity(card->gendisk, card->size8 >> 9);
+
+ add_disk(card->gendisk);
+ card->bdev_attached = 1;
+
+out_unlock:
+ mutex_unlock(&card->dev_lock);
+
+ return 0;
+}
+
+/*
+ * Remove the card's gendisk if it is currently attached; does nothing
+ * otherwise, so it is safe to call more than once.
+ */
+void rsxx_detach_dev(struct rsxx_cardinfo *card)
+{
+ mutex_lock(&card->dev_lock);
+
+ if (card->bdev_attached) {
+ del_gendisk(card->gendisk);
+ card->bdev_attached = 0;
+ }
+
+ mutex_unlock(&card->dev_lock);
+}
+
+/*
+ * Allocate and configure the block-layer objects of a card: a dynamically
+ * assigned major number, the request queue and the gendisk. The disk is not
+ * published here; rsxx_attach_dev() does that. When block devices are
+ * disabled only the device mutex is initialized.
+ *
+ * Returns 0 on success, the error from register_blkdev() if no major number
+ * could be obtained, or -ENOMEM if the queue or the disk cannot be allocated.
+ */
+int rsxx_setup_dev(struct rsxx_cardinfo *card)
+{
+ unsigned short blk_size;
+
+ mutex_init(&card->dev_lock);
+
+ if (!enable_blkdev)
+ return 0;
+
+ card->major = register_blkdev(0, DRIVER_NAME);
+ if (card->major < 0) {
+ dev_err(CARD_TO_DEV(card), "Failed to get major number\n");
+ /* Hand the real errno from register_blkdev() to the caller. */
+ return card->major;
+ }
+
+ card->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!card->queue) {
+ dev_err(CARD_TO_DEV(card), "Failed queue alloc\n");
+ unregister_blkdev(card->major, DRIVER_NAME);
+ return -ENOMEM;
+ }
+
+ card->gendisk = alloc_disk(blkdev_minors);
+ if (!card->gendisk) {
+ dev_err(CARD_TO_DEV(card), "Failed disk alloc\n");
+ blk_cleanup_queue(card->queue);
+ unregister_blkdev(card->major, DRIVER_NAME);
+ return -ENOMEM;
+ }
+
+ blk_size = rsxx_get_logical_block_size(card);
+
+ blk_queue_make_request(card->queue, rsxx_make_request);
+ blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
+ blk_queue_dma_alignment(card->queue, blk_size - 1);
+ blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
+ blk_queue_logical_block_size(card->queue, blk_size);
+ blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
+
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue);
+ /* Discards are limited to one hardware block per request. */
+ if (rsxx_discard_supported(card)) {
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue);
+ blk_queue_max_discard_sectors(card->queue,
+ RSXX_HW_BLK_SIZE >> 9);
+ card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
+ card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE;
+ card->queue->limits.discard_zeroes_data = 1;
+ }
+
+ card->queue->queuedata = card;
+
+ snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
+ "rsxx%d", card->disk_id);
+ card->gendisk->driverfs_dev = &card->dev->dev;
+ card->gendisk->major = card->major;
+ card->gendisk->first_minor = 0;
+ card->gendisk->fops = &rsxx_fops;
+ card->gendisk->private_data = card;
+ card->gendisk->queue = card->queue;
+
+ return 0;
+}
+
+/*
+ * Release what rsxx_setup_dev() allocated: the gendisk reference, the
+ * request queue and the major number. Does nothing when block devices are
+ * disabled.
+ */
+void rsxx_destroy_dev(struct rsxx_cardinfo *card)
+{
+ if (!enable_blkdev)
+ return;
+
+ put_disk(card->gendisk);
+ card->gendisk = NULL;
+
+ blk_cleanup_queue(card->queue);
+ unregister_blkdev(card->major, DRIVER_NAME);
+}
+
+/*
+ * Module-wide block layer setup: creates the slab cache for
+ * struct rsxx_bio_meta. Returns 0 on success or -ENOMEM.
+ */
+int rsxx_dev_init(void)
+{
+ bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN);
+ if (!bio_meta_pool)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Module-wide block layer teardown: destroys the rsxx_bio_meta slab cache. */
+void rsxx_dev_cleanup(void)
+{
+ kmem_cache_destroy(bio_meta_pool);
+}
+
+
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
new file mode 100644
index 000000000000..63176e67662f
--- /dev/null
+++ b/drivers/block/rsxx/dma.c
@@ -0,0 +1,998 @@
+/*
+* Filename: dma.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/slab.h>
+#include "rsxx_priv.h"
+
+struct rsxx_dma { /* One DMA request, allocated from rsxx_dma_pool */
+ struct list_head list; /* Link in a staging list or in ctrl->queue */
+ u8 cmd; /* enum rsxx_hw_cmd value written to the HW command */
+ unsigned int laddr; /* Logical address on the ramsan */
+ struct {
+ u32 off; /* Offset inside the HW block, in 512-byte units */
+ u32 cnt; /* Transfer length, in 512-byte units */
+ } sub_page;
+ dma_addr_t dma_addr; /* From pci_map_page(); set to 0 for discards */
+ struct page *page;
+ unsigned int pg_off; /* Page Offset */
+ rsxx_dma_cb cb; /* Completion callback; skipped when NULL */
+ void *cb_data; /* Opaque argument handed back to cb */
+};
+
+/* This timeout is used to detect a stalled DMA channel */
+#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000)
+
+struct hw_status { /* One status-buffer entry, consumed by rsxx_dma_done() */
+ u8 status; /* 0 means success; otherwise enum rsxx_hw_status bits */
+ u8 tag; /* Tracker tag of the command this entry completes */
+ __le16 count; /* Entry is new when this equals ctrl->e_cnt */
+ __le32 _rsvd2;
+ __le64 _rsvd3;
+} __packed;
+
+enum rsxx_dma_status { /* Bit flags passed as 'status' to rsxx_complete_dma() */
+ DMA_SW_ERR = 0x1, /* Set when the DMA carried an unknown command */
+ DMA_HW_FAULT = 0x2, /* Set when the HW reported an unrecovered error */
+ DMA_CANCELLED = 0x4, /* Set when a DMA is dropped because of dma_fault */
+};
+
+struct hw_cmd { /* One command-buffer entry, filled in by rsxx_issue_dmas() */
+ u8 command; /* enum rsxx_hw_cmd value */
+ u8 tag; /* Tracker tag; echoed back in struct hw_status */
+ u8 _rsvd; /* Written as 0 */
+ u8 sub_page; /* Bit[0:2]: 512byte offset */
+ /* Bit[4:6]: 512byte count */
+ __le32 device_addr; /* Logical address (rsxx_dma.laddr) */
+ __le64 host_addr; /* Bus address of the host page (rsxx_dma.dma_addr) */
+} __packed;
+
+enum rsxx_hw_cmd { /* Command codes stored in rsxx_dma.cmd / hw_cmd.command */
+ HW_CMD_BLK_DISCARD = 0x70,
+ HW_CMD_BLK_WRITE = 0x80,
+ HW_CMD_BLK_READ = 0xC0,
+ HW_CMD_BLK_RECON_READ = 0xE0, /* Retry used after a failed read */
+};
+
+enum rsxx_hw_status { /* Bits tested in hw_status.status */
+ HW_STATUS_CRC = 0x01, /* Counted in stats.crc_errors */
+ HW_STATUS_HARD_ERR = 0x02, /* Counted in stats.hard_errors */
+ HW_STATUS_SOFT_ERR = 0x04, /* Counted in stats.soft_errors */
+ HW_STATUS_FAULT = 0x08, /* Fails a plain read */
+};
+
+#define STATUS_BUFFER_SIZE8 4096 /* Bytes allocated per status buffer */
+#define COMMAND_BUFFER_SIZE8 4096 /* Bytes allocated per command buffer */
+
+static struct kmem_cache *rsxx_dma_pool; /* Slab cache for struct rsxx_dma */
+
+struct dma_tracker { /* One tag slot: free-list link plus the DMA using it */
+ int next_tag; /* Next free tag, or -1 (end of list / slot in use) */
+ struct rsxx_dma *dma; /* DMA issued under this tag, NULL when free */
+};
+
+#define DMA_TRACKER_LIST_SIZE8 (sizeof(struct dma_tracker_list) + \
+ (sizeof(struct dma_tracker) * RSXX_MAX_OUTSTANDING_CMDS)) /* bytes */
+
+struct dma_tracker_list { /* Free list of tags, indexed by tag number */
+ spinlock_t lock; /* Taken by pop_tracker() and push_tracker() */
+ int head; /* First free tag, or -1 when none is free */
+ struct dma_tracker list[0]; /* RSXX_MAX_OUTSTANDING_CMDS slots follow */
+};
+
+
+/*----------------- Misc Utility Functions -------------------*/
+/*
+ * Convert the byte address @addr8 into a logical block address.  The bits
+ * selected by _stripe.upper_mask (after shifting right by
+ * _stripe.upper_shift) are merged with the bits selected by
+ * _stripe.lower_mask, and the result is divided by RSXX_HW_BLK_SIZE.
+ */
+static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card)
+{
+	u64 upper = (addr8 >> card->_stripe.upper_shift) &
+		    card->_stripe.upper_mask;
+	u64 lower = addr8 & card->_stripe.lower_mask;
+	unsigned long long blk_addr = upper | lower;
+
+	do_div(blk_addr, RSXX_HW_BLK_SIZE);
+
+	return blk_addr;
+}
+
+/* Return the index of the DMA target that the byte address @addr8 maps to. */
+static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8)
+{
+	return (addr8 >> card->_stripe.target_shift) &
+	       card->_stripe.target_mask;
+}
+
+/* Write DMA_QUEUE_RESET to the RESET register of @card. */
+static void rsxx_dma_queue_reset(struct rsxx_cardinfo *card)
+{
+	/* Resets every DMA command/status queue. */
+	iowrite32(DMA_QUEUE_RESET, card->regmap + RESET);
+}
+
+/*
+ * Return the transfer size of @dma in bytes: sub_page.cnt 512-byte units,
+ * or a whole hardware block when sub_page.cnt is 0.
+ */
+static unsigned int get_dma_size(struct rsxx_dma *dma)
+{
+	return dma->sub_page.cnt ? dma->sub_page.cnt << 9 : RSXX_HW_BLK_SIZE;
+}
+
+
+/*----------------- DMA Tracker -------------------*/
+/* Record @dma as the request that is using tracker slot @tag. */
+static void set_tracker_dma(struct dma_tracker_list *trackers,
+			    int tag,
+			    struct rsxx_dma *dma)
+{
+	struct dma_tracker *slot = &trackers->list[tag];
+
+	slot->dma = dma;
+}
+
+/* Return the request stored in tracker slot @tag (NULL if none). */
+static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers,
+					int tag)
+{
+	struct dma_tracker *slot = &trackers->list[tag];
+
+	return slot->dma;
+}
+
+/*
+ * Take the first free tag off the tracker free list.
+ * Returns the tag, or -1 when the free list is empty.
+ */
+static int pop_tracker(struct dma_tracker_list *trackers)
+{
+	struct dma_tracker *slot;
+	int tag;
+
+	spin_lock(&trackers->lock);
+
+	tag = trackers->head;
+	if (tag != -1) {
+		slot = &trackers->list[tag];
+		trackers->head = slot->next_tag;
+		slot->next_tag = -1;
+	}
+
+	spin_unlock(&trackers->lock);
+
+	return tag;
+}
+
+/* Put @tag back at the head of the free list and clear its DMA pointer. */
+static void push_tracker(struct dma_tracker_list *trackers, int tag)
+{
+	struct dma_tracker *slot = &trackers->list[tag];
+
+	spin_lock(&trackers->lock);
+	slot->next_tag = trackers->head;
+	trackers->head = tag;
+	slot->dma = NULL;
+	spin_unlock(&trackers->lock);
+}
+
+
+/*----------------- Interrupt Coalescing -------------*/
+/*
+ * Interrupt Coalescing Register Format:
+ * Interrupt Timer (64ns units) [15:0]
+ * Interrupt Count [24:16]
+ * Reserved [31:25]
+*/
+#define INTR_COAL_LATENCY_MASK (0x0000ffff)
+
+#define INTR_COAL_COUNT_SHIFT 16
+#define INTR_COAL_COUNT_BITS 9
+#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \
+ INTR_COAL_COUNT_SHIFT)
+#define INTR_COAL_LATENCY_UNITS_NS 64
+
+
+/*
+ * Build the value for the interrupt coalescing register.
+ *
+ * @mode:    one of the RSXX_INTR_COAL_* values
+ * @count:   interrupt count, placed in the count field
+ * @latency: timer value in ns, converted to 64ns units
+ *
+ * Returns 0 when @mode is RSXX_INTR_COAL_DISABLED.
+ */
+static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency)
+{
+	u32 count_bits;
+	u32 latency_bits;
+
+	if (mode == RSXX_INTR_COAL_DISABLED)
+		return 0;
+
+	count_bits = (count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK;
+	latency_bits = (latency / INTR_COAL_LATENCY_UNITS_NS) &
+		       INTR_COAL_LATENCY_MASK;
+
+	return count_bits | latency_bits;
+}
+
+/*
+ * In auto-tune mode, reprogram the coalescing count to half of the summed
+ * hardware queue depth of all targets.  Other modes are left untouched.
+ */
+static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
+{
+	u32 total_depth = 0;
+	u32 reg_val;
+	int tgt;
+
+	if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE)
+		return;
+
+	for (tgt = 0; tgt < card->n_targets; tgt++)
+		total_depth += atomic_read(&card->ctrl[tgt].stats.hw_q_depth);
+
+	reg_val = dma_intr_coal_val(card->config.data.intr_coal.mode,
+				    total_depth / 2,
+				    card->config.data.intr_coal.latency);
+	iowrite32(reg_val, card->regmap + INTR_COAL);
+}
+
+/*----------------- RSXX DMA Handling -------------------*/
+/*
+ * Finish @dma: log any error bits in @status, unmap its page if one was
+ * mapped, invoke the completion callback (with 1 on error, 0 otherwise)
+ * and return the descriptor to rsxx_dma_pool.
+ */
+static void rsxx_complete_dma(struct rsxx_cardinfo *card,
+			      struct rsxx_dma *dma,
+			      unsigned int status)
+{
+	int dir;
+
+	if (status & DMA_SW_ERR)
+		printk_ratelimited(KERN_ERR
+				   "SW Error in DMA(cmd x%02x, laddr x%08x)\n",
+				   dma->cmd, dma->laddr);
+	if (status & DMA_HW_FAULT)
+		printk_ratelimited(KERN_ERR
+				   "HW Fault in DMA(cmd x%02x, laddr x%08x)\n",
+				   dma->cmd, dma->laddr);
+	if (status & DMA_CANCELLED)
+		printk_ratelimited(KERN_ERR
+				   "DMA Cancelled(cmd x%02x, laddr x%08x)\n",
+				   dma->cmd, dma->laddr);
+
+	/* Discards carry no host page, so dma_addr is 0 for them. */
+	if (dma->dma_addr) {
+		if (dma->cmd == HW_CMD_BLK_WRITE)
+			dir = PCI_DMA_TODEVICE;
+		else
+			dir = PCI_DMA_FROMDEVICE;
+
+		pci_unmap_page(card->dev, dma->dma_addr, get_dma_size(dma),
+			       dir);
+	}
+
+	if (dma->cb)
+		dma->cb(card, dma->cb_data, status ? 1 : 0);
+
+	kmem_cache_free(rsxx_dma_pool, dma);
+}
+
+/*
+ * Put @dma back on @ctrl's software queue.  It is inserted at the head so
+ * that it is the next request to be issued.
+ */
+static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
+			     struct rsxx_dma *dma)
+{
+	struct list_head *queue_head = &ctrl->queue;
+
+	spin_lock(&ctrl->queue_lock);
+	list_add(&dma->list, queue_head);
+	spin_unlock(&ctrl->queue_lock);
+}
+
+/*
+ * Handle a DMA that completed with a non-zero hardware status @hw_st.
+ *
+ * Updates the per-channel error counters, then either requeues the DMA
+ * (a failed read is retried once as HW_CMD_BLK_RECON_READ when
+ * card->scrub_hard is set) or completes it with the resulting status.
+ */
+static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
+				  struct rsxx_dma *dma,
+				  u8 hw_st)
+{
+	const u8 data_err = HW_STATUS_CRC | HW_STATUS_HARD_ERR;
+	unsigned int result = 0;
+	int retry = 0;
+
+	dev_dbg(CARD_TO_DEV(ctrl->card),
+		"Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n",
+		dma->cmd, dma->laddr, hw_st);
+
+	if (hw_st & HW_STATUS_CRC)
+		ctrl->stats.crc_errors++;
+	if (hw_st & HW_STATUS_HARD_ERR)
+		ctrl->stats.hard_errors++;
+	if (hw_st & HW_STATUS_SOFT_ERR)
+		ctrl->stats.soft_errors++;
+
+	switch (dma->cmd) {
+	case HW_CMD_BLK_READ:
+		if ((hw_st & data_err) && ctrl->card->scrub_hard) {
+			/* Ask the hardware for a reconstructing read. */
+			dma->cmd = HW_CMD_BLK_RECON_READ;
+			retry = 1;
+			ctrl->stats.reads_retried++;
+		} else if (hw_st & (data_err | HW_STATUS_FAULT)) {
+			result |= DMA_HW_FAULT;
+			ctrl->stats.reads_failed++;
+		}
+		break;
+
+	case HW_CMD_BLK_RECON_READ:
+		if (hw_st & data_err) {
+			/* Data could not be reconstructed. */
+			result |= DMA_HW_FAULT;
+			ctrl->stats.reads_failed++;
+		}
+		break;
+
+	case HW_CMD_BLK_WRITE:
+		result |= DMA_HW_FAULT;
+		ctrl->stats.writes_failed++;
+		break;
+
+	case HW_CMD_BLK_DISCARD:
+		result |= DMA_HW_FAULT;
+		ctrl->stats.discards_failed++;
+		break;
+
+	default:
+		dev_err(CARD_TO_DEV(ctrl->card),
+			"Unknown command in DMA!(cmd: x%02x "
+			"laddr x%08x st: x%02x\n",
+			dma->cmd, dma->laddr, hw_st);
+		result |= DMA_SW_ERR;
+		break;
+	}
+
+	if (retry) {
+		rsxx_requeue_dma(ctrl, dma);
+		return;
+	}
+
+	rsxx_complete_dma(ctrl->card, dma, result);
+}
+
+/*
+ * Activity-timer callback; @data is the struct rsxx_dma_ctrl being watched.
+ *
+ * With commands still outstanding, either re-issue a SW_CMD_IDX write that
+ * the hardware did not latch and re-arm the timer, or, if the register
+ * already matches, flag the card with dma_fault.
+ */
+static void dma_engine_stalled(unsigned long data)
+{
+	struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+
+	if (atomic_read(&ctrl->stats.hw_q_depth) == 0)
+		return;
+
+	if (ctrl->cmd.idx == ioread32(ctrl->regmap + SW_CMD_IDX)) {
+		dev_warn(CARD_TO_DEV(ctrl->card),
+			 "DMA channel %d has stalled, faulting interface.\n",
+			 ctrl->id);
+		ctrl->card->dma_fault = 1;
+		return;
+	}
+
+	/*
+	 * The dma engine was stalled because the SW_CMD_IDX write
+	 * was lost. Issue it again to recover.
+	 */
+	dev_warn(CARD_TO_DEV(ctrl->card),
+		 "SW_CMD_IDX write was lost, re-writing...\n");
+	iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+	mod_timer(&ctrl->activity_timer, jiffies + DMA_ACTIVITY_TIMEOUT);
+}
+
+/*
+ * Work handler: move DMAs from the channel's software queue into the
+ * hardware command buffer.
+ *
+ * Runs until the queue is empty or no tracker tag is free, then notifies
+ * the hardware once by writing the new command index to SW_CMD_IDX.
+ */
+static void rsxx_issue_dmas(struct work_struct *work)
+{
+	struct rsxx_dma_ctrl *ctrl =
+		container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
+	struct hw_cmd *cmd_ring = ctrl->cmd.buf;
+	struct hw_cmd *slot;
+	struct rsxx_dma *dma;
+	int n_issued = 0;
+	int queue_empty;
+	int tag;
+
+	if (unlikely(ctrl->card->halt))
+		return;
+
+	for (;;) {
+		spin_lock(&ctrl->queue_lock);
+		queue_empty = list_empty(&ctrl->queue);
+		spin_unlock(&ctrl->queue_lock);
+		if (queue_empty)
+			break;
+
+		tag = pop_tracker(ctrl->trackers);
+		if (tag == -1)
+			break;
+
+		spin_lock(&ctrl->queue_lock);
+		dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
+		list_del(&dma->list);
+		ctrl->stats.sw_q_depth--;
+		spin_unlock(&ctrl->queue_lock);
+
+		/*
+		 * A DMA can slip onto the queue right before a fault but
+		 * after the other DMAs were cancelled; cancel it here.
+		 */
+		if (unlikely(ctrl->card->dma_fault)) {
+			push_tracker(ctrl->trackers, tag);
+			rsxx_complete_dma(ctrl->card, dma, DMA_CANCELLED);
+			continue;
+		}
+
+		set_tracker_dma(ctrl->trackers, tag, dma);
+
+		slot = &cmd_ring[ctrl->cmd.idx];
+		slot->command = dma->cmd;
+		slot->tag = tag;
+		slot->_rsvd = 0;
+		slot->sub_page = ((dma->sub_page.cnt & 0x7) << 4) |
+				 (dma->sub_page.off & 0x7);
+		slot->device_addr = cpu_to_le32(dma->laddr);
+		slot->host_addr = cpu_to_le64(dma->dma_addr);
+
+		dev_dbg(CARD_TO_DEV(ctrl->card),
+			"Issue DMA%d(laddr %d tag %d) to idx %d\n",
+			ctrl->id, dma->laddr, tag, ctrl->cmd.idx);
+
+		ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK;
+		n_issued++;
+
+		switch (dma->cmd) {
+		case HW_CMD_BLK_WRITE:
+			ctrl->stats.writes_issued++;
+			break;
+		case HW_CMD_BLK_DISCARD:
+			ctrl->stats.discards_issued++;
+			break;
+		default:
+			ctrl->stats.reads_issued++;
+			break;
+		}
+	}
+
+	/* Nothing was placed in the command buffer: no doorbell needed. */
+	if (!n_issued)
+		return;
+
+	/*
+	 * The CPU writes to 'ctrl->cmd.buf' (PCI-consistent system memory)
+	 * made in the loop above must reach the coherency domain before the
+	 * PIO "trigger" below updates the command index.  A WMB is
+	 * sufficient; no explicit cache flush is needed because the mapping
+	 * is PCI-consistent (ie; coherent).
+	 */
+	wmb();
+
+	atomic_add(n_issued, &ctrl->stats.hw_q_depth);
+	mod_timer(&ctrl->activity_timer, jiffies + DMA_ACTIVITY_TIMEOUT);
+	iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+}
+
+/*
+ * Work handler: consume completed entries from the channel's status
+ * buffer.
+ *
+ * An entry is new while its count field equals ctrl->e_cnt.  Each one is
+ * matched to its DMA through the tracker tag and completed (or routed to
+ * the error handler), and the tag is released.  Afterwards the coalescing
+ * setting is re-tuned, the channel interrupt is re-enabled and the issue
+ * worker is kicked if DMAs are still queued.
+ */
+static void rsxx_dma_done(struct work_struct *work)
+{
+	struct rsxx_dma_ctrl *ctrl =
+		container_of(work, struct rsxx_dma_ctrl, dma_done_work);
+	struct hw_status *st_ring = ctrl->status.buf;
+	struct hw_status *entry;
+	struct rsxx_dma *dma;
+	unsigned long flags;
+	u16 count;
+	u8 status;
+	u8 tag;
+
+	if (unlikely(ctrl->card->halt) || unlikely(ctrl->card->dma_fault))
+		return;
+
+	entry = &st_ring[ctrl->status.idx];
+	count = le16_to_cpu(entry->count);
+
+	while (count == ctrl->e_cnt) {
+		/*
+		 * The read memory-barrier keeps aggressive
+		 * processors/optimizers (such as the PPC Apple G5) from
+		 * reordering the status-buffer tag & status reads below
+		 * *before* the count read on subsequent iterations of the
+		 * loop!
+		 */
+		rmb();
+
+		status = entry->status;
+		tag = entry->tag;
+
+		dma = get_tracker_dma(ctrl->trackers, tag);
+		if (dma == NULL) {
+			spin_lock_irqsave(&ctrl->card->irq_lock, flags);
+			rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL);
+			spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
+
+			dev_err(CARD_TO_DEV(ctrl->card),
+				"No tracker for tag %d "
+				"(idx %d id %d)\n",
+				tag, ctrl->status.idx, ctrl->id);
+			return;
+		}
+
+		dev_dbg(CARD_TO_DEV(ctrl->card),
+			"Completing DMA%d"
+			"(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n",
+			ctrl->id, dma->laddr, tag, status, count,
+			ctrl->status.idx);
+
+		atomic_dec(&ctrl->stats.hw_q_depth);
+
+		mod_timer(&ctrl->activity_timer,
+			  jiffies + DMA_ACTIVITY_TIMEOUT);
+
+		if (status)
+			rsxx_handle_dma_error(ctrl, dma, status);
+		else
+			rsxx_complete_dma(ctrl->card, dma, 0);
+
+		push_tracker(ctrl->trackers, tag);
+
+		/* Advance to the next status entry. */
+		ctrl->status.idx = (ctrl->status.idx + 1) & RSXX_CS_IDX_MASK;
+		ctrl->e_cnt++;
+
+		entry = &st_ring[ctrl->status.idx];
+		count = le16_to_cpu(entry->count);
+	}
+
+	dma_intr_coal_auto_tune(ctrl->card);
+
+	if (atomic_read(&ctrl->stats.hw_q_depth) == 0)
+		del_timer_sync(&ctrl->activity_timer);
+
+	spin_lock_irqsave(&ctrl->card->irq_lock, flags);
+	rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
+	spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
+
+	spin_lock(&ctrl->queue_lock);
+	if (ctrl->stats.sw_q_depth)
+		queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
+	spin_unlock(&ctrl->queue_lock);
+}
+
+/*
+ * Free every DMA on list @q, unmapping its page first if one was mapped.
+ * Completion callbacks are not invoked.
+ *
+ * Returns the number of DMAs that were freed.
+ */
+static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card,
+				  struct list_head *q)
+{
+	struct rsxx_dma *dma;
+	int n_freed = 0;
+	int dir;
+
+	while (!list_empty(q)) {
+		dma = list_entry(q->next, struct rsxx_dma, list);
+		list_del(&dma->list);
+
+		if (dma->dma_addr) {
+			if (dma->cmd == HW_CMD_BLK_WRITE)
+				dir = PCI_DMA_TODEVICE;
+			else
+				dir = PCI_DMA_FROMDEVICE;
+
+			pci_unmap_page(card->dev, dma->dma_addr,
+				       get_dma_size(dma), dir);
+		}
+
+		kmem_cache_free(rsxx_dma_pool, dma);
+		n_freed++;
+	}
+
+	return n_freed;
+}
+
+/*
+ * Allocate a discard DMA for logical address @laddr and append it to @q.
+ * A discard has no host page, so dma_addr, page and the sub-page fields
+ * are all cleared.
+ *
+ * Returns 0 on success or -ENOMEM if the descriptor cannot be allocated.
+ */
+static int rsxx_queue_discard(struct rsxx_cardinfo *card,
+			      struct list_head *q,
+			      unsigned int laddr,
+			      rsxx_dma_cb cb,
+			      void *cb_data)
+{
+	struct rsxx_dma *dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
+
+	if (!dma)
+		return -ENOMEM;
+
+	dma->cmd = HW_CMD_BLK_DISCARD;
+	dma->laddr = laddr;
+
+	/* No data buffer is attached to a discard. */
+	dma->page = NULL;
+	dma->pg_off = 0;
+	dma->dma_addr = 0;
+	dma->sub_page.off = 0;
+	dma->sub_page.cnt = 0;
+
+	dma->cb = cb;
+	dma->cb_data = cb_data;
+
+	dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr);
+
+	list_add_tail(&dma->list, q);
+
+	return 0;
+}
+
+/*
+ * Allocate a read or write DMA, map its page for the device and append it
+ * to @q.
+ *
+ * @dir:     non-zero for a write (host to device), zero for a read
+ * @dma_off: byte offset of the transfer inside the hardware block
+ * @dma_len: transfer length in bytes
+ * @laddr:   logical address on the card
+ * @page:    host page holding the data
+ * @pg_off:  byte offset of the data inside @page
+ * @cb:      completion callback, called with @cb_data
+ *
+ * Returns 0 on success or -ENOMEM if the descriptor cannot be allocated or
+ * the page cannot be mapped.
+ */
+static int rsxx_queue_dma(struct rsxx_cardinfo *card,
+			  struct list_head *q,
+			  int dir,
+			  unsigned int dma_off,
+			  unsigned int dma_len,
+			  unsigned int laddr,
+			  struct page *page,
+			  unsigned int pg_off,
+			  rsxx_dma_cb cb,
+			  void *cb_data)
+{
+	struct rsxx_dma *dma;
+
+	dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
+	if (!dma)
+		return -ENOMEM;
+
+	dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len,
+				     dir ? PCI_DMA_TODEVICE :
+				     PCI_DMA_FROMDEVICE);
+	/*
+	 * A failed mapping must be detected with the mapping-error helper:
+	 * the error cookie is not guaranteed to be 0, and 0 may be a valid
+	 * bus address.
+	 */
+	if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
+		kmem_cache_free(rsxx_dma_pool, dma);
+		return -ENOMEM;
+	}
+
+	dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ;
+	dma->laddr = laddr;
+	/* The hardware addresses sub-page transfers in 512-byte units. */
+	dma->sub_page.off = (dma_off >> 9);
+	dma->sub_page.cnt = (dma_len >> 9);
+	dma->page = page;
+	dma->pg_off = pg_off;
+	dma->cb = cb;
+	dma->cb_data = cb_data;
+
+	dev_dbg(CARD_TO_DEV(card),
+		"Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n",
+		dir ? 'W' : 'R', dma->laddr, dma->sub_page.off,
+		dma->sub_page.cnt, dma->page, dma->pg_off);
+
+	/* Queue the DMA */
+	list_add_tail(&dma->list, q);
+
+	return 0;
+}
+
+/*
+ * Split @bio into per-hardware-block DMAs and hand them to the DMA
+ * channels.
+ *
+ * The DMAs are first collected on per-target staging lists; only when the
+ * whole bio has been split are they spliced onto the channel queues and
+ * the issue workers scheduled.  @n_dmas is reset and then incremented
+ * once per DMA created.  @cb/@cb_data are attached to every DMA.
+ *
+ * Returns 0 on success.  On failure the error from the queuing helper is
+ * returned and every DMA staged so far is freed without its callback
+ * being invoked.
+ */
+int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+			   struct bio *bio,
+			   atomic_t *n_dmas,
+			   rsxx_dma_cb cb,
+			   void *cb_data)
+{
+	struct list_head dma_list[RSXX_MAX_TARGETS];
+	struct bio_vec *bvec;
+	unsigned long long addr8;
+	unsigned int laddr;
+	unsigned int bv_len;
+	unsigned int bv_off;
+	unsigned int dma_off;
+	unsigned int dma_len;
+	int dma_cnt[RSXX_MAX_TARGETS];
+	int tgt;
+	int st;
+	int i;
+
+	/*
+	 * Sectors are 512 bytes.  Widen before shifting: the shift is
+	 * otherwise done in the width of bi_sector, which truncates the
+	 * byte address when sector_t is only 32 bits wide.
+	 */
+	addr8 = (unsigned long long)bio->bi_sector << 9;
+	atomic_set(n_dmas, 0);
+
+	for (i = 0; i < card->n_targets; i++) {
+		INIT_LIST_HEAD(&dma_list[i]);
+		dma_cnt[i] = 0;
+	}
+
+	if (bio->bi_rw & REQ_DISCARD) {
+		bv_len = bio->bi_size;
+
+		/*
+		 * NOTE(review): this loop assumes bi_size is a multiple of
+		 * RSXX_HW_BLK_SIZE; otherwise the unsigned bv_len wraps.
+		 * Confirm the block layer guarantees that for this queue.
+		 */
+		while (bv_len > 0) {
+			tgt = rsxx_get_dma_tgt(card, addr8);
+			laddr = rsxx_addr8_to_laddr(addr8, card);
+
+			st = rsxx_queue_discard(card, &dma_list[tgt], laddr,
+						cb, cb_data);
+			if (st)
+				goto bvec_err;
+
+			dma_cnt[tgt]++;
+			atomic_inc(n_dmas);
+			addr8 += RSXX_HW_BLK_SIZE;
+			bv_len -= RSXX_HW_BLK_SIZE;
+		}
+	} else {
+		bio_for_each_segment(bvec, bio, i) {
+			bv_len = bvec->bv_len;
+			bv_off = bvec->bv_offset;
+
+			while (bv_len > 0) {
+				tgt = rsxx_get_dma_tgt(card, addr8);
+				laddr = rsxx_addr8_to_laddr(addr8, card);
+				/* Never cross a hardware block boundary. */
+				dma_off = addr8 & RSXX_HW_BLK_MASK;
+				dma_len = min(bv_len,
+					      RSXX_HW_BLK_SIZE - dma_off);
+
+				st = rsxx_queue_dma(card, &dma_list[tgt],
+						    bio_data_dir(bio),
+						    dma_off, dma_len,
+						    laddr, bvec->bv_page,
+						    bv_off, cb, cb_data);
+				if (st)
+					goto bvec_err;
+
+				dma_cnt[tgt]++;
+				atomic_inc(n_dmas);
+				addr8 += dma_len;
+				bv_off += dma_len;
+				bv_len -= dma_len;
+			}
+		}
+	}
+
+	/* Everything was staged; publish it to the channels. */
+	for (i = 0; i < card->n_targets; i++) {
+		if (!list_empty(&dma_list[i])) {
+			spin_lock(&card->ctrl[i].queue_lock);
+			card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
+			list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
+			spin_unlock(&card->ctrl[i].queue_lock);
+
+			queue_work(card->ctrl[i].issue_wq,
+				   &card->ctrl[i].issue_dma_work);
+		}
+	}
+
+	return 0;
+
+bvec_err:
+	for (i = 0; i < card->n_targets; i++)
+		rsxx_cleanup_dma_queue(card, &dma_list[i]);
+
+	return st;
+}
+
+
+/*----------------- DMA Engine Initialization & Setup -------------------*/
+/*
+ * Initialise one DMA channel.
+ *
+ * Allocates the status and command buffers and the tracker list, creates
+ * the issue/done workqueues, programs the buffer addresses into the
+ * channel registers and synchronises the software indexes with the values
+ * read back from the hardware.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or -EINVAL when the
+ * hardware reports an out-of-range index.  Nothing is released here on
+ * failure; rsxx_dma_setup() frees whatever was allocated in its error
+ * path.
+ */
+static int rsxx_dma_ctrl_init(struct pci_dev *dev,
+				  struct rsxx_dma_ctrl *ctrl)
+{
+	int i;
+
+	memset(&ctrl->stats, 0, sizeof(ctrl->stats));
+
+	ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8,
+						&ctrl->status.dma_addr);
+	ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8,
+					     &ctrl->cmd.dma_addr);
+	if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL)
+		return -ENOMEM;
+
+	ctrl->trackers = vmalloc(DMA_TRACKER_LIST_SIZE8);
+	if (!ctrl->trackers)
+		return -ENOMEM;
+
+	/* Chain every tracker slot into the free list: 0 -> 1 -> ... -> -1 */
+	ctrl->trackers->head = 0;
+	for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
+		ctrl->trackers->list[i].next_tag = i + 1;
+		ctrl->trackers->list[i].dma = NULL;
+	}
+	ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1;
+	spin_lock_init(&ctrl->trackers->lock);
+
+	spin_lock_init(&ctrl->queue_lock);
+	INIT_LIST_HEAD(&ctrl->queue);
+
+	setup_timer(&ctrl->activity_timer, dma_engine_stalled,
+		    (unsigned long)ctrl);
+
+	ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0);
+	if (!ctrl->issue_wq)
+		return -ENOMEM;
+
+	ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0);
+	if (!ctrl->done_wq)
+		return -ENOMEM;
+
+	INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas);
+	INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done);
+
+	memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8);
+	iowrite32(lower_32_bits(ctrl->status.dma_addr),
+		  ctrl->regmap + SB_ADD_LO);
+	iowrite32(upper_32_bits(ctrl->status.dma_addr),
+		  ctrl->regmap + SB_ADD_HI);
+
+	memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8);
+	iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO);
+	iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI);
+
+	ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT);
+	if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) {
+		dev_crit(&dev->dev, "Failed reading status cnt x%x\n",
+			 ctrl->status.idx);
+		return -EINVAL;
+	}
+	iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT);
+	iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT);
+
+	ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX);
+	if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) {
+		/* Report the command index that failed the check. */
+		dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n",
+			 ctrl->cmd.idx);
+		return -EINVAL;
+	}
+	iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX);
+	iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+
+	wmb();
+
+	return 0;
+}
+
+/*
+ * Derive the masks and shifts in card->_stripe from the stripe size (in
+ * bytes) and the number of DMA targets.
+ *
+ * Returns 0 on success or -EINVAL when @stripe_size8 is not a power of 2.
+ */
+static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card,
+				 unsigned int stripe_size8)
+{
+	if (!is_power_of_2(stripe_size8)) {
+		dev_err(CARD_TO_DEV(card),
+			"stripe_size is NOT a power of 2!\n");
+		return -EINVAL;
+	}
+
+	/* Target selection: the bits directly above the stripe offset. */
+	card->_stripe.target_shift = ffs(stripe_size8) - 1;
+	card->_stripe.target_mask = card->n_targets - 1;
+
+	/* Offset inside a stripe, and the bits above the target bits. */
+	card->_stripe.lower_mask = stripe_size8 - 1;
+	card->_stripe.upper_mask = ~(card->_stripe.lower_mask);
+	card->_stripe.upper_shift = ffs(card->n_targets) - 1;
+
+	dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n",
+		card->_stripe.lower_mask);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n",
+		card->_stripe.upper_shift);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n",
+		card->_stripe.upper_mask);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n",
+		card->_stripe.target_mask);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n",
+		card->_stripe.target_shift);
+
+	return 0;
+}
+
+/*
+ * Apply the card configuration to the DMA engine: program the interrupt
+ * coalescing register, then set up striping.
+ *
+ * Returns the result of rsxx_dma_stripe_setup().
+ */
+static int rsxx_dma_configure(struct rsxx_cardinfo *card)
+{
+	u32 coal_reg = dma_intr_coal_val(card->config.data.intr_coal.mode,
+					 card->config.data.intr_coal.count,
+					 card->config.data.intr_coal.latency);
+
+	iowrite32(coal_reg, card->regmap + INTR_COAL);
+
+	return rsxx_dma_stripe_setup(card, card->config.data.stripe_size);
+}
+
+/*
+ * Bring up all DMA channels of @card.
+ *
+ * Assigns each channel its 4K register window, resets the hardware
+ * queues, initialises every channel, applies the card configuration when
+ * it is valid and finally enables the per-channel interrupts.
+ *
+ * Returns 0 on success.  On failure the error from rsxx_dma_ctrl_init()
+ * is returned and the resources of every channel are released.
+ */
+int rsxx_dma_setup(struct rsxx_cardinfo *card)
+{
+	unsigned long flags;
+	int st;
+	int i;
+
+	dev_info(CARD_TO_DEV(card),
+		"Initializing %d DMA targets\n",
+		card->n_targets);
+
+	/* Regmap is divided up into 4K chunks. One for each DMA channel */
+	for (i = 0; i < card->n_targets; i++)
+		card->ctrl[i].regmap = card->regmap + (i * 4096);
+
+	card->dma_fault = 0;
+
+	/* Reset the DMA queues */
+	rsxx_dma_queue_reset(card);
+
+	/************* Setup DMA Control *************/
+	for (i = 0; i < card->n_targets; i++) {
+		st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]);
+		if (st)
+			goto failed_dma_setup;
+
+		card->ctrl[i].card = card;
+		card->ctrl[i].id = i;
+	}
+
+	card->scrub_hard = 1;
+
+	if (card->config_valid)
+		rsxx_dma_configure(card);
+
+	/* Enable the interrupts after all setup has completed. */
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_irqsave(&card->irq_lock, flags);
+		rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i));
+		spin_unlock_irqrestore(&card->irq_lock, flags);
+	}
+
+	return 0;
+
+failed_dma_setup:
+	/*
+	 * Release every channel.  Each pointer is cleared after it is freed
+	 * so that no stale pointer is left behind in the control structure.
+	 */
+	for (i = 0; i < card->n_targets; i++) {
+		struct rsxx_dma_ctrl *ctrl = &card->ctrl[i];
+
+		if (ctrl->issue_wq) {
+			destroy_workqueue(ctrl->issue_wq);
+			ctrl->issue_wq = NULL;
+		}
+
+		if (ctrl->done_wq) {
+			destroy_workqueue(ctrl->done_wq);
+			ctrl->done_wq = NULL;
+		}
+
+		if (ctrl->trackers) {
+			vfree(ctrl->trackers);
+			ctrl->trackers = NULL;
+		}
+
+		if (ctrl->status.buf) {
+			pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+					    ctrl->status.buf,
+					    ctrl->status.dma_addr);
+			ctrl->status.buf = NULL;
+		}
+
+		if (ctrl->cmd.buf) {
+			pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+					    ctrl->cmd.buf, ctrl->cmd.dma_addr);
+			ctrl->cmd.buf = NULL;
+		}
+	}
+
+	return st;
+}
+
+
+/*
+ * Tear down all DMA channels of @card.
+ *
+ * For each channel: destroy the workqueues, stop the activity timer, free
+ * the DMAs still waiting on the software queue, free the DMAs that were
+ * issued but never completed, then release the tracker list and the
+ * status/command buffers.  Completion callbacks are not invoked for the
+ * DMAs freed here.
+ */
+void rsxx_dma_destroy(struct rsxx_cardinfo *card)
+{
+	struct rsxx_dma_ctrl *ctrl;
+	struct rsxx_dma *dma;
+	int i, j;
+	int cnt = 0;
+
+	for (i = 0; i < card->n_targets; i++) {
+		ctrl = &card->ctrl[i];
+
+		if (ctrl->issue_wq) {
+			destroy_workqueue(ctrl->issue_wq);
+			ctrl->issue_wq = NULL;
+		}
+
+		if (ctrl->done_wq) {
+			destroy_workqueue(ctrl->done_wq);
+			ctrl->done_wq = NULL;
+		}
+
+		if (timer_pending(&ctrl->activity_timer))
+			del_timer_sync(&ctrl->activity_timer);
+
+		/* Clean up the DMA queue */
+		spin_lock(&ctrl->queue_lock);
+		cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue);
+		spin_unlock(&ctrl->queue_lock);
+
+		if (cnt)
+			dev_info(CARD_TO_DEV(card),
+				"Freed %d queued DMAs on channel %d\n",
+				cnt, i);
+
+		/*
+		 * Clean up issued DMAs.  Restart the count so that the
+		 * message below reports only the issued DMAs, not the
+		 * queued ones counted above.
+		 */
+		cnt = 0;
+		for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
+			dma = get_tracker_dma(ctrl->trackers, j);
+			if (!dma)
+				continue;
+
+			/* Discards have no mapping (dma_addr is 0). */
+			if (dma->dma_addr)
+				pci_unmap_page(card->dev, dma->dma_addr,
+					       get_dma_size(dma),
+					       (dma->cmd == HW_CMD_BLK_WRITE) ?
+					       PCI_DMA_TODEVICE :
+					       PCI_DMA_FROMDEVICE);
+			kmem_cache_free(rsxx_dma_pool, dma);
+			cnt++;
+		}
+
+		if (cnt)
+			dev_info(CARD_TO_DEV(card),
+				"Freed %d pending DMAs on channel %d\n",
+				cnt, i);
+
+		vfree(ctrl->trackers);
+
+		pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+				    ctrl->status.buf, ctrl->status.dma_addr);
+		pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+				    ctrl->cmd.buf, ctrl->cmd.dma_addr);
+	}
+}
+
+
+/*
+ * Create the slab cache used for struct rsxx_dma descriptors.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int rsxx_dma_init(void)
+{
+	rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN);
+
+	return rsxx_dma_pool ? 0 : -ENOMEM;
+}
+
+
+/* Destroy the slab cache created by rsxx_dma_init(). */
+void rsxx_dma_cleanup(void)
+{
+	kmem_cache_destroy(rsxx_dma_pool);
+}
+
diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h
new file mode 100644
index 000000000000..2e50b65902b7
--- /dev/null
+++ b/drivers/block/rsxx/rsxx.h
@@ -0,0 +1,45 @@
+/*
+* Filename: rsxx.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_H__
+#define __RSXX_H__
+
+/*----------------- IOCTL Definitions -------------------*/
+
+struct rsxx_reg_access { /* Argument of the RSXX_GETREG/RSXX_SETREG ioctls */
+ __u32 addr; /* NOTE(review): presumably the register address - confirm */
+ __u32 cnt; /* NOTE(review): presumably a byte count - confirm in handler */
+ __u32 stat; /* NOTE(review): presumably a status result - confirm */
+ __u32 stream;
+ __u32 data[8]; /* 32 bytes, the same value as RSXX_MAX_REG_CNT */
+};
+
+#define RSXX_MAX_REG_CNT (8 * (sizeof(__u32)))
+
+#define RSXX_IOC_MAGIC 'r'
+
+#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access)
+#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access)
+
+#endif /* __RSXX_H__ */
diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h
new file mode 100644
index 000000000000..c025fe5fdb70
--- /dev/null
+++ b/drivers/block/rsxx/rsxx_cfg.h
@@ -0,0 +1,72 @@
+/*
+* Filename: rsxx_cfg.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_CFG_H__
+#define __RSXX_CFG_H__
+
+/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */
+#include <linux/types.h>
+
+/*
+ * The card config version must match the driver's expected version. If it does
+ * not, the DMA interfaces will not be attached and the user will need to
+ * initialize/upgrade the card configuration using the card config utility.
+ */
+#define RSXX_CFG_VERSION 4
+
+struct card_cfg_hdr { /* Header of the stored card configuration */
+ __u32 version; /* Expected to match RSXX_CFG_VERSION */
+ __u32 crc; /* NOTE(review): presumably a CRC of the config data - confirm */
+};
+
+struct card_cfg_data { /* Payload of the stored card configuration */
+ __u32 block_size;
+ __u32 stripe_size; /* Bytes; rsxx_dma_stripe_setup() needs a power of 2 */
+ __u32 vendor_id; /* NOTE(review): presumably RSXX_VENDOR_ID_* - confirm */
+ __u32 cache_order;
+ struct {
+ __u32 mode; /* Disabled, manual, auto-tune... */
+ __u32 count; /* Number of intr to coalesce */
+ __u32 latency;/* Max wait time (in ns) */
+ } intr_coal;
+};
+
+struct rsxx_card_cfg { /* Complete card configuration: header, then data */
+ struct card_cfg_hdr hdr;
+ struct card_cfg_data data;
+};
+
+/* Vendor ID Values */
+#define RSXX_VENDOR_ID_TMS_IBM 0
+#define RSXX_VENDOR_ID_DSI 1
+#define RSXX_VENDOR_COUNT 2
+
+/* Interrupt Coalescing Values */
+#define RSXX_INTR_COAL_DISABLED 0
+#define RSXX_INTR_COAL_EXPLICIT 1
+#define RSXX_INTR_COAL_AUTO_TUNE 2
+
+
+#endif /* __RSXX_CFG_H__ */
+
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
new file mode 100644
index 000000000000..a1ac907d8f4c
--- /dev/null
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -0,0 +1,399 @@
+/*
+* Filename: rsxx_priv.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_PRIV_H__
+#define __RSXX_PRIV_H__
+
+#include <linux/version.h>
+#include <linux/semaphore.h>
+
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/bio.h>
+#include <linux/vmalloc.h>
+#include <linux/timer.h>
+#include <linux/ioctl.h>
+
+#include "rsxx.h"
+#include "rsxx_cfg.h"
+
+struct proc_cmd;
+
+#define PCI_VENDOR_ID_TMS_IBM 0x15B6
+#define PCI_DEVICE_ID_RS70_FLASH 0x0019
+#define PCI_DEVICE_ID_RS70D_FLASH 0x001A
+#define PCI_DEVICE_ID_RS80_FLASH 0x001C
+#define PCI_DEVICE_ID_RS81_FLASH 0x001E
+
+#define RS70_PCI_REV_SUPPORTED 4
+
+#define DRIVER_NAME "rsxx"
+#define DRIVER_VERSION "3.7"
+
+/* Block size is 4096 */
+#define RSXX_HW_BLK_SHIFT 12
+#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT)
+#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1)
+
+#define MAX_CREG_DATA8 32
+#define LOG_BUF_SIZE8 128
+
+#define RSXX_MAX_OUTSTANDING_CMDS 255
+#define RSXX_CS_IDX_MASK 0xff
+
+#define RSXX_MAX_TARGETS 8
+
+struct dma_tracker_list;
+
+/* DMA Command/Status Buffer structure */
+struct rsxx_cs_buffer {
+ dma_addr_t dma_addr; /* NOTE(review): presumably the bus address of buf - confirm */
+ void *buf;
+ u32 idx; /* NOTE(review): presumably the current slot index - confirm in dma.c */
+};
+
+struct rsxx_dma_stats {
+ u32 crc_errors;
+ u32 hard_errors;
+ u32 soft_errors;
+ u32 writes_issued;
+ u32 writes_failed;
+ u32 reads_issued;
+ u32 reads_failed;
+ u32 reads_retried;
+ u32 discards_issued;
+ u32 discards_failed;
+ u32 done_rescheduled;
+ u32 issue_rescheduled;
+ u32 sw_q_depth; /* Number of DMAs on the SW queue. */
+ atomic_t hw_q_depth; /* Number of DMAs queued to HW. */
+};
+
+struct rsxx_dma_ctrl {
+ struct rsxx_cardinfo *card;
+ int id;
+ void __iomem *regmap;
+ struct rsxx_cs_buffer status;
+ struct rsxx_cs_buffer cmd;
+ u16 e_cnt;
+ spinlock_t queue_lock;
+ struct list_head queue;
+ struct workqueue_struct *issue_wq;
+ struct work_struct issue_dma_work;
+ struct workqueue_struct *done_wq;
+ struct work_struct dma_done_work;
+ struct timer_list activity_timer;
+ struct dma_tracker_list *trackers;
+ struct rsxx_dma_stats stats;
+};
+
+struct rsxx_cardinfo {
+ struct pci_dev *dev;
+ unsigned int halt;
+
+ void __iomem *regmap;
+ spinlock_t irq_lock;
+ unsigned int isr_mask;
+ unsigned int ier_mask;
+
+ struct rsxx_card_cfg config;
+ int config_valid;
+
+ /* Embedded CPU Communication */
+ struct {
+ spinlock_t lock;
+ bool active;
+ struct creg_cmd *active_cmd;
+ struct work_struct done_work;
+ struct list_head queue;
+ unsigned int q_depth;
+ /* Cache the creg status to prevent ioreads */
+ struct {
+ u32 stat;
+ u32 failed_cancel_timer;
+ u32 creg_timeout;
+ } creg_stats;
+ struct timer_list cmd_timer;
+ struct mutex reset_lock;
+ int reset;
+ } creg_ctrl;
+
+ struct {
+ char tmp[MAX_CREG_DATA8];
+ char buf[LOG_BUF_SIZE8]; /* terminated */
+ int buf_len;
+ } log;
+
+ struct work_struct event_work;
+ unsigned int state;
+ u64 size8;
+
+ /* Lock the device attach/detach function */
+ struct mutex dev_lock;
+
+ /* Block Device Variables */
+ bool bdev_attached;
+ int disk_id;
+ int major;
+ struct request_queue *queue;
+ struct gendisk *gendisk;
+ struct {
+ /* Used to convert a byte address to a device address. */
+ u64 lower_mask;
+ u64 upper_shift;
+ u64 upper_mask;
+ u64 target_mask;
+ u64 target_shift;
+ } _stripe;
+ unsigned int dma_fault;
+
+ int scrub_hard;
+
+ int n_targets;
+ struct rsxx_dma_ctrl *ctrl;
+};
+
+enum rsxx_pci_regmap {
+ HWID = 0x00, /* Hardware Identification Register */
+ SCRATCH = 0x04, /* Scratch/Debug Register */
+ RESET = 0x08, /* Reset Register */
+ ISR = 0x10, /* Interrupt Status Register */
+ IER = 0x14, /* Interrupt Enable Register */
+ IPR = 0x18, /* Interrupt Poll Register */
+ CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */
+ CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/
+ HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */
+ SW_CMD_IDX = 0x2C, /* Software Processed Command Index */
+ SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */
+ SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */
+ HW_STATUS_CNT = 0x38, /* Hardware Status Counter */
+ SW_STATUS_CNT = 0x3C, /* Deprecated */
+ CREG_CMD = 0x40, /* CPU Command Register */
+ CREG_ADD = 0x44, /* CPU Address Register */
+ CREG_CNT = 0x48, /* CPU Count Register */
+ CREG_STAT = 0x4C, /* CPU Status Register */
+ CREG_DATA0 = 0x50, /* CPU Data Registers */
+ CREG_DATA1 = 0x54,
+ CREG_DATA2 = 0x58,
+ CREG_DATA3 = 0x5C,
+ CREG_DATA4 = 0x60,
+ CREG_DATA5 = 0x64,
+ CREG_DATA6 = 0x68,
+ CREG_DATA7 = 0x6c,
+ INTR_COAL = 0x70, /* Interrupt Coalescing Register */
+ HW_ERROR = 0x74, /* Card Error Register */
+ PCI_DEBUG0 = 0x78, /* PCI Debug Registers */
+ PCI_DEBUG1 = 0x7C,
+ PCI_DEBUG2 = 0x80,
+ PCI_DEBUG3 = 0x84,
+ PCI_DEBUG4 = 0x88,
+ PCI_DEBUG5 = 0x8C,
+ PCI_DEBUG6 = 0x90,
+ PCI_DEBUG7 = 0x94,
+ PCI_POWER_THROTTLE = 0x98,
+ PERF_CTRL = 0x9c,
+ PERF_TIMER_LO = 0xa0,
+ PERF_TIMER_HI = 0xa4,
+ PERF_RD512_LO = 0xa8,
+ PERF_RD512_HI = 0xac,
+ PERF_WR512_LO = 0xb0,
+ PERF_WR512_HI = 0xb4,
+};
+
+enum rsxx_intr { /* DMA bits are not contiguous: CREG and EVENT use bits 1 and 3 */
+ CR_INTR_DMA0 = 0x00000001,
+ CR_INTR_CREG = 0x00000002,
+ CR_INTR_DMA1 = 0x00000004,
+ CR_INTR_EVENT = 0x00000008,
+ CR_INTR_DMA2 = 0x00000010,
+ CR_INTR_DMA3 = 0x00000020,
+ CR_INTR_DMA4 = 0x00000040,
+ CR_INTR_DMA5 = 0x00000080,
+ CR_INTR_DMA6 = 0x00000100,
+ CR_INTR_DMA7 = 0x00000200,
+ CR_INTR_DMA_ALL = 0x000003f5, /* OR of CR_INTR_DMA0..CR_INTR_DMA7 */
+ CR_INTR_ALL = 0xffffffff,
+};
+
+static inline int CR_INTR_DMA(int N) /* N: DMA channel 0-7, not range-checked */
+{
+ static const unsigned int dma_intr_bit[] = { /* indexed by DMA channel */
+ CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3,
+ CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7
+ };
+ return dma_intr_bit[N];
+}
+enum rsxx_pci_reset {
+ DMA_QUEUE_RESET = 0x00000001,
+};
+
+enum rsxx_pci_revision {
+ RSXX_DISCARD_SUPPORT = 2,
+};
+
+enum rsxx_creg_cmd {
+ CREG_CMD_TAG_MASK = 0x0000FF00,
+ CREG_OP_WRITE = 0x000000C0,
+ CREG_OP_READ = 0x000000E0,
+};
+
+enum rsxx_creg_addr {
+ CREG_ADD_CARD_CMD = 0x80001000,
+ CREG_ADD_CARD_STATE = 0x80001004,
+ CREG_ADD_CARD_SIZE = 0x8000100c,
+ CREG_ADD_CAPABILITIES = 0x80001050,
+ CREG_ADD_LOG = 0x80002000,
+ CREG_ADD_NUM_TARGETS = 0x80003000,
+ CREG_ADD_CONFIG = 0xB0000000,
+};
+
+enum rsxx_creg_card_cmd {
+ CARD_CMD_STARTUP = 1,
+ CARD_CMD_SHUTDOWN = 2,
+ CARD_CMD_LOW_LEVEL_FORMAT = 3,
+ CARD_CMD_FPGA_RECONFIG_BR = 4,
+ CARD_CMD_FPGA_RECONFIG_MAIN = 5,
+ CARD_CMD_BACKUP = 6,
+ CARD_CMD_RESET = 7,
+ CARD_CMD_deprecated = 8,
+ CARD_CMD_UNINITIALIZE = 9,
+ CARD_CMD_DSTROY_EMERGENCY = 10,
+ CARD_CMD_DSTROY_NORMAL = 11,
+ CARD_CMD_DSTROY_EXTENDED = 12,
+ CARD_CMD_DSTROY_ABORT = 13,
+};
+
+enum rsxx_card_state {
+ CARD_STATE_SHUTDOWN = 0x00000001,
+ CARD_STATE_STARTING = 0x00000002,
+ CARD_STATE_FORMATTING = 0x00000004,
+ CARD_STATE_UNINITIALIZED = 0x00000008,
+ CARD_STATE_GOOD = 0x00000010,
+ CARD_STATE_SHUTTING_DOWN = 0x00000020,
+ CARD_STATE_FAULT = 0x00000040,
+ CARD_STATE_RD_ONLY_FAULT = 0x00000080,
+ CARD_STATE_DSTROYING = 0x00000100,
+};
+
+enum rsxx_led {
+ LED_DEFAULT = 0x0,
+ LED_IDENTIFY = 0x1,
+ LED_SOAK = 0x2,
+};
+
+enum rsxx_creg_flash_lock {
+ CREG_FLASH_LOCK = 1,
+ CREG_FLASH_UNLOCK = 2,
+};
+
+enum rsxx_card_capabilities {
+ CARD_CAP_SUBPAGE_WRITES = 0x00000080,
+};
+
+enum rsxx_creg_stat {
+ CREG_STAT_STATUS_MASK = 0x00000003,
+ CREG_STAT_SUCCESS = 0x1, /* value inside CREG_STAT_STATUS_MASK */
+ CREG_STAT_ERROR = 0x2, /* value inside CREG_STAT_STATUS_MASK */
+ CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */
+ CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */
+ CREG_STAT_TAG_MASK = 0x0000ff00, /* same bits as CREG_CMD_TAG_MASK */
+};
+
+static inline unsigned int CREG_DATA(int N) /* offset of CREG_DATA<N>, N = 0-7 */
+{
+ return CREG_DATA0 + (N * 4); /* data registers are spaced 4 apart */
+}
+
+/*----------------- Convenient Log Wrappers -------------------*/
+#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev)
+
+/***** config.c *****/
+int rsxx_load_config(struct rsxx_cardinfo *card);
+
+/***** core.c *****/
+void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr);
+void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr);
+void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card,
+ unsigned int intr);
+void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card,
+ unsigned int intr);
+
+/***** dev.c *****/
+int rsxx_attach_dev(struct rsxx_cardinfo *card);
+void rsxx_detach_dev(struct rsxx_cardinfo *card);
+int rsxx_setup_dev(struct rsxx_cardinfo *card);
+void rsxx_destroy_dev(struct rsxx_cardinfo *card);
+int rsxx_dev_init(void);
+void rsxx_dev_cleanup(void);
+
+/***** dma.c ****/
+typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
+ void *cb_data,
+ unsigned int status);
+int rsxx_dma_setup(struct rsxx_cardinfo *card);
+void rsxx_dma_destroy(struct rsxx_cardinfo *card);
+int rsxx_dma_init(void);
+void rsxx_dma_cleanup(void);
+int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+ struct bio *bio,
+ atomic_t *n_dmas,
+ rsxx_dma_cb cb,
+ void *cb_data);
+
+/***** cregs.c *****/
+int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream);
+int rsxx_creg_read(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream);
+int rsxx_read_hw_log(struct rsxx_cardinfo *card);
+int rsxx_get_card_state(struct rsxx_cardinfo *card,
+ unsigned int *state);
+int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8);
+int rsxx_get_num_targets(struct rsxx_cardinfo *card,
+ unsigned int *n_targets);
+int rsxx_get_card_capabilities(struct rsxx_cardinfo *card,
+ u32 *capabilities);
+int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd);
+int rsxx_creg_setup(struct rsxx_cardinfo *card);
+void rsxx_creg_destroy(struct rsxx_cardinfo *card);
+int rsxx_creg_init(void);
+void rsxx_creg_cleanup(void);
+
+int rsxx_reg_access(struct rsxx_cardinfo *card,
+ struct rsxx_reg_access __user *ucmd,
+ int read);
+
+
+
+#endif /* __RSXX_PRIV_H__ */
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 57763c54363a..758f2ac878cf 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1090,10 +1090,13 @@ static const struct block_device_operations floppy_fops = {
static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
{
struct floppy_state *fs = macio_get_drvdata(mdev);
- struct swim3 __iomem *sw = fs->swim3;
+ struct swim3 __iomem *sw;
if (!fs)
return;
+
+ sw = fs->swim3;
+
if (mb_state != MB_FD)
return;
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
deleted file mode 100644
index ff540520bada..000000000000
--- a/drivers/block/xd.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/*
- * This file contains the driver for an XT hard disk controller
- * (at least the DTC 5150X) for Linux.
- *
- * Author: Pat Mackinlay, pat@it.com.au
- * Date: 29/09/92
- *
- * Revised: 01/01/93, ...
- *
- * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler,
- * kevinf@agora.rain.com)
- * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and
- * Wim Van Dorst.
- *
- * Revised: 04/04/94 by Risto Kankkunen
- * Moved the detection code from xd_init() to xd_geninit() as it needed
- * interrupts enabled and Linus didn't want to enable them in that first
- * phase. xd_geninit() is the place to do these kinds of things anyway,
- * he says.
- *
- * Modularized: 04/10/96 by Todd Fries, tfries@umr.edu
- *
- * Revised: 13/12/97 by Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl
- * Fixed some problems with disk initialization and module initiation.
- * Added support for manual geometry setting (except Seagate controllers)
- * in form:
- * xd_geo=<cyl_xda>,<head_xda>,<sec_xda>[,<cyl_xdb>,<head_xdb>,<sec_xdb>]
- * Recovered DMA access. Abridged messages. Added support for DTC5051CX,
- * WD1002-27X & XEBEC controllers. Driver uses now some jumper settings.
- * Extended ioctl() support.
- *
- * Bugfix: 15/02/01, Paul G. - inform queue layer of tiny xd_maxsect.
- *
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-#include <linux/mutex.h>
-#include <linux/blkpg.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <linux/gfp.h>
-
-#include <asm/uaccess.h>
-#include <asm/dma.h>
-
-#include "xd.h"
-
-static DEFINE_MUTEX(xd_mutex);
-static void __init do_xd_setup (int *integers);
-#ifdef MODULE
-static int xd[5] = { -1,-1,-1,-1, };
-#endif
-
-#define XD_DONT_USE_DMA 0 /* Initial value. may be overriden using
- "nodma" module option */
-#define XD_INIT_DISK_DELAY (30) /* 30 ms delay during disk initialization */
-
-/* Above may need to be increased if a problem with the 2nd drive detection
- (ST11M controller) or resetting a controller (WD) appears */
-
-static XD_INFO xd_info[XD_MAXDRIVES];
-
-/* If you try this driver and find that your card is not detected by the driver at bootup, you need to add your BIOS
- signature and details to the following list of signatures. A BIOS signature is a string embedded into the first
- few bytes of your controller's on-board ROM BIOS. To find out what yours is, use something like MS-DOS's DEBUG
- command. Run DEBUG, and then you can examine your BIOS signature with:
-
- d xxxx:0000
-
- where xxxx is the segment of your controller (like C800 or D000 or something). On the ASCII dump at the right, you should
- be able to see a string mentioning the manufacturer's copyright etc. Add this string into the table below. The parameters
- in the table are, in order:
-
- offset ; this is the offset (in bytes) from the start of your ROM where the signature starts
- signature ; this is the actual text of the signature
- xd_?_init_controller ; this is the controller init routine used by your controller
- xd_?_init_drive ; this is the drive init routine used by your controller
-
- The controllers directly supported at the moment are: DTC 5150x, WD 1004A27X, ST11M/R and override. If your controller is
- made by the same manufacturer as one of these, try using the same init routines as they do. If that doesn't work, your
- best bet is to use the "override" routines. These routines use a "portable" method of getting the disk's geometry, and
- may work with your card. If none of these seem to work, try sending me some email and I'll see what I can do <grin>.
-
- NOTE: You can now specify your XT controller's parameters from the command line in the form xd=TYPE,IRQ,IO,DMA. The driver
- should be able to detect your drive's geometry from this info. (eg: xd=0,5,0x320,3 is the "standard"). */
-
-#include <asm/page.h>
-#define xd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size))
-#define xd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
-static char *xd_dma_buffer;
-
-static XD_SIGNATURE xd_sigs[] __initdata = {
- { 0x0000,"Override geometry handler",NULL,xd_override_init_drive,"n unknown" }, /* Pat Mackinlay, pat@it.com.au */
- { 0x0008,"[BXD06 (C) DTC 17-MAY-1985]",xd_dtc_init_controller,xd_dtc5150cx_init_drive," DTC 5150CX" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */
- { 0x000B,"CRD18A Not an IBM rom. (C) Copyright Data Technology Corp. 05/31/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Todd Fries, tfries@umr.edu */
- { 0x000B,"CXD23A Not an IBM ROM (C)Copyright Data Technology Corp 12/03/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Pat Mackinlay, pat@it.com.au */
- { 0x0008,"07/15/86(C) Copyright 1986 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. 1002-27X" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */
- { 0x0008,"06/24/88(C) Copyright 1988 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. WDXT-GEN2" }, /* Dan Newcombe, newcombe@aa.csc.peachnet.edu */
- { 0x0015,"SEAGATE ST11 BIOS REVISION",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Salvador Abreu, spa@fct.unl.pt */
- { 0x0010,"ST11R BIOS",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Risto Kankkunen, risto.kankkunen@cs.helsinki.fi */
- { 0x0010,"ST11 BIOS v1.7",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11R" }, /* Alan Hourihane, alanh@fairlite.demon.co.uk */
- { 0x1000,"(c)Copyright 1987 SMS",xd_omti_init_controller,xd_omti_init_drive,"n OMTI 5520" }, /* Dirk Melchers, dirk@merlin.nbg.sub.org */
- { 0x0006,"COPYRIGHT XEBEC (C) 1984",xd_xebec_init_controller,xd_xebec_init_drive," XEBEC" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */
- { 0x0008,"(C) Copyright 1984 Western Digital Corp", xd_wd_init_controller, xd_wd_init_drive," Western Dig. 1002s-wx2" },
- { 0x0008,"(C) Copyright 1986 Western Digital Corporation", xd_wd_init_controller, xd_wd_init_drive," 1986 Western Digital" }, /* jfree@sovereign.org */
-};
-
-static unsigned int xd_bases[] __initdata =
-{
- 0xC8000, 0xCA000, 0xCC000,
- 0xCE000, 0xD0000, 0xD2000,
- 0xD4000, 0xD6000, 0xD8000,
- 0xDA000, 0xDC000, 0xDE000,
- 0xE0000
-};
-
-static DEFINE_SPINLOCK(xd_lock);
-
-static struct gendisk *xd_gendisk[2];
-
-static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
-
-static const struct block_device_operations xd_fops = {
- .owner = THIS_MODULE,
- .ioctl = xd_ioctl,
- .getgeo = xd_getgeo,
-};
-static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int);
-static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors;
-static u_char xd_override __initdata = 0, xd_type __initdata = 0;
-static u_short xd_iobase = 0x320;
-static int xd_geo[XD_MAXDRIVES*3] __initdata = { 0, };
-
-static volatile int xdc_busy;
-static struct timer_list xd_watchdog_int;
-
-static volatile u_char xd_error;
-static bool nodma = XD_DONT_USE_DMA;
-
-static struct request_queue *xd_queue;
-
-/* xd_init: register the block device number and set up pointer tables */
-static int __init xd_init(void)
-{
- u_char i,controller;
- unsigned int address;
- int err;
-
-#ifdef MODULE
- {
- u_char count = 0;
- for (i = 4; i > 0; i--)
- if (((xd[i] = xd[i-1]) >= 0) && !count)
- count = i;
- if ((xd[0] = count))
- do_xd_setup(xd);
- }
-#endif
-
- init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog;
-
- err = -EBUSY;
- if (register_blkdev(XT_DISK_MAJOR, "xd"))
- goto out1;
-
- err = -ENOMEM;
- xd_queue = blk_init_queue(do_xd_request, &xd_lock);
- if (!xd_queue)
- goto out1a;
-
- if (xd_detect(&controller,&address)) {
-
- printk("Detected a%s controller (type %d) at address %06x\n",
- xd_sigs[controller].name,controller,address);
- if (!request_region(xd_iobase,4,"xd")) {
- printk("xd: Ports at 0x%x are not available\n",
- xd_iobase);
- goto out2;
- }
- if (controller)
- xd_sigs[controller].init_controller(address);
- xd_drives = xd_initdrives(xd_sigs[controller].init_drive);
-
- printk("Detected %d hard drive%s (using IRQ%d & DMA%d)\n",
- xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma);
- }
-
- /*
- * With the drive detected, xd_maxsectors should now be known.
- * If xd_maxsectors is 0, nothing was detected and we fall through
- * to return -ENODEV
- */
- if (!xd_dma_buffer && xd_maxsectors) {
- xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
- if (!xd_dma_buffer) {
- printk(KERN_ERR "xd: Out of memory.\n");
- goto out3;
- }
- }
-
- err = -ENODEV;
- if (!xd_drives)
- goto out3;
-
- for (i = 0; i < xd_drives; i++) {
- XD_INFO *p = &xd_info[i];
- struct gendisk *disk = alloc_disk(64);
- if (!disk)
- goto Enomem;
- p->unit = i;
- disk->major = XT_DISK_MAJOR;
- disk->first_minor = i<<6;
- sprintf(disk->disk_name, "xd%c", i+'a');
- disk->fops = &xd_fops;
- disk->private_data = p;
- disk->queue = xd_queue;
- set_capacity(disk, p->heads * p->cylinders * p->sectors);
- printk(" %s: CHS=%d/%d/%d\n", disk->disk_name,
- p->cylinders, p->heads, p->sectors);
- xd_gendisk[i] = disk;
- }
-
- err = -EBUSY;
- if (request_irq(xd_irq,xd_interrupt_handler, 0, "XT hard disk", NULL)) {
- printk("xd: unable to get IRQ%d\n",xd_irq);
- goto out4;
- }
-
- if (request_dma(xd_dma,"xd")) {
- printk("xd: unable to get DMA%d\n",xd_dma);
- goto out5;
- }
-
- /* xd_maxsectors depends on controller - so set after detection */
- blk_queue_max_hw_sectors(xd_queue, xd_maxsectors);
-
- for (i = 0; i < xd_drives; i++)
- add_disk(xd_gendisk[i]);
-
- return 0;
-
-out5:
- free_irq(xd_irq, NULL);
-out4:
- for (i = 0; i < xd_drives; i++)
- put_disk(xd_gendisk[i]);
-out3:
- if (xd_maxsectors)
- release_region(xd_iobase,4);
-
- if (xd_dma_buffer)
- xd_dma_mem_free((unsigned long)xd_dma_buffer,
- xd_maxsectors * 0x200);
-out2:
- blk_cleanup_queue(xd_queue);
-out1a:
- unregister_blkdev(XT_DISK_MAJOR, "xd");
-out1:
- return err;
-Enomem:
- err = -ENOMEM;
- while (i--)
- put_disk(xd_gendisk[i]);
- goto out3;
-}
-
-/* xd_detect: scan the possible BIOS ROM locations for the signature strings */
-static u_char __init xd_detect (u_char *controller, unsigned int *address)
-{
- int i, j;
-
- if (xd_override)
- {
- *controller = xd_type;
- *address = 0;
- return(1);
- }
-
- for (i = 0; i < ARRAY_SIZE(xd_bases); i++) {
- void __iomem *p = ioremap(xd_bases[i], 0x2000);
- if (!p)
- continue;
- for (j = 1; j < ARRAY_SIZE(xd_sigs); j++) {
- const char *s = xd_sigs[j].string;
- if (check_signature(p + xd_sigs[j].offset, s, strlen(s))) {
- *controller = j;
- xd_type = j;
- *address = xd_bases[i];
- iounmap(p);
- return 1;
- }
- }
- iounmap(p);
- }
- return 0;
-}
-
-/* do_xd_request: handle an incoming request */
-static void do_xd_request (struct request_queue * q)
-{
- struct request *req;
-
- if (xdc_busy)
- return;
-
- req = blk_fetch_request(q);
- while (req) {
- unsigned block = blk_rq_pos(req);
- unsigned count = blk_rq_cur_sectors(req);
- XD_INFO *disk = req->rq_disk->private_data;
- int res = -EIO;
- int retry;
-
- if (req->cmd_type != REQ_TYPE_FS)
- goto done;
- if (block + count > get_capacity(req->rq_disk))
- goto done;
- for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
- res = xd_readwrite(rq_data_dir(req), disk, req->buffer,
- block, count);
- done:
- /* wrap up, 0 = success, -errno = fail */
- if (!__blk_end_request_cur(req, res))
- req = blk_fetch_request(q);
- }
-}
-
-static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- XD_INFO *p = bdev->bd_disk->private_data;
-
- geo->heads = p->heads;
- geo->sectors = p->sectors;
- geo->cylinders = p->cylinders;
- return 0;
-}
-
-/* xd_ioctl: handle device ioctl's */
-static int xd_locked_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long arg)
-{
- switch (cmd) {
- case HDIO_SET_DMA:
- if (!capable(CAP_SYS_ADMIN)) return -EACCES;
- if (xdc_busy) return -EBUSY;
- nodma = !arg;
- if (nodma && xd_dma_buffer) {
- xd_dma_mem_free((unsigned long)xd_dma_buffer,
- xd_maxsectors * 0x200);
- xd_dma_buffer = NULL;
- } else if (!nodma && !xd_dma_buffer) {
- xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
- if (!xd_dma_buffer) {
- nodma = XD_DONT_USE_DMA;
- return -ENOMEM;
- }
- }
- return 0;
- case HDIO_GET_DMA:
- return put_user(!nodma, (long __user *) arg);
- case HDIO_GET_MULTCOUNT:
- return put_user(xd_maxsectors, (long __user *) arg);
- default:
- return -EINVAL;
- }
-}
-
-static int xd_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long param)
-{
- int ret;
-
- mutex_lock(&xd_mutex);
- ret = xd_locked_ioctl(bdev, mode, cmd, param);
- mutex_unlock(&xd_mutex);
-
- return ret;
-}
-
-/* xd_readwrite: handle a read/write request */
-static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_int count)
-{
- int drive = p->unit;
- u_char cmdblk[6],sense[4];
- u_short track,cylinder;
- u_char head,sector,control,mode = PIO_MODE,temp;
- char **real_buffer;
- register int i;
-
-#ifdef DEBUG_READWRITE
- printk("xd_readwrite: operation = %s, drive = %d, buffer = 0x%X, block = %d, count = %d\n",operation == READ ? "read" : "write",drive,buffer,block,count);
-#endif /* DEBUG_READWRITE */
-
- spin_unlock_irq(&xd_lock);
-
- control = p->control;
- if (!xd_dma_buffer)
- xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
- while (count) {
- temp = count < xd_maxsectors ? count : xd_maxsectors;
-
- track = block / p->sectors;
- head = track % p->heads;
- cylinder = track / p->heads;
- sector = block % p->sectors;
-
-#ifdef DEBUG_READWRITE
- printk("xd_readwrite: drive = %d, head = %d, cylinder = %d, sector = %d, count = %d\n",drive,head,cylinder,sector,temp);
-#endif /* DEBUG_READWRITE */
-
- if (xd_dma_buffer) {
- mode = xd_setup_dma(operation == READ ? DMA_MODE_READ : DMA_MODE_WRITE,(u_char *)(xd_dma_buffer),temp * 0x200);
- real_buffer = &xd_dma_buffer;
- for (i=0; i < (temp * 0x200); i++)
- xd_dma_buffer[i] = buffer[i];
- }
- else
- real_buffer = &buffer;
-
- xd_build(cmdblk,operation == READ ? CMD_READ : CMD_WRITE,drive,head,cylinder,sector,temp & 0xFF,control);
-
- switch (xd_command(cmdblk,mode,(u_char *)(*real_buffer),(u_char *)(*real_buffer),sense,XD_TIMEOUT)) {
- case 1:
- printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write"));
- xd_recalibrate(drive);
- spin_lock_irq(&xd_lock);
- return -EIO;
- case 2:
- if (sense[0] & 0x30) {
- printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing"));
- switch ((sense[0] & 0x30) >> 4) {
- case 0: printk("drive error, code = 0x%X",sense[0] & 0x0F);
- break;
- case 1: printk("controller error, code = 0x%X",sense[0] & 0x0F);
- break;
- case 2: printk("command error, code = 0x%X",sense[0] & 0x0F);
- break;
- case 3: printk("miscellaneous error, code = 0x%X",sense[0] & 0x0F);
- break;
- }
- }
- if (sense[0] & 0x80)
- printk(" - CHS = %d/%d/%d\n",((sense[2] & 0xC0) << 2) | sense[3],sense[1] & 0x1F,sense[2] & 0x3F);
- /* reported drive number = (sense[1] & 0xE0) >> 5 */
- else
- printk(" - no valid disk address\n");
- spin_lock_irq(&xd_lock);
- return -EIO;
- }
- if (xd_dma_buffer)
- for (i=0; i < (temp * 0x200); i++)
- buffer[i] = xd_dma_buffer[i];
-
- count -= temp, buffer += temp * 0x200, block += temp;
- }
- spin_lock_irq(&xd_lock);
- return 0;
-}
-
-/* xd_recalibrate: recalibrate a given drive and reset controller if necessary */
-static void xd_recalibrate (u_char drive)
-{
- u_char cmdblk[6];
-
- xd_build(cmdblk,CMD_RECALIBRATE,drive,0,0,0,0,0);
- if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 8))
- printk("xd%c: warning! error recalibrating, controller may be unstable\n", 'a'+drive);
-}
-
-/* xd_interrupt_handler: interrupt service routine */
-static irqreturn_t xd_interrupt_handler(int irq, void *dev_id)
-{
- if (inb(XD_STATUS) & STAT_INTERRUPT) { /* check if it was our device */
-#ifdef DEBUG_OTHER
- printk("xd_interrupt_handler: interrupt detected\n");
-#endif /* DEBUG_OTHER */
- outb(0,XD_CONTROL); /* acknowledge interrupt */
- wake_up(&xd_wait_int); /* and wake up sleeping processes */
- return IRQ_HANDLED;
- }
- else
- printk("xd: unexpected interrupt\n");
- return IRQ_NONE;
-}
-
-/* xd_setup_dma: set up the DMA controller for a data transfer */
-static u_char xd_setup_dma (u_char mode,u_char *buffer,u_int count)
-{
- unsigned long f;
-
- if (nodma)
- return (PIO_MODE);
- if (((unsigned long) buffer & 0xFFFF0000) != (((unsigned long) buffer + count) & 0xFFFF0000)) {
-#ifdef DEBUG_OTHER
- printk("xd_setup_dma: using PIO, transfer overlaps 64k boundary\n");
-#endif /* DEBUG_OTHER */
- return (PIO_MODE);
- }
-
- f=claim_dma_lock();
- disable_dma(xd_dma);
- clear_dma_ff(xd_dma);
- set_dma_mode(xd_dma,mode);
- set_dma_addr(xd_dma, (unsigned long) buffer);
- set_dma_count(xd_dma,count);
-
- release_dma_lock(f);
-
- return (DMA_MODE); /* use DMA and INT */
-}
-
-/* xd_build: put stuff into an array in a format suitable for the controller */
-static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control)
-{
- cmdblk[0] = command;
- cmdblk[1] = ((drive & 0x07) << 5) | (head & 0x1F);
- cmdblk[2] = ((cylinder & 0x300) >> 2) | (sector & 0x3F);
- cmdblk[3] = cylinder & 0xFF;
- cmdblk[4] = count;
- cmdblk[5] = control;
-
- return (cmdblk);
-}
-
-static void xd_watchdog (unsigned long unused)
-{
- xd_error = 1;
- wake_up(&xd_wait_int);
-}
-
-/* xd_waitport: waits until port & mask == flags or a timeout occurs. return 1 for a timeout */
-static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout)
-{
- u_long expiry = jiffies + timeout;
- int success;
-
- xdc_busy = 1;
- while ((success = ((inb(port) & mask) != flags)) && time_before(jiffies, expiry))
- schedule_timeout_uninterruptible(1);
- xdc_busy = 0;
- return (success);
-}
-
-static inline u_int xd_wait_for_IRQ (void)
-{
- unsigned long flags;
- xd_watchdog_int.expires = jiffies + 8 * HZ;
- add_timer(&xd_watchdog_int);
-
- flags=claim_dma_lock();
- enable_dma(xd_dma);
- release_dma_lock(flags);
-
- sleep_on(&xd_wait_int);
- del_timer(&xd_watchdog_int);
- xdc_busy = 0;
-
- flags=claim_dma_lock();
- disable_dma(xd_dma);
- release_dma_lock(flags);
-
- if (xd_error) {
- printk("xd: missed IRQ - command aborted\n");
- xd_error = 0;
- return (1);
- }
- return (0);
-}
-
-/* xd_command: handle all data transfers necessary for a single command */
-static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout)
-{
- u_char cmdblk[6],csb,complete = 0;
-
-#ifdef DEBUG_COMMAND
- printk("xd_command: command = 0x%X, mode = 0x%X, indata = 0x%X, outdata = 0x%X, sense = 0x%X\n",command,mode,indata,outdata,sense);
-#endif /* DEBUG_COMMAND */
-
- outb(0,XD_SELECT);
- outb(mode,XD_CONTROL);
-
- if (xd_waitport(XD_STATUS,STAT_SELECT,STAT_SELECT,timeout))
- return (1);
-
- while (!complete) {
- if (xd_waitport(XD_STATUS,STAT_READY,STAT_READY,timeout))
- return (1);
-
- switch (inb(XD_STATUS) & (STAT_COMMAND | STAT_INPUT)) {
- case 0:
- if (mode == DMA_MODE) {
- if (xd_wait_for_IRQ())
- return (1);
- } else
- outb(outdata ? *outdata++ : 0,XD_DATA);
- break;
- case STAT_INPUT:
- if (mode == DMA_MODE) {
- if (xd_wait_for_IRQ())
- return (1);
- } else
- if (indata)
- *indata++ = inb(XD_DATA);
- else
- inb(XD_DATA);
- break;
- case STAT_COMMAND:
- outb(command ? *command++ : 0,XD_DATA);
- break;
- case STAT_COMMAND | STAT_INPUT:
- complete = 1;
- break;
- }
- }
- csb = inb(XD_DATA);
-
- if (xd_waitport(XD_STATUS,0,STAT_SELECT,timeout)) /* wait until deselected */
- return (1);
-
- if (csb & CSB_ERROR) { /* read sense data if error */
- xd_build(cmdblk,CMD_SENSE,(csb & CSB_LUN) >> 5,0,0,0,0,0);
- if (xd_command(cmdblk,0,sense,NULL,NULL,XD_TIMEOUT))
- printk("xd: warning! sense command failed!\n");
- }
-
-#ifdef DEBUG_COMMAND
- printk("xd_command: completed with csb = 0x%X\n",csb);
-#endif /* DEBUG_COMMAND */
-
- return (csb & CSB_ERROR);
-}
-
-static u_char __init xd_initdrives (void (*init_drive)(u_char drive))
-{
- u_char cmdblk[6],i,count = 0;
-
- for (i = 0; i < XD_MAXDRIVES; i++) {
- xd_build(cmdblk,CMD_TESTREADY,i,0,0,0,0,0);
- if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT*8)) {
- msleep_interruptible(XD_INIT_DISK_DELAY);
-
- init_drive(count);
- count++;
-
- msleep_interruptible(XD_INIT_DISK_DELAY);
- }
- }
- return (count);
-}
-
-static void __init xd_manual_geo_set (u_char drive)
-{
- xd_info[drive].heads = (u_char)(xd_geo[3 * drive + 1]);
- xd_info[drive].cylinders = (u_short)(xd_geo[3 * drive]);
- xd_info[drive].sectors = (u_char)(xd_geo[3 * drive + 2]);
-}
-
-static void __init xd_dtc_init_controller (unsigned int address)
-{
- switch (address) {
- case 0x00000:
- case 0xC8000: break; /*initial: 0x320 */
- case 0xCA000: xd_iobase = 0x324;
- case 0xD0000: /*5150CX*/
- case 0xD8000: break; /*5150CX & 5150XL*/
- default: printk("xd_dtc_init_controller: unsupported BIOS address %06x\n",address);
- break;
- }
- xd_maxsectors = 0x01; /* my card seems to have trouble doing multi-block transfers? */
-
- outb(0,XD_RESET); /* reset the controller */
-}
-
-
-static void __init xd_dtc5150cx_init_drive (u_char drive)
-{
- /* values from controller's BIOS - BIOS chip may be removed */
- static u_short geometry_table[][4] = {
- {0x200,8,0x200,0x100},
- {0x267,2,0x267,0x267},
- {0x264,4,0x264,0x80},
- {0x132,4,0x132,0x0},
- {0x132,2,0x80, 0x132},
- {0x177,8,0x177,0x0},
- {0x132,8,0x84, 0x0},
- {}, /* not used */
- {0x132,6,0x80, 0x100},
- {0x200,6,0x100,0x100},
- {0x264,2,0x264,0x80},
- {0x280,4,0x280,0x100},
- {0x2B9,3,0x2B9,0x2B9},
- {0x2B9,5,0x2B9,0x2B9},
- {0x280,6,0x280,0x100},
- {0x132,4,0x132,0x0}};
- u_char n;
-
- n = inb(XD_JUMPER);
- n = (drive ? n : (n >> 2)) & 0x33;
- n = (n | (n >> 2)) & 0x0F;
- if (xd_geo[3*drive])
- xd_manual_geo_set(drive);
- else
- if (n != 7) {
- xd_info[drive].heads = (u_char)(geometry_table[n][1]); /* heads */
- xd_info[drive].cylinders = geometry_table[n][0]; /* cylinders */
- xd_info[drive].sectors = 17; /* sectors */
-#if 0
- xd_info[drive].rwrite = geometry_table[n][2]; /* reduced write */
- xd_info[drive].precomp = geometry_table[n][3] /* write precomp */
- xd_info[drive].ecc = 0x0B; /* ecc length */
-#endif /* 0 */
- }
- else {
- printk("xd%c: undetermined drive geometry\n",'a'+drive);
- return;
- }
- xd_info[drive].control = 5; /* control byte */
- xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B);
- xd_recalibrate(drive);
-}
-
-static void __init xd_dtc_init_drive (u_char drive)
-{
- u_char cmdblk[6],buf[64];
-
- xd_build(cmdblk,CMD_DTCGETGEOM,drive,0,0,0,0,0);
- if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) {
- xd_info[drive].heads = buf[0x0A]; /* heads */
- xd_info[drive].cylinders = ((u_short *) (buf))[0x04]; /* cylinders */
- xd_info[drive].sectors = 17; /* sectors */
- if (xd_geo[3*drive])
- xd_manual_geo_set(drive);
-#if 0
- xd_info[drive].rwrite = ((u_short *) (buf + 1))[0x05]; /* reduced write */
- xd_info[drive].precomp = ((u_short *) (buf + 1))[0x06]; /* write precomp */
- xd_info[drive].ecc = buf[0x0F]; /* ecc length */
-#endif /* 0 */
- xd_info[drive].control = 0; /* control byte */
-
- xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,((u_short *) (buf + 1))[0x05],((u_short *) (buf + 1))[0x06],buf[0x0F]);
- xd_build(cmdblk,CMD_DTCSETSTEP,drive,0,0,0,0,7);
- if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2))
- printk("xd_dtc_init_drive: error setting step rate for xd%c\n", 'a'+drive);
- }
- else
- printk("xd_dtc_init_drive: error reading geometry for xd%c\n", 'a'+drive);
-}
-
-static void __init xd_wd_init_controller (unsigned int address)
-{
- switch (address) {
- case 0x00000:
- case 0xC8000: break; /*initial: 0x320 */
- case 0xCA000: xd_iobase = 0x324; break;
- case 0xCC000: xd_iobase = 0x328; break;
- case 0xCE000: xd_iobase = 0x32C; break;
- case 0xD0000: xd_iobase = 0x328; break; /* ? */
- case 0xD8000: xd_iobase = 0x32C; break; /* ? */
- default: printk("xd_wd_init_controller: unsupported BIOS address %06x\n",address);
- break;
- }
- xd_maxsectors = 0x01; /* this one doesn't wrap properly either... */
-
- outb(0,XD_RESET); /* reset the controller */
-
- msleep(XD_INIT_DISK_DELAY);
-}
-
-static void __init xd_wd_init_drive (u_char drive)
-{
- /* values from controller's BIOS - BIOS may be disabled */
- static u_short geometry_table[][4] = {
- {0x264,4,0x1C2,0x1C2}, /* common part */
- {0x132,4,0x099,0x0},
- {0x267,2,0x1C2,0x1C2},
- {0x267,4,0x1C2,0x1C2},
-
- {0x334,6,0x335,0x335}, /* 1004 series RLL */
- {0x30E,4,0x30F,0x3DC},
- {0x30E,2,0x30F,0x30F},
- {0x267,4,0x268,0x268},
-
- {0x3D5,5,0x3D6,0x3D6}, /* 1002 series RLL */
- {0x3DB,7,0x3DC,0x3DC},
- {0x264,4,0x265,0x265},
- {0x267,4,0x268,0x268}};
-
- u_char cmdblk[6],buf[0x200];
- u_char n = 0,rll,jumper_state,use_jumper_geo;
- u_char wd_1002 = (xd_sigs[xd_type].string[7] == '6');
-
- jumper_state = ~(inb(0x322));
- if (jumper_state & 0x40)
- xd_irq = 9;
- rll = (jumper_state & 0x30) ? (0x04 << wd_1002) : 0;
- xd_build(cmdblk,CMD_READ,drive,0,0,0,1,0);
- if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) {
- xd_info[drive].heads = buf[0x1AF]; /* heads */
- xd_info[drive].cylinders = ((u_short *) (buf + 1))[0xD6]; /* cylinders */
- xd_info[drive].sectors = 17; /* sectors */
- if (xd_geo[3*drive])
- xd_manual_geo_set(drive);
-#if 0
- xd_info[drive].rwrite = ((u_short *) (buf))[0xD8]; /* reduced write */
- xd_info[drive].wprecomp = ((u_short *) (buf))[0xDA]; /* write precomp */
- xd_info[drive].ecc = buf[0x1B4]; /* ecc length */
-#endif /* 0 */
- xd_info[drive].control = buf[0x1B5]; /* control byte */
- use_jumper_geo = !(xd_info[drive].heads) || !(xd_info[drive].cylinders);
- if (xd_geo[3*drive]) {
- xd_manual_geo_set(drive);
- xd_info[drive].control = rll ? 7 : 5;
- }
- else if (use_jumper_geo) {
- n = (((jumper_state & 0x0F) >> (drive << 1)) & 0x03) | rll;
- xd_info[drive].cylinders = geometry_table[n][0];
- xd_info[drive].heads = (u_char)(geometry_table[n][1]);
- xd_info[drive].control = rll ? 7 : 5;
-#if 0
- xd_info[drive].rwrite = geometry_table[n][2];
- xd_info[drive].wprecomp = geometry_table[n][3];
- xd_info[drive].ecc = 0x0B;
-#endif /* 0 */
- }
- if (!wd_1002) {
- if (use_jumper_geo)
- xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,
- geometry_table[n][2],geometry_table[n][3],0x0B);
- else
- xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,
- ((u_short *) (buf))[0xD8],((u_short *) (buf))[0xDA],buf[0x1B4]);
- }
- /* 1002 based RLL controller requests converted addressing, but reports physical
- (physical 26 sec., logical 17 sec.)
- 1004 based ???? */
- if (rll & wd_1002) {
- if ((xd_info[drive].cylinders *= 26,
- xd_info[drive].cylinders /= 17) > 1023)
- xd_info[drive].cylinders = 1023; /* 1024 ? */
-#if 0
- xd_info[drive].rwrite *= 26;
- xd_info[drive].rwrite /= 17;
- xd_info[drive].wprecomp *= 26
- xd_info[drive].wprecomp /= 17;
-#endif /* 0 */
- }
- }
- else
- printk("xd_wd_init_drive: error reading geometry for xd%c\n",'a'+drive);
-
-}
-
-static void __init xd_seagate_init_controller (unsigned int address)
-{
- switch (address) {
- case 0x00000:
- case 0xC8000: break; /*initial: 0x320 */
- case 0xD0000: xd_iobase = 0x324; break;
- case 0xD8000: xd_iobase = 0x328; break;
- case 0xE0000: xd_iobase = 0x32C; break;
- default: printk("xd_seagate_init_controller: unsupported BIOS address %06x\n",address);
- break;
- }
- xd_maxsectors = 0x40;
-
- outb(0,XD_RESET); /* reset the controller */
-}
-
-static void __init xd_seagate_init_drive (u_char drive)
-{
- u_char cmdblk[6],buf[0x200];
-
- xd_build(cmdblk,CMD_ST11GETGEOM,drive,0,0,0,1,0);
- if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) {
- xd_info[drive].heads = buf[0x04]; /* heads */
- xd_info[drive].cylinders = (buf[0x02] << 8) | buf[0x03]; /* cylinders */
- xd_info[drive].sectors = buf[0x05]; /* sectors */
- xd_info[drive].control = 0; /* control byte */
- }
- else
- printk("xd_seagate_init_drive: error reading geometry from xd%c\n", 'a'+drive);
-}
-
-/* Omti support courtesy Dirk Melchers */
-static void __init xd_omti_init_controller (unsigned int address)
-{
- switch (address) {
- case 0x00000:
- case 0xC8000: break; /*initial: 0x320 */
- case 0xD0000: xd_iobase = 0x324; break;
- case 0xD8000: xd_iobase = 0x328; break;
- case 0xE0000: xd_iobase = 0x32C; break;
- default: printk("xd_omti_init_controller: unsupported BIOS address %06x\n",address);
- break;
- }
-
- xd_maxsectors = 0x40;
-
- outb(0,XD_RESET); /* reset the controller */
-}
-
-static void __init xd_omti_init_drive (u_char drive)
-{
- /* gets infos from drive */
- xd_override_init_drive(drive);
-
- /* set other parameters, Hardcoded, not that nice :-) */
- xd_info[drive].control = 2;
-}
-
-/* Xebec support (AK) */
-static void __init xd_xebec_init_controller (unsigned int address)
-{
-/* iobase may be set manually in range 0x300 - 0x33C
- irq may be set manually to 2(9),3,4,5,6,7
- dma may be set manually to 1,2,3
- (How to detect them ???)
-BIOS address may be set manually in range 0x0 - 0xF8000
-If you need non-standard settings use the xd=... command */
-
- switch (address) {
- case 0x00000:
- case 0xC8000: /* initially: xd_iobase==0x320 */
- case 0xD0000:
- case 0xD2000:
- case 0xD4000:
- case 0xD6000:
- case 0xD8000:
- case 0xDA000:
- case 0xDC000:
- case 0xDE000:
- case 0xE0000: break;
- default: printk("xd_xebec_init_controller: unsupported BIOS address %06x\n",address);
- break;
- }
-
- xd_maxsectors = 0x01;
- outb(0,XD_RESET); /* reset the controller */
-
- msleep(XD_INIT_DISK_DELAY);
-}
-
-static void __init xd_xebec_init_drive (u_char drive)
-{
- /* values from controller's BIOS - BIOS chip may be removed */
- static u_short geometry_table[][5] = {
- {0x132,4,0x080,0x080,0x7},
- {0x132,4,0x080,0x080,0x17},
- {0x264,2,0x100,0x100,0x7},
- {0x264,2,0x100,0x100,0x17},
- {0x132,8,0x080,0x080,0x7},
- {0x132,8,0x080,0x080,0x17},
- {0x264,4,0x100,0x100,0x6},
- {0x264,4,0x100,0x100,0x17},
- {0x2BC,5,0x2BC,0x12C,0x6},
- {0x3A5,4,0x3A5,0x3A5,0x7},
- {0x26C,6,0x26C,0x26C,0x7},
- {0x200,8,0x200,0x100,0x17},
- {0x400,5,0x400,0x400,0x7},
- {0x400,6,0x400,0x400,0x7},
- {0x264,8,0x264,0x200,0x17},
- {0x33E,7,0x33E,0x200,0x7}};
- u_char n;
-
- n = inb(XD_JUMPER) & 0x0F; /* BIOS's drive number: same geometry
- is assumed for BOTH drives */
- if (xd_geo[3*drive])
- xd_manual_geo_set(drive);
- else {
- xd_info[drive].heads = (u_char)(geometry_table[n][1]); /* heads */
- xd_info[drive].cylinders = geometry_table[n][0]; /* cylinders */
- xd_info[drive].sectors = 17; /* sectors */
-#if 0
- xd_info[drive].rwrite = geometry_table[n][2]; /* reduced write */
- xd_info[drive].precomp = geometry_table[n][3] /* write precomp */
- xd_info[drive].ecc = 0x0B; /* ecc length */
-#endif /* 0 */
- }
- xd_info[drive].control = geometry_table[n][4]; /* control byte */
- xd_setparam(CMD_XBSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B);
- xd_recalibrate(drive);
-}
-
-/* xd_override_init_drive: this finds disk geometry in a "binary search" style, narrowing in on the "correct" number of heads
- etc. by trying values until it gets the highest successful value. Idea courtesy Salvador Abreu (spa@fct.unl.pt). */
-static void __init xd_override_init_drive (u_char drive)
-{
- u_short min[] = { 0,0,0 },max[] = { 16,1024,64 },test[] = { 0,0,0 };
- u_char cmdblk[6],i;
-
- if (xd_geo[3*drive])
- xd_manual_geo_set(drive);
- else {
- for (i = 0; i < 3; i++) {
- while (min[i] != max[i] - 1) {
- test[i] = (min[i] + max[i]) / 2;
- xd_build(cmdblk,CMD_SEEK,drive,(u_char) test[0],(u_short) test[1],(u_char) test[2],0,0);
- if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2))
- min[i] = test[i];
- else
- max[i] = test[i];
- }
- test[i] = min[i];
- }
- xd_info[drive].heads = (u_char) min[0] + 1;
- xd_info[drive].cylinders = (u_short) min[1] + 1;
- xd_info[drive].sectors = (u_char) min[2] + 1;
- }
- xd_info[drive].control = 0;
-}
-
-/* xd_setup: initialise controller from command line parameters */
-static void __init do_xd_setup (int *integers)
-{
- switch (integers[0]) {
- case 4: if (integers[4] < 0)
- nodma = 1;
- else if (integers[4] < 8)
- xd_dma = integers[4];
- case 3: if ((integers[3] > 0) && (integers[3] <= 0x3FC))
- xd_iobase = integers[3];
- case 2: if ((integers[2] > 0) && (integers[2] < 16))
- xd_irq = integers[2];
- case 1: xd_override = 1;
- if ((integers[1] >= 0) && (integers[1] < ARRAY_SIZE(xd_sigs)))
- xd_type = integers[1];
- case 0: break;
- default:printk("xd: too many parameters for xd\n");
- }
- xd_maxsectors = 0x01;
-}
-
-/* xd_setparam: set the drive characteristics */
-static void __init xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc)
-{
- u_char cmdblk[14];
-
- xd_build(cmdblk,command,drive,0,0,0,0,0);
- cmdblk[6] = (u_char) (cylinders >> 8) & 0x03;
- cmdblk[7] = (u_char) (cylinders & 0xFF);
- cmdblk[8] = heads & 0x1F;
- cmdblk[9] = (u_char) (rwrite >> 8) & 0x03;
- cmdblk[10] = (u_char) (rwrite & 0xFF);
- cmdblk[11] = (u_char) (wprecomp >> 8) & 0x03;
- cmdblk[12] = (u_char) (wprecomp & 0xFF);
- cmdblk[13] = ecc;
-
- /* Some controllers require geometry info as data, not command */
-
- if (xd_command(cmdblk,PIO_MODE,NULL,&cmdblk[6],NULL,XD_TIMEOUT * 2))
- printk("xd: error setting characteristics for xd%c\n", 'a'+drive);
-}
-
-
-#ifdef MODULE
-
-module_param_array(xd, int, NULL, 0);
-module_param_array(xd_geo, int, NULL, 0);
-module_param(nodma, bool, 0);
-
-MODULE_LICENSE("GPL");
-
-void cleanup_module(void)
-{
- int i;
- unregister_blkdev(XT_DISK_MAJOR, "xd");
- for (i = 0; i < xd_drives; i++) {
- del_gendisk(xd_gendisk[i]);
- put_disk(xd_gendisk[i]);
- }
- blk_cleanup_queue(xd_queue);
- release_region(xd_iobase,4);
- if (xd_drives) {
- free_irq(xd_irq, NULL);
- free_dma(xd_dma);
- if (xd_dma_buffer)
- xd_dma_mem_free((unsigned long)xd_dma_buffer, xd_maxsectors * 0x200);
- }
-}
-#else
-
-static int __init xd_setup (char *str)
-{
- int ints[5];
- get_options (str, ARRAY_SIZE (ints), ints);
- do_xd_setup (ints);
- return 1;
-}
-
-/* xd_manual_geo_init: initialise drive geometry from command line parameters
- (used only for WD drives) */
-static int __init xd_manual_geo_init (char *str)
-{
- int i, integers[1 + 3*XD_MAXDRIVES];
-
- get_options (str, ARRAY_SIZE (integers), integers);
- if (integers[0]%3 != 0) {
- printk("xd: incorrect number of parameters for xd_geo\n");
- return 1;
- }
- for (i = 0; (i < integers[0]) && (i < 3*XD_MAXDRIVES); i++)
- xd_geo[i] = integers[i+1];
- return 1;
-}
-
-__setup ("xd=", xd_setup);
-__setup ("xd_geo=", xd_manual_geo_init);
-
-#endif /* MODULE */
-
-module_init(xd_init);
-MODULE_ALIAS_BLOCKDEV_MAJOR(XT_DISK_MAJOR);
diff --git a/drivers/block/xd.h b/drivers/block/xd.h
deleted file mode 100644
index 37cacef16e93..000000000000
--- a/drivers/block/xd.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#ifndef _LINUX_XD_H
-#define _LINUX_XD_H
-
-/*
- * This file contains the definitions for the IO ports and errors etc. for XT hard disk controllers (at least the DTC 5150X).
- *
- * Author: Pat Mackinlay, pat@it.com.au
- * Date: 29/09/92
- *
- * Revised: 01/01/93, ...
- *
- * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler, kevinf@agora.rain.com)
- * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and Wim Van Dorst.
- */
-
-#include <linux/interrupt.h>
-
-/* XT hard disk controller registers */
-#define XD_DATA (xd_iobase + 0x00) /* data RW register */
-#define XD_RESET (xd_iobase + 0x01) /* reset WO register */
-#define XD_STATUS (xd_iobase + 0x01) /* status RO register */
-#define XD_SELECT (xd_iobase + 0x02) /* select WO register */
-#define XD_JUMPER (xd_iobase + 0x02) /* jumper RO register */
-#define XD_CONTROL (xd_iobase + 0x03) /* DMAE/INTE WO register */
-#define XD_RESERVED (xd_iobase + 0x03) /* reserved */
-
-/* XT hard disk controller commands (incomplete list) */
-#define CMD_TESTREADY 0x00 /* test drive ready */
-#define CMD_RECALIBRATE 0x01 /* recalibrate drive */
-#define CMD_SENSE 0x03 /* request sense */
-#define CMD_FORMATDRV 0x04 /* format drive */
-#define CMD_VERIFY 0x05 /* read verify */
-#define CMD_FORMATTRK 0x06 /* format track */
-#define CMD_FORMATBAD 0x07 /* format bad track */
-#define CMD_READ 0x08 /* read */
-#define CMD_WRITE 0x0A /* write */
-#define CMD_SEEK 0x0B /* seek */
-
-/* Controller specific commands */
-#define CMD_DTCSETPARAM 0x0C /* set drive parameters (DTC 5150X & CX only?) */
-#define CMD_DTCGETECC 0x0D /* get ecc error length (DTC 5150X only?) */
-#define CMD_DTCREADBUF 0x0E /* read sector buffer (DTC 5150X only?) */
-#define CMD_DTCWRITEBUF 0x0F /* write sector buffer (DTC 5150X only?) */
-#define CMD_DTCREMAPTRK 0x11 /* assign alternate track (DTC 5150X only?) */
-#define CMD_DTCGETPARAM 0xFB /* get drive parameters (DTC 5150X only?) */
-#define CMD_DTCSETSTEP 0xFC /* set step rate (DTC 5150X only?) */
-#define CMD_DTCSETGEOM 0xFE /* set geometry data (DTC 5150X only?) */
-#define CMD_DTCGETGEOM 0xFF /* get geometry data (DTC 5150X only?) */
-#define CMD_ST11GETGEOM 0xF8 /* get geometry data (Seagate ST11R/M only?) */
-#define CMD_WDSETPARAM 0x0C /* set drive parameters (WD 1004A27X only?) */
-#define CMD_XBSETPARAM 0x0C /* set drive parameters (XEBEC only?) */
-
-/* Bits for command status byte */
-#define CSB_ERROR 0x02 /* error */
-#define CSB_LUN 0x20 /* logical Unit Number */
-
-/* XT hard disk controller status bits */
-#define STAT_READY 0x01 /* controller is ready */
-#define STAT_INPUT 0x02 /* data flowing from controller to host */
-#define STAT_COMMAND 0x04 /* controller in command phase */
-#define STAT_SELECT 0x08 /* controller is selected */
-#define STAT_REQUEST 0x10 /* controller requesting data */
-#define STAT_INTERRUPT 0x20 /* controller requesting interrupt */
-
-/* XT hard disk controller control bits */
-#define PIO_MODE 0x00 /* control bits to set for PIO */
-#define DMA_MODE 0x03 /* control bits to set for DMA & interrupt */
-
-#define XD_MAXDRIVES 2 /* maximum 2 drives */
-#define XD_TIMEOUT HZ /* 1 second timeout */
-#define XD_RETRIES 4 /* maximum 4 retries */
-
-#undef DEBUG /* define for debugging output */
-
-#ifdef DEBUG
- #define DEBUG_STARTUP /* debug driver initialisation */
- #define DEBUG_OVERRIDE /* debug override geometry detection */
- #define DEBUG_READWRITE /* debug each read/write command */
- #define DEBUG_OTHER /* debug misc. interrupt/DMA stuff */
- #define DEBUG_COMMAND /* debug each controller command */
-#endif /* DEBUG */
-
-/* this structure defines the XT drives and their types */
-typedef struct {
- u_char heads;
- u_short cylinders;
- u_char sectors;
- u_char control;
- int unit;
-} XD_INFO;
-
-/* this structure defines a ROM BIOS signature */
-typedef struct {
- unsigned int offset;
- const char *string;
- void (*init_controller)(unsigned int address);
- void (*init_drive)(u_char drive);
- const char *name;
-} XD_SIGNATURE;
-
-#ifndef MODULE
-static int xd_manual_geo_init (char *command);
-#endif /* MODULE */
-static u_char xd_detect (u_char *controller, unsigned int *address);
-static u_char xd_initdrives (void (*init_drive)(u_char drive));
-
-static void do_xd_request (struct request_queue * q);
-static int xd_ioctl (struct block_device *bdev,fmode_t mode,unsigned int cmd,unsigned long arg);
-static int xd_readwrite (u_char operation,XD_INFO *disk,char *buffer,u_int block,u_int count);
-static void xd_recalibrate (u_char drive);
-
-static irqreturn_t xd_interrupt_handler(int irq, void *dev_id);
-static u_char xd_setup_dma (u_char opcode,u_char *buffer,u_int count);
-static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control);
-static void xd_watchdog (unsigned long unused);
-static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout);
-static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout);
-
-/* card specific setup and geometry gathering code */
-static void xd_dtc_init_controller (unsigned int address);
-static void xd_dtc5150cx_init_drive (u_char drive);
-static void xd_dtc_init_drive (u_char drive);
-static void xd_wd_init_controller (unsigned int address);
-static void xd_wd_init_drive (u_char drive);
-static void xd_seagate_init_controller (unsigned int address);
-static void xd_seagate_init_drive (u_char drive);
-static void xd_omti_init_controller (unsigned int address);
-static void xd_omti_init_drive (u_char drive);
-static void xd_xebec_init_controller (unsigned int address);
-static void xd_xebec_init_drive (u_char drive);
-static void xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc);
-static void xd_override_init_drive (u_char drive);
-
-#endif /* _LINUX_XD_H */
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 5ac841ff6cc7..de1f319f7bd7 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -46,6 +46,7 @@
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
#include "common.h"
/*
@@ -239,6 +240,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
ret = gnttab_unmap_refs(unmap, NULL, pages,
segs_to_unmap);
BUG_ON(ret);
+ free_xenballooned_pages(segs_to_unmap, pages);
segs_to_unmap = 0;
}
@@ -527,8 +529,8 @@ static int xen_blkbk_map(struct blkif_request *req,
GFP_KERNEL);
if (!persistent_gnt)
return -ENOMEM;
- persistent_gnt->page = alloc_page(GFP_KERNEL);
- if (!persistent_gnt->page) {
+ if (alloc_xenballooned_pages(1, &persistent_gnt->page,
+ false)) {
kfree(persistent_gnt);
return -ENOMEM;
}
@@ -879,7 +881,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
goto fail_response;
}
- preq.dev = req->u.rw.handle;
preq.sector_number = req->u.rw.sector_number;
preq.nr_sects = 0;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 63980722db41..5e237f630c47 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -367,6 +367,7 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
be->blkif = NULL;
}
+ kfree(be->mode);
kfree(be);
dev_set_drvdata(&dev->dev, NULL);
return 0;
@@ -502,6 +503,7 @@ static void backend_changed(struct xenbus_watch *watch,
= container_of(watch, struct backend_info, backend_watch);
struct xenbus_device *dev = be->dev;
int cdrom = 0;
+ unsigned long handle;
char *device_type;
DPRINTK("");
@@ -521,10 +523,10 @@ static void backend_changed(struct xenbus_watch *watch,
return;
}
- if ((be->major || be->minor) &&
- ((be->major != major) || (be->minor != minor))) {
- pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n",
- be->major, be->minor, major, minor);
+ if (be->major | be->minor) {
+ if (be->major != major || be->minor != minor)
+ pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n",
+ be->major, be->minor, major, minor);
return;
}
@@ -542,36 +544,33 @@ static void backend_changed(struct xenbus_watch *watch,
kfree(device_type);
}
- if (be->major == 0 && be->minor == 0) {
- /* Front end dir is a number, which is used as the handle. */
-
- char *p = strrchr(dev->otherend, '/') + 1;
- long handle;
- err = strict_strtoul(p, 0, &handle);
- if (err)
- return;
+ /* Front end dir is a number, which is used as the handle. */
+ err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle);
+ if (err)
+ return;
- be->major = major;
- be->minor = minor;
+ be->major = major;
+ be->minor = minor;
- err = xen_vbd_create(be->blkif, handle, major, minor,
- (NULL == strchr(be->mode, 'w')), cdrom);
- if (err) {
- be->major = 0;
- be->minor = 0;
- xenbus_dev_fatal(dev, err, "creating vbd structure");
- return;
- }
+ err = xen_vbd_create(be->blkif, handle, major, minor,
+ !strchr(be->mode, 'w'), cdrom);
+ if (err)
+ xenbus_dev_fatal(dev, err, "creating vbd structure");
+ else {
err = xenvbd_sysfs_addif(dev);
if (err) {
xen_vbd_free(&be->blkif->vbd);
- be->major = 0;
- be->minor = 0;
xenbus_dev_fatal(dev, err, "creating sysfs entries");
- return;
}
+ }
+ if (err) {
+ kfree(be->mode);
+ be->mode = NULL;
+ be->major = 0;
+ be->minor = 0;
+ } else {
/* We're potentially connected now */
xen_update_blkif_status(be->blkif);
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 11043c18ac5a..c3dae2e0f290 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -791,7 +791,7 @@ static void blkif_restart_queue(struct work_struct *work)
static void blkif_free(struct blkfront_info *info, int suspend)
{
struct llist_node *all_gnts;
- struct grant *persistent_gnt;
+ struct grant *persistent_gnt, *tmp;
struct llist_node *n;
/* Prevent new requests being issued until we fix things up. */
@@ -805,10 +805,17 @@ static void blkif_free(struct blkfront_info *info, int suspend)
/* Remove all persistent grants */
if (info->persistent_gnts_c) {
all_gnts = llist_del_all(&info->persistent_gnts);
- llist_for_each_entry_safe(persistent_gnt, n, all_gnts, node) {
+ persistent_gnt = llist_entry(all_gnts, typeof(*(persistent_gnt)), node);
+ while (persistent_gnt) {
gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
__free_page(pfn_to_page(persistent_gnt->pfn));
- kfree(persistent_gnt);
+ tmp = persistent_gnt;
+ n = persistent_gnt->node.next;
+ if (n)
+ persistent_gnt = llist_entry(n, typeof(*(persistent_gnt)), node);
+ else
+ persistent_gnt = NULL;
+ kfree(tmp);
}
info->persistent_gnts_c = 0;
}