aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-12-08 08:19:16 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-08 08:19:16 -0800
commit6035ccd8e9e40bb654fbfdef325902ab531679a5 (patch)
treec1810d8a4d4ef150cdf14af72e6087dfc3f4b6e0 /drivers
parentMerge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jgarzik/libata-dev (diff)
parentcfq-iosched: Do not access cfqq after freeing it (diff)
downloadlinux-dev-6035ccd8e9e40bb654fbfdef325902ab531679a5.tar.xz
linux-dev-6035ccd8e9e40bb654fbfdef325902ab531679a5.zip
Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits) cfq-iosched: Do not access cfqq after freeing it block: include linux/err.h to use ERR_PTR cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit blkio: Allow CFQ group IO scheduling even when CFQ is a module blkio: Implement dynamic io controlling policy registration blkio: Export some symbols from blkio as its user CFQ can be a module block: Fix io_context leak after failure of clone with CLONE_IO block: Fix io_context leak after clone with CLONE_IO cfq-iosched: make nonrot check logic consistent io controller: quick fix for blk-cgroup and modular CFQ cfq-iosched: move IO controller declerations to a header file cfq-iosched: fix compile problem with !CONFIG_CGROUP blkio: Documentation blkio: Wait on sync-noidle queue even if rq_noidle = 1 blkio: Implement group_isolation tunable blkio: Determine async workload length based on total number of queues blkio: Wait for cfq queue to get backlogged if group is empty blkio: Propagate cgroup weight updation to cfq groups blkio: Drop the reference to queue once the task changes cgroup blkio: Provide some isolation between groups ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/cciss.c544
-rw-r--r--drivers/block/cciss.h18
-rw-r--r--drivers/block/cciss_cmd.h7
-rw-r--r--drivers/block/cciss_scsi.c4
-rw-r--r--drivers/block/drbd/Kconfig71
-rw-r--r--drivers/block/drbd/Makefile5
-rw-r--r--drivers/block/drbd/drbd_actlog.c1424
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1327
-rw-r--r--drivers/block/drbd/drbd_int.h2252
-rw-r--r--drivers/block/drbd/drbd_main.c3699
-rw-r--r--drivers/block/drbd/drbd_nl.c2364
-rw-r--r--drivers/block/drbd/drbd_proc.c265
-rw-r--r--drivers/block/drbd/drbd_receiver.c4426
-rw-r--r--drivers/block/drbd/drbd_req.c1125
-rw-r--r--drivers/block/drbd/drbd_req.h326
-rw-r--r--drivers/block/drbd/drbd_strings.c113
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c1512
-rw-r--r--drivers/block/drbd/drbd_wrappers.h91
-rw-r--r--drivers/block/ps3vram.c10
-rw-r--r--drivers/mtd/mtd_blkdevs.c2
-rw-r--r--drivers/staging/pohmelfs/inode.c10
24 files changed, 19651 insertions, 298 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
instead, which can be configured to be on-disk compatible with the
cryptoloop device.
+source "drivers/block/drbd/Kconfig"
+
config BLK_DEV_NBD
tristate "Network block device support"
depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 92b126394fa1..873e594860d3 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
static int deregister_disk(ctlr_info_t *h, int drv_index,
int clear_all, int via_ioctl);
-static void cciss_read_capacity(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity(int ctlr, int logvol,
sector_t *total_size, unsigned int *block_size);
-static void cciss_read_capacity_16(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity_16(int ctlr, int logvol,
sector_t *total_size, unsigned int *block_size);
static void cciss_geometry_inquiry(int ctlr, int logvol,
- int withirq, sector_t total_size,
+ sector_t total_size,
unsigned int block_size, InquiryData_struct *inq_buff,
drive_info_struct *drv);
static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
__u32);
static void start_io(ctlr_info_t *h);
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
- __u8 page_code, unsigned char *scsi3addr, int cmd_type);
static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
__u8 page_code, unsigned char scsi3addr[],
int cmd_type);
@@ -424,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
struct seq_file *seq = file->private_data;
ctlr_info_t *h = seq->private;
- int rc;
- rc = cciss_engage_scsi(h->ctlr);
- if (rc != 0)
- err = -rc;
- else
+ err = cciss_engage_scsi(h->ctlr);
+ if (err == 0)
err = length;
} else
#endif /* CONFIG_CISS_SCSI_TAPE */
@@ -1657,9 +1652,11 @@ static void cciss_softirq_done(struct request *rq)
{
CommandList_struct *cmd = rq->completion_data;
ctlr_info_t *h = hba[cmd->ctlr];
+ SGDescriptor_struct *curr_sg = cmd->SG;
unsigned long flags;
u64bit temp64;
int i, ddir;
+ int sg_index = 0;
if (cmd->Request.Type.Direction == XFER_READ)
ddir = PCI_DMA_FROMDEVICE;
@@ -1669,9 +1666,22 @@ static void cciss_softirq_done(struct request *rq)
/* command did not need to be retried */
/* unmap the DMA mapping for all the scatter gather elements */
for (i = 0; i < cmd->Header.SGList; i++) {
- temp64.val32.lower = cmd->SG[i].Addr.lower;
- temp64.val32.upper = cmd->SG[i].Addr.upper;
- pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
+ if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
+ temp64.val32.lower = cmd->SG[i].Addr.lower;
+ temp64.val32.upper = cmd->SG[i].Addr.upper;
+ pci_dma_sync_single_for_cpu(h->pdev, temp64.val,
+ cmd->SG[i].Len, ddir);
+ pci_unmap_single(h->pdev, temp64.val,
+ cmd->SG[i].Len, ddir);
+ /* Point to the next block */
+ curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain;
+ sg_index = 0;
+ }
+ temp64.val32.lower = curr_sg[sg_index].Addr.lower;
+ temp64.val32.upper = curr_sg[sg_index].Addr.upper;
+ pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
+ ddir);
+ ++sg_index;
}
#ifdef CCISS_DEBUG
@@ -1701,7 +1711,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
* via the inquiry page 0. Model, vendor, and rev are set to empty strings if
* they cannot be read.
*/
-static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
+static void cciss_get_device_descr(int ctlr, int logvol,
char *vendor, char *model, char *rev)
{
int rc;
@@ -1717,14 +1727,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
return;
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
- if (withirq)
- rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
- sizeof(InquiryData_struct), 0,
- scsi3addr, TYPE_CMD);
- else
- rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
- sizeof(InquiryData_struct), 0,
- scsi3addr, TYPE_CMD);
+ rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0,
+ scsi3addr, TYPE_CMD);
if (rc == IO_OK) {
memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
vendor[VENDOR_LEN] = '\0';
@@ -1743,7 +1747,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
* number cannot be had, for whatever reason, 16 bytes of 0xff
* are returned instead.
*/
-static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
+static void cciss_get_serial_no(int ctlr, int logvol,
unsigned char *serial_no, int buflen)
{
#define PAGE_83_INQ_BYTES 64
@@ -1759,12 +1763,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
return;
memset(serial_no, 0, buflen);
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
- if (withirq)
- rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
- PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
- else
- rc = sendcmd(CISS_INQUIRY, ctlr, buf,
- PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
+ rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
+ PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
if (rc == IO_OK)
memcpy(serial_no, &buf[8], buflen);
kfree(buf);
@@ -1793,10 +1793,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
/* This is a hardware imposed limit. */
- blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES);
+ blk_queue_max_hw_segments(disk->queue, h->maxsgentries);
/* This is a limit in the driver and could be eliminated. */
- blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
+ blk_queue_max_phys_segments(disk->queue, h->maxsgentries);
blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
@@ -1852,18 +1852,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
/* testing to see if 16-byte CDBs are already being used */
if (h->cciss_read == CCISS_READ_16) {
- cciss_read_capacity_16(h->ctlr, drv_index, 1,
+ cciss_read_capacity_16(h->ctlr, drv_index,
&total_size, &block_size);
} else {
- cciss_read_capacity(ctlr, drv_index, 1,
- &total_size, &block_size);
-
+ cciss_read_capacity(ctlr, drv_index, &total_size, &block_size);
/* if read_capacity returns all F's this volume is >2TB */
/* in size so we switch to 16-byte CDB's for all */
/* read/write ops */
if (total_size == 0xFFFFFFFFULL) {
- cciss_read_capacity_16(ctlr, drv_index, 1,
+ cciss_read_capacity_16(ctlr, drv_index,
&total_size, &block_size);
h->cciss_read = CCISS_READ_16;
h->cciss_write = CCISS_WRITE_16;
@@ -1873,14 +1871,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
}
}
- cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size,
+ cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size,
inq_buff, drvinfo);
drvinfo->block_size = block_size;
drvinfo->nr_blocks = total_size + 1;
- cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
+ cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor,
drvinfo->model, drvinfo->rev);
- cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
+ cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no,
sizeof(drvinfo->serial_no));
/* Save the lunid in case we deregister the disk, below. */
memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
@@ -2531,6 +2529,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
case 0: return IO_OK; /* no sense */
case 1: return IO_OK; /* recovered error */
default:
+ if (check_for_unit_attention(h, c))
+ return IO_NEEDS_RETRY;
printk(KERN_WARNING "cciss%d: cmd 0x%02x "
"check condition, sense key = 0x%02x\n",
h->ctlr, c->Request.CDB[0],
@@ -2672,7 +2672,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
}
static void cciss_geometry_inquiry(int ctlr, int logvol,
- int withirq, sector_t total_size,
+ sector_t total_size,
unsigned int block_size,
InquiryData_struct *inq_buff,
drive_info_struct *drv)
@@ -2683,14 +2683,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
memset(inq_buff, 0, sizeof(InquiryData_struct));
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
- if (withirq)
- return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
- inq_buff, sizeof(*inq_buff),
- 0xC1, scsi3addr, TYPE_CMD);
- else
- return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
- sizeof(*inq_buff), 0xC1, scsi3addr,
- TYPE_CMD);
+ return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff,
+ sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
if (return_code == IO_OK) {
if (inq_buff->data_byte[8] == 0xFF) {
printk(KERN_WARNING
@@ -2723,7 +2717,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
}
static void
-cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
+cciss_read_capacity(int ctlr, int logvol, sector_t *total_size,
unsigned int *block_size)
{
ReadCapdata_struct *buf;
@@ -2737,14 +2731,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
}
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
- if (withirq)
- return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
- ctlr, buf, sizeof(ReadCapdata_struct),
- 0, scsi3addr, TYPE_CMD);
- else
- return_code = sendcmd(CCISS_READ_CAPACITY,
- ctlr, buf, sizeof(ReadCapdata_struct),
- 0, scsi3addr, TYPE_CMD);
+ return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf,
+ sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
if (return_code == IO_OK) {
*total_size = be32_to_cpu(*(__be32 *) buf->total_size);
*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2756,8 +2744,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
kfree(buf);
}
-static void
-cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size)
+static void cciss_read_capacity_16(int ctlr, int logvol,
+ sector_t *total_size, unsigned int *block_size)
{
ReadCapdata_struct_16 *buf;
int return_code;
@@ -2770,16 +2758,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
}
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
- if (withirq) {
- return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
- ctlr, buf, sizeof(ReadCapdata_struct_16),
- 0, scsi3addr, TYPE_CMD);
- }
- else {
- return_code = sendcmd(CCISS_READ_CAPACITY_16,
- ctlr, buf, sizeof(ReadCapdata_struct_16),
- 0, scsi3addr, TYPE_CMD);
- }
+ return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
+ ctlr, buf, sizeof(ReadCapdata_struct_16),
+ 0, scsi3addr, TYPE_CMD);
if (return_code == IO_OK) {
*total_size = be64_to_cpu(*(__be64 *) buf->total_size);
*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2820,13 +2801,13 @@ static int cciss_revalidate(struct gendisk *disk)
return 1;
}
if (h->cciss_read == CCISS_READ_10) {
- cciss_read_capacity(h->ctlr, logvol, 1,
+ cciss_read_capacity(h->ctlr, logvol,
&total_size, &block_size);
} else {
- cciss_read_capacity_16(h->ctlr, logvol, 1,
+ cciss_read_capacity_16(h->ctlr, logvol,
&total_size, &block_size);
}
- cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
+ cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size,
inq_buff, drv);
blk_queue_logical_block_size(drv->queue, drv->block_size);
@@ -2837,167 +2818,6 @@ static int cciss_revalidate(struct gendisk *disk)
}
/*
- * Wait polling for a command to complete.
- * The memory mapped FIFO is polled for the completion.
- * Used only at init time, interrupts from the HBA are disabled.
- */
-static unsigned long pollcomplete(int ctlr)
-{
- unsigned long done;
- int i;
-
- /* Wait (up to 20 seconds) for a command to complete */
-
- for (i = 20 * HZ; i > 0; i--) {
- done = hba[ctlr]->access.command_completed(hba[ctlr]);
- if (done == FIFO_EMPTY)
- schedule_timeout_uninterruptible(1);
- else
- return done;
- }
- /* Invalid address to tell caller we ran out of time */
- return 1;
-}
-
-/* Send command c to controller h and poll for it to complete.
- * Turns interrupts off on the board. Used at driver init time
- * and during SCSI error recovery.
- */
-static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
-{
- int i;
- unsigned long complete;
- int status = IO_ERROR;
- u64bit buff_dma_handle;
-
-resend_cmd1:
-
- /* Disable interrupt on the board. */
- h->access.set_intr_mask(h, CCISS_INTR_OFF);
-
- /* Make sure there is room in the command FIFO */
- /* Actually it should be completely empty at this time */
- /* unless we are in here doing error handling for the scsi */
- /* tape side of the driver. */
- for (i = 200000; i > 0; i--) {
- /* if fifo isn't full go */
- if (!(h->access.fifo_full(h)))
- break;
- udelay(10);
- printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
- " waiting!\n", h->ctlr);
- }
- h->access.submit_command(h, c); /* Send the cmd */
- do {
- complete = pollcomplete(h->ctlr);
-
-#ifdef CCISS_DEBUG
- printk(KERN_DEBUG "cciss: command completed\n");
-#endif /* CCISS_DEBUG */
-
- if (complete == 1) {
- printk(KERN_WARNING
- "cciss cciss%d: SendCmd Timeout out, "
- "No command list address returned!\n", h->ctlr);
- status = IO_ERROR;
- break;
- }
-
- /* Make sure it's the command we're expecting. */
- if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
- printk(KERN_WARNING "cciss%d: Unexpected command "
- "completion.\n", h->ctlr);
- continue;
- }
-
- /* It is our command. If no error, we're done. */
- if (!(complete & CISS_ERROR_BIT)) {
- status = IO_OK;
- break;
- }
-
- /* There is an error... */
-
- /* if data overrun or underun on Report command ignore it */
- if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
- (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
- (c->Request.CDB[0] == CISS_INQUIRY)) &&
- ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
- (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
- complete = c->busaddr;
- status = IO_OK;
- break;
- }
-
- if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
- printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
- h->ctlr, c);
- if (c->retry_count < MAX_CMD_RETRIES) {
- printk(KERN_WARNING "cciss%d: retrying %p\n",
- h->ctlr, c);
- c->retry_count++;
- /* erase the old error information */
- memset(c->err_info, 0, sizeof(c->err_info));
- goto resend_cmd1;
- }
- printk(KERN_WARNING "cciss%d: retried %p too many "
- "times\n", h->ctlr, c);
- status = IO_ERROR;
- break;
- }
-
- if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
- printk(KERN_WARNING "cciss%d: command could not be "
- "aborted.\n", h->ctlr);
- status = IO_ERROR;
- break;
- }
-
- if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
- status = check_target_status(h, c);
- break;
- }
-
- printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
- printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
- c->Request.CDB[0], c->err_info->CommandStatus);
- status = IO_ERROR;
- break;
-
- } while (1);
-
- /* unlock the data buffer from DMA */
- buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
- buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
- pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
- c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
- return status;
-}
-
-/*
- * Send a command to the controller, and wait for it to complete.
- * Used at init time, and during SCSI error recovery.
- */
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
- __u8 page_code, unsigned char *scsi3addr, int cmd_type)
-{
- CommandList_struct *c;
- int status;
-
- c = cmd_alloc(hba[ctlr], 1);
- if (!c) {
- printk(KERN_WARNING "cciss: unable to get memory");
- return IO_ERROR;
- }
- status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
- scsi3addr, cmd_type);
- if (status == IO_OK)
- status = sendcmd_core(hba[ctlr], c);
- cmd_free(hba[ctlr], c, 1);
- return status;
-}
-
-/*
* Map (physical) PCI mem into (virtual) kernel space
*/
static void __iomem *remap_pci_mem(ulong base, ulong size)
@@ -3255,9 +3075,13 @@ static void do_cciss_request(struct request_queue *q)
int seg;
struct request *creq;
u64bit temp64;
- struct scatterlist tmp_sg[MAXSGENTRIES];
+ struct scatterlist *tmp_sg;
+ SGDescriptor_struct *curr_sg;
drive_info_struct *drv;
int i, dir;
+ int nseg = 0;
+ int sg_index = 0;
+ int chained = 0;
/* We call start_io here in case there is a command waiting on the
* queue that has not been sent.
@@ -3270,13 +3094,14 @@ static void do_cciss_request(struct request_queue *q)
if (!creq)
goto startio;
- BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);
+ BUG_ON(creq->nr_phys_segments > h->maxsgentries);
if ((c = cmd_alloc(h, 1)) == NULL)
goto full;
blk_start_request(creq);
+ tmp_sg = h->scatter_list[c->cmdindex];
spin_unlock_irq(q->queue_lock);
c->cmd_type = CMD_RWREQ;
@@ -3305,7 +3130,7 @@ static void do_cciss_request(struct request_queue *q)
(int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
#endif /* CCISS_DEBUG */
- sg_init_table(tmp_sg, MAXSGENTRIES);
+ sg_init_table(tmp_sg, h->maxsgentries);
seg = blk_rq_map_sg(q, creq, tmp_sg);
/* get the DMA records for the setup */
@@ -3314,25 +3139,70 @@ static void do_cciss_request(struct request_queue *q)
else
dir = PCI_DMA_TODEVICE;
+ curr_sg = c->SG;
+ sg_index = 0;
+ chained = 0;
+
for (i = 0; i < seg; i++) {
- c->SG[i].Len = tmp_sg[i].length;
+ if (((sg_index+1) == (h->max_cmd_sgentries)) &&
+ !chained && ((seg - i) > 1)) {
+ nseg = seg - i;
+ curr_sg[sg_index].Len = (nseg) *
+ sizeof(SGDescriptor_struct);
+ curr_sg[sg_index].Ext = CCISS_SG_CHAIN;
+
+ /* Point to next chain block. */
+ curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain;
+ sg_index = 0;
+ chained = 1;
+ }
+ curr_sg[sg_index].Len = tmp_sg[i].length;
temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
- tmp_sg[i].offset,
- tmp_sg[i].length, dir);
- c->SG[i].Addr.lower = temp64.val32.lower;
- c->SG[i].Addr.upper = temp64.val32.upper;
- c->SG[i].Ext = 0; // we are not chaining
+ tmp_sg[i].offset,
+ tmp_sg[i].length, dir);
+ curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+ curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+ curr_sg[sg_index].Ext = 0; /* we are not chaining */
+
+ ++sg_index;
+ }
+
+ if (chained) {
+ int len;
+ curr_sg = c->SG;
+ sg_index = h->max_cmd_sgentries - 1;
+ len = curr_sg[sg_index].Len;
+ /* Setup pointer to next chain block.
+ * Fill out last element in current chain
+ * block with address of next chain block.
+ */
+ temp64.val = pci_map_single(h->pdev,
+ h->cmd_sg_list[c->cmdindex]->sgchain,
+ len, dir);
+
+ h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val;
+ curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+ curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+
+ pci_dma_sync_single_for_device(h->pdev,
+ h->cmd_sg_list[c->cmdindex]->sg_chain_dma,
+ len, dir);
}
+
/* track how many SG entries we are using */
if (seg > h->maxSG)
h->maxSG = seg;
#ifdef CCISS_DEBUG
- printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
- blk_rq_sectors(creq), seg);
+ printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments "
+ "chained[%d]\n",
+ blk_rq_sectors(creq), seg, chained);
#endif /* CCISS_DEBUG */
- c->Header.SGList = c->Header.SGTotal = seg;
+ c->Header.SGList = c->Header.SGTotal = seg + chained;
+ if (seg > h->max_cmd_sgentries)
+ c->Header.SGList = h->max_cmd_sgentries;
+
if (likely(blk_fs_request(creq))) {
if(h->cciss_read == CCISS_READ_10) {
c->Request.CDB[1] = 0;
@@ -3513,28 +3383,33 @@ static int add_to_scan_list(struct ctlr_info *h)
* @h: Pointer to the controller.
*
* Removes the controller from the rescan queue if present. Blocks if
- * the controller is currently conducting a rescan.
+ * the controller is currently conducting a rescan. The controller
+ * can be in one of three states:
+ * 1. Doesn't need a scan
+ * 2. On the scan list, but not scanning yet (we remove it)
+ * 3. Busy scanning (and not on the list). In this case we want to wait for
+ * the scan to complete to make sure the scanning thread for this
+ * controller is completely idle.
**/
static void remove_from_scan_list(struct ctlr_info *h)
{
struct ctlr_info *test_h, *tmp_h;
- int scanning = 0;
mutex_lock(&scan_mutex);
list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
- if (test_h == h) {
+ if (test_h == h) { /* state 2. */
list_del(&h->scan_list);
complete_all(&h->scan_wait);
mutex_unlock(&scan_mutex);
return;
}
}
- if (&h->busy_scanning)
- scanning = 0;
- mutex_unlock(&scan_mutex);
-
- if (scanning)
+ if (h->busy_scanning) { /* state 3. */
+ mutex_unlock(&scan_mutex);
wait_for_completion(&h->scan_wait);
+ } else { /* state 1, nothing to do. */
+ mutex_unlock(&scan_mutex);
+ }
}
/**
@@ -3573,13 +3448,11 @@ static int scan_thread(void *data)
h->busy_scanning = 1;
mutex_unlock(&scan_mutex);
- if (h) {
- rebuild_lun_table(h, 0, 0);
- complete_all(&h->scan_wait);
- mutex_lock(&scan_mutex);
- h->busy_scanning = 0;
- mutex_unlock(&scan_mutex);
- }
+ rebuild_lun_table(h, 0, 0);
+ complete_all(&h->scan_wait);
+ mutex_lock(&scan_mutex);
+ h->busy_scanning = 0;
+ mutex_unlock(&scan_mutex);
}
}
@@ -3605,8 +3478,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
case REPORT_LUNS_CHANGED:
printk(KERN_WARNING "cciss%d: report LUN data "
"changed\n", h->ctlr);
- add_to_scan_list(h);
- wake_up_process(cciss_scan_thread);
+ /*
+ * Here, we could call add_to_scan_list and wake up the scan thread,
+ * except that it's quite likely that we will get more than one
+ * REPORT_LUNS_CHANGED condition in quick succession, which means
+ * that those which occur after the first one will likely happen
+ * *during* the scan_thread's rescan. And the rescan code is not
+ * robust enough to restart in the middle, undoing what it has already
+ * done, and it's not clear that it's even possible to do this, since
+ * part of what it does is notify the block layer, which starts
+ * doing it's own i/o to read partition tables and so on, and the
+ * driver doesn't have visibility to know what might need undoing.
+ * In any event, if possible, it is horribly complicated to get right
+ * so we just don't do it for now.
+ *
+ * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
+ */
return 1;
break;
case POWER_OR_RESET:
@@ -3888,6 +3775,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
* leave a little room for ioctl calls.
*/
c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
+ c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
+
+ /*
+ * Limit native command to 32 s/g elements to save dma'able memory.
+ * Howvever spec says if 0, use 31
+ */
+
+ c->max_cmd_sgentries = 31;
+ if (c->maxsgentries > 512) {
+ c->max_cmd_sgentries = 32;
+ c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1;
+ c->maxsgentries -= 1; /* account for chain pointer */
+ } else {
+ c->maxsgentries = 31; /* Default to traditional value */
+ c->chainsize = 0; /* traditional */
+ }
+
c->product_name = products[prod_index].product_name;
c->access = *(products[prod_index].access);
c->nr_cmds = c->max_commands - 4;
@@ -4214,6 +4118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
{
int i;
int j = 0;
+ int k = 0;
int rc;
int dac, return_code;
InquiryData_struct *inq_buff;
@@ -4317,6 +4222,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
printk(KERN_ERR "cciss: out of memory");
goto clean4;
}
+
+ /* Need space for temp scatter list */
+ hba[i]->scatter_list = kmalloc(hba[i]->max_commands *
+ sizeof(struct scatterlist *),
+ GFP_KERNEL);
+ for (k = 0; k < hba[i]->nr_cmds; k++) {
+ hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
+ hba[i]->maxsgentries,
+ GFP_KERNEL);
+ if (hba[i]->scatter_list[k] == NULL) {
+ printk(KERN_ERR "cciss%d: could not allocate "
+ "s/g lists\n", i);
+ goto clean4;
+ }
+ }
+ hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) *
+ hba[i]->nr_cmds,
+ GFP_KERNEL);
+ if (!hba[i]->cmd_sg_list) {
+ printk(KERN_ERR "cciss%d: Cannot get memory for "
+ "s/g chaining.\n", i);
+ goto clean4;
+ }
+ /* Build up chain blocks for each command */
+ if (hba[i]->chainsize > 0) {
+ for (j = 0; j < hba[i]->nr_cmds; j++) {
+ hba[i]->cmd_sg_list[j] =
+ kmalloc(sizeof(struct Cmd_sg_list),
+ GFP_KERNEL);
+ if (!hba[i]->cmd_sg_list[j]) {
+ printk(KERN_ERR "cciss%d: Cannot get memory "
+ "for chain block.\n", i);
+ goto clean4;
+ }
+ /* Need a block of chainsized s/g elements. */
+ hba[i]->cmd_sg_list[j]->sgchain =
+ kmalloc((hba[i]->chainsize *
+ sizeof(SGDescriptor_struct)),
+ GFP_KERNEL);
+ if (!hba[i]->cmd_sg_list[j]->sgchain) {
+ printk(KERN_ERR "cciss%d: Cannot get memory "
+ "for s/g chains\n", i);
+ goto clean4;
+ }
+ }
+ }
+
spin_lock_init(&hba[i]->lock);
/* Initialize the pdev driver private data.
@@ -4362,7 +4314,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
cciss_procinit(i);
- hba[i]->cciss_max_sectors = 2048;
+ hba[i]->cciss_max_sectors = 8192;
rebuild_lun_table(hba[i], 1, 0);
hba[i]->busy_initializing = 0;
@@ -4370,6 +4322,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
clean4:
kfree(hba[i]->cmd_pool_bits);
+ /* Free up sg elements */
+ for (k = 0; k < hba[i]->nr_cmds; k++)
+ kfree(hba[i]->scatter_list[k]);
+ kfree(hba[i]->scatter_list);
+ /* Only free up extra s/g lists if controller supports them */
+ if (hba[i]->chainsize > 0) {
+ for (j = 0; j < hba[i]->nr_cmds; j++) {
+ if (hba[i]->cmd_sg_list[j]) {
+ kfree(hba[i]->cmd_sg_list[j]->sgchain);
+ kfree(hba[i]->cmd_sg_list[j]);
+ }
+ }
+ kfree(hba[i]->cmd_sg_list);
+ }
if (hba[i]->cmd_pool)
pci_free_consistent(hba[i]->pdev,
hba[i]->nr_cmds * sizeof(CommandList_struct),
@@ -4400,30 +4366,28 @@ clean_no_release_regions:
static void cciss_shutdown(struct pci_dev *pdev)
{
- ctlr_info_t *tmp_ptr;
- int i;
- char flush_buf[4];
+ ctlr_info_t *h;
+ char *flush_buf;
int return_code;
- tmp_ptr = pci_get_drvdata(pdev);
- if (tmp_ptr == NULL)
- return;
- i = tmp_ptr->ctlr;
- if (hba[i] == NULL)
+ h = pci_get_drvdata(pdev);
+ flush_buf = kzalloc(4, GFP_KERNEL);
+ if (!flush_buf) {
+ printk(KERN_WARNING
+ "cciss:%d cache not flushed, out of memory.\n",
+ h->ctlr);
return;
-
- /* Turn board interrupts off and send the flush cache command */
- /* sendcmd will turn off interrupt, and send the flush...
- * To write all data in the battery backed cache to disks */
- memset(flush_buf, 0, 4);
- return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
- CTLR_LUNID, TYPE_CMD);
- if (return_code == IO_OK) {
- printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
- } else {
- printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
}
- free_irq(hba[i]->intr[2], hba[i]);
+ /* write all data in the battery backed cache to disk */
+ memset(flush_buf, 0, 4);
+ return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf,
+ 4, 0, CTLR_LUNID, TYPE_CMD);
+ kfree(flush_buf);
+ if (return_code != IO_OK)
+ printk(KERN_WARNING "cciss%d: Error flushing cache\n",
+ h->ctlr);
+ h->access.set_intr_mask(h, CCISS_INTR_OFF);
+ free_irq(h->intr[2], h);
}
static void __devexit cciss_remove_one(struct pci_dev *pdev)
@@ -4485,6 +4449,20 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
kfree(hba[i]->cmd_pool_bits);
+ /* Free up sg elements */
+ for (j = 0; j < hba[i]->nr_cmds; j++)
+ kfree(hba[i]->scatter_list[j]);
+ kfree(hba[i]->scatter_list);
+ /* Only free up extra s/g lists if controller supports them */
+ if (hba[i]->chainsize > 0) {
+ for (j = 0; j < hba[i]->nr_cmds; j++) {
+ if (hba[i]->cmd_sg_list[j]) {
+ kfree(hba[i]->cmd_sg_list[j]->sgchain);
+ kfree(hba[i]->cmd_sg_list[j]);
+ }
+ }
+ kfree(hba[i]->cmd_sg_list);
+ }
/*
* Deliberately omit pci_disable_device(): it does something nasty to
* Smart Array controllers that pci_enable_device does not undo
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 31524cf42c77..1d95db254069 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -55,7 +55,13 @@ typedef struct _drive_info_struct
char device_initialized; /* indicates whether dev is initialized */
} drive_info_struct;
-struct ctlr_info
+struct Cmd_sg_list {
+ SGDescriptor_struct *sgchain;
+ dma_addr_t sg_chain_dma;
+ int chain_block_size;
+};
+
+struct ctlr_info
{
int ctlr;
char devname[8];
@@ -75,6 +81,16 @@ struct ctlr_info
int num_luns;
int highest_lun;
int usage_count; /* number of opens all all minor devices */
+ /* Need space for temp sg list
+ * number of scatter/gathers supported
+ * number of scatter/gathers in chained block
+ */
+ struct scatterlist **scatter_list;
+ int maxsgentries;
+ int chainsize;
+ int max_cmd_sgentries;
+ struct Cmd_sg_list **cmd_sg_list;
+
# define DOORBELL_INT 0
# define PERF_MODE_INT 1
# define SIMPLE_MODE_INT 2
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index dbaed1ea0da3..b50a9b261b85 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -7,7 +7,8 @@
//general boundary defintions
#define SENSEINFOBYTES 32//note that this value may vary between host implementations
-#define MAXSGENTRIES 31
+#define MAXSGENTRIES 32
+#define CCISS_SG_CHAIN 0x80000000
#define MAXREPLYQS 256
//Command Status value
@@ -319,6 +320,10 @@ typedef struct _CfgTable_struct {
BYTE ServerName[16];
DWORD HeartBeat;
DWORD SCSI_Prefetch;
+ DWORD MaxSGElements;
+ DWORD MaxLogicalUnits;
+ DWORD MaxPhysicalDrives;
+ DWORD MaxPhysicalDrivesPerLogicalUnit;
} CfgTable_struct;
#pragma pack()
#endif // CCISS_CMD_H
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 3315268b4ec7..5d0e46dc3632 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
cp,
ei->ScsiStatus);
#endif
- cmd->result |= (ei->ScsiStatus < 1);
+ cmd->result |= (ei->ScsiStatus << 1);
}
else { /* scsi status is zero??? How??? */
@@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr)
if (sa->registered) {
printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
- return ENXIO;
+ return -ENXIO;
}
sa->registered = 1;
spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000000..f4acd04ebeef
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,71 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+ depends on !PROC_FS || !INET || !CONNECTOR
+
+config BLK_DEV_DRBD
+ tristate "DRBD Distributed Replicated Block Device support"
+ depends on PROC_FS && INET && CONNECTOR
+ select LRU_CACHE
+ default n
+ help
+
+ NOTE: In order to authenticate connections you have to select
+ CRYPTO_HMAC and a hash function as well.
+
+ DRBD is a shared-nothing, synchronously replicated block device. It
+ is designed to serve as a building block for high availability
+ clusters and in this context, is a "drop-in" replacement for shared
+ storage. Simplistically, you could see it as a network RAID 1.
+
+ Each minor device has a role, which can be 'primary' or 'secondary'.
+ On the node with the primary device the application is supposed to
+ run and to access the device (/dev/drbdX). Every write is sent to
+ the local 'lower level block device' and, across the network, to the
+ node with the device in 'secondary' state. The secondary device
+ simply writes the data to its lower level block device.
+
+ DRBD can also be used in dual-Primary mode (device writable on both
+ nodes), which means it can exhibit shared disk semantics in a
+ shared-nothing cluster. Needless to say, on top of dual-Primary
+ DRBD utilizing a cluster file system is necessary to maintain for
+ cache coherency.
+
+ For automatic failover you need a cluster manager (e.g. heartbeat).
+ See also: http://www.drbd.org/, http://www.linux-ha.org
+
+ If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+ bool "DRBD fault injection"
+ depends on BLK_DEV_DRBD
+ help
+
+ Say Y here if you want to simulate IO errors, in order to test DRBD's
+ behavior.
+
+ The actual simulation of IO errors is done by writing 3 values to
+ /sys/module/drbd/parameters/
+
+ enable_faults: bitmask of...
+ 1 meta data write
+ 2 read
+ 4 resync data write
+ 8 read
+ 16 data write
+ 32 data read
+ 64 read ahead
+ 128 kmalloc of bitmap
+ 256 allocation of EE (epoch_entries)
+
+ fault_devs: bitmask of minor numbers
+ fault_rate: frequency in percent
+
+ Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+ echo 16 > /sys/module/drbd/parameters/enable_faults
+ echo 1 > /sys/module/drbd/parameters/fault_devs
+ echo 5 > /sys/module/drbd/parameters/fault_rate
+
+ If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000000..0d3f337ff5ff
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,5 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000000..17956ff6a08d
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1424 @@
+/*
+ drbd_actlog.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+
+/* We maintain a trivial check sum in our on disk activity log.
+ * With that we can ensure correct operation even when the storage
+ * device might do a partial (last) sector write while loosing power.
+ */
+struct __packed al_transaction {
+ u32 magic;
+ u32 tr_number;
+ struct __packed {
+ u32 pos;
+ u32 extent; } updates[1 + AL_EXTENTS_PT];
+ u32 xor_sum;
+};
+
+struct update_odbm_work {
+ struct drbd_work w;
+ unsigned int enr;
+};
+
+struct update_al_work {
+ struct drbd_work w;
+ struct lc_element *al_ext;
+ struct completion event;
+ unsigned int enr;
+ /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
+ unsigned int old_enr;
+};
+
+struct drbd_atodb_wait {
+ atomic_t count;
+ struct completion io_done;
+ struct drbd_conf *mdev;
+ int error;
+};
+
+
+int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+
+static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev,
+ struct page *page, sector_t sector,
+ int rw, int size)
+{
+ struct bio *bio;
+ struct drbd_md_io md_io;
+ int ok;
+
+ md_io.mdev = mdev;
+ init_completion(&md_io.event);
+ md_io.error = 0;
+
+ if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
+ rw |= (1 << BIO_RW_BARRIER);
+ rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
+
+ retry:
+ bio = bio_alloc(GFP_NOIO, 1);
+ bio->bi_bdev = bdev->md_bdev;
+ bio->bi_sector = sector;
+ ok = (bio_add_page(bio, page, size, 0) == size);
+ if (!ok)
+ goto out;
+ bio->bi_private = &md_io;
+ bio->bi_end_io = drbd_md_io_complete;
+ bio->bi_rw = rw;
+
+ if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ bio_endio(bio, -EIO);
+ else
+ submit_bio(rw, bio);
+ wait_for_completion(&md_io.event);
+ ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+
+ /* check for unsupported barrier op.
+ * would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0 */
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
+ /* Try again with no barrier */
+ dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
+ set_bit(MD_NO_BARRIER, &mdev->flags);
+ rw &= ~(1 << BIO_RW_BARRIER);
+ bio_put(bio);
+ goto retry;
+ }
+ out:
+ bio_put(bio);
+ return ok;
+}
+
+int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ int logical_block_size, mask, ok;
+ int offset = 0;
+ struct page *iop = mdev->md_io_page;
+
+ D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+
+ BUG_ON(!bdev->md_bdev);
+
+ logical_block_size = bdev_logical_block_size(bdev->md_bdev);
+ if (logical_block_size == 0)
+ logical_block_size = MD_SECTOR_SIZE;
+
+ /* in case logical_block_size != 512 [ s390 only? ] */
+ if (logical_block_size != MD_SECTOR_SIZE) {
+ mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
+ D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+ D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
+ offset = sector & mask;
+ sector = sector & ~mask;
+ iop = mdev->md_io_tmpp;
+
+ if (rw & WRITE) {
+ /* these are GFP_KERNEL pages, pre-allocated
+ * on device initialization */
+ void *p = page_address(mdev->md_io_page);
+ void *hp = page_address(mdev->md_io_tmpp);
+
+ ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
+ READ, logical_block_size);
+
+ if (unlikely(!ok)) {
+ dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
+ "READ [logical_block_size!=512]) failed!\n",
+ (unsigned long long)sector);
+ return 0;
+ }
+
+ memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
+ }
+ }
+
+ if (sector < drbd_md_first_sector(bdev) ||
+ sector > drbd_md_last_sector(bdev))
+ dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+ ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
+ if (unlikely(!ok)) {
+ dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+ return 0;
+ }
+
+ if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
+ void *p = page_address(mdev->md_io_page);
+ void *hp = page_address(mdev->md_io_tmpp);
+
+ memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+ }
+
+ return ok;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+{
+ struct lc_element *al_ext;
+ struct lc_element *tmp;
+ unsigned long al_flags = 0;
+
+ spin_lock_irq(&mdev->al_lock);
+ tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ spin_unlock_irq(&mdev->al_lock);
+ return NULL;
+ }
+ }
+ al_ext = lc_get(mdev->act_log, enr);
+ al_flags = mdev->act_log->flags;
+ spin_unlock_irq(&mdev->al_lock);
+
+ /*
+ if (!al_ext) {
+ if (al_flags & LC_STARVING)
+ dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
+ if (al_flags & LC_DIRTY)
+ dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
+ }
+ */
+
+ return al_ext;
+}
+
+void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+ struct lc_element *al_ext;
+ struct update_al_work al_work;
+
+ D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+ wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+
+ if (al_ext->lc_number != enr) {
+ /* drbd_al_write_transaction(mdev,al_ext,enr);
+ * recurses into generic_make_request(), which
+ * disallows recursion, bios being serialized on the
+ * current->bio_tail list now.
+ * we have to delegate updates to the activity log
+ * to the worker thread. */
+ init_completion(&al_work.event);
+ al_work.al_ext = al_ext;
+ al_work.enr = enr;
+ al_work.old_enr = al_ext->lc_number;
+ al_work.w.cb = w_al_write_transaction;
+ drbd_queue_work_front(&mdev->data.work, &al_work.w);
+ wait_for_completion(&al_work.event);
+
+ mdev->al_writ_cnt++;
+
+ spin_lock_irq(&mdev->al_lock);
+ lc_changed(mdev->act_log, al_ext);
+ spin_unlock_irq(&mdev->al_lock);
+ wake_up(&mdev->al_wait);
+ }
+}
+
+void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+ struct lc_element *extent;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mdev->al_lock, flags);
+
+ extent = lc_find(mdev->act_log, enr);
+
+ if (!extent) {
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+ dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+ return;
+ }
+
+ if (lc_put(mdev->act_log, extent) == 0)
+ wake_up(&mdev->al_wait);
+
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+int
+w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct update_al_work *aw = container_of(w, struct update_al_work, w);
+ struct lc_element *updated = aw->al_ext;
+ const unsigned int new_enr = aw->enr;
+ const unsigned int evicted = aw->old_enr;
+ struct al_transaction *buffer;
+ sector_t sector;
+ int i, n, mx;
+ unsigned int extent_nr;
+ u32 xor_sum = 0;
+
+ if (!get_ldev(mdev)) {
+ dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+ complete(&((struct update_al_work *)w)->event);
+ return 1;
+ }
+ /* do we have to do a bitmap write, first?
+ * TODO reduce maximum latency:
+ * submit both bios, then wait for both,
+ * instead of doing two synchronous sector writes. */
+ if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
+ drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+
+ mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+ buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+
+ buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+ buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+ n = lc_index_of(mdev->act_log, updated);
+
+ buffer->updates[0].pos = cpu_to_be32(n);
+ buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+ xor_sum ^= new_enr;
+
+ mx = min_t(int, AL_EXTENTS_PT,
+ mdev->act_log->nr_elements - mdev->al_tr_cycle);
+ for (i = 0; i < mx; i++) {
+ unsigned idx = mdev->al_tr_cycle + i;
+ extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
+ buffer->updates[i+1].pos = cpu_to_be32(idx);
+ buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+ xor_sum ^= extent_nr;
+ }
+ for (; i < AL_EXTENTS_PT; i++) {
+ buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+ buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+ xor_sum ^= LC_FREE;
+ }
+ mdev->al_tr_cycle += AL_EXTENTS_PT;
+ if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
+ mdev->al_tr_cycle = 0;
+
+ buffer->xor_sum = cpu_to_be32(xor_sum);
+
+ sector = mdev->ldev->md.md_offset
+ + mdev->ldev->md.al_offset + mdev->al_tr_pos;
+
+ if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+ drbd_chk_io_error(mdev, 1, TRUE);
+
+ if (++mdev->al_tr_pos >
+ div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+ mdev->al_tr_pos = 0;
+
+ D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+ mdev->al_tr_number++;
+
+ mutex_unlock(&mdev->md_io_mutex);
+
+ complete(&((struct update_al_work *)w)->event);
+ put_ldev(mdev);
+
+ return 1;
+}
+
+/**
+ * drbd_al_read_tr() - Read a single transaction from the on disk activity log
+ * @mdev: DRBD device.
+ * @bdev: Block device to read form.
+ * @b: pointer to an al_transaction.
+ * @index: On disk slot of the transaction to read.
+ *
+ * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+ */
+static int drbd_al_read_tr(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev,
+ struct al_transaction *b,
+ int index)
+{
+ sector_t sector;
+ int rv, i;
+ u32 xor_sum = 0;
+
+ sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+ /* Dont process error normally,
+ * as this is done before disk is attached! */
+ if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
+ return -1;
+
+ rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+
+ for (i = 0; i < AL_EXTENTS_PT + 1; i++)
+ xor_sum ^= be32_to_cpu(b->updates[i].extent);
+ rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+ return rv;
+}
+
+/**
+ * drbd_al_read_log() - Restores the activity log from its on disk representation.
+ * @mdev: DRBD device.
+ * @bdev: Block device to read form.
+ *
+ * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+ struct al_transaction *buffer;
+ int i;
+ int rv;
+ int mx;
+ int active_extents = 0;
+ int transactions = 0;
+ int found_valid = 0;
+ int from = 0;
+ int to = 0;
+ u32 from_tnr = 0;
+ u32 to_tnr = 0;
+ u32 cnr;
+
+ mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+
+ /* lock out all other meta data io for now,
+ * and make sure the page is mapped.
+ */
+ mutex_lock(&mdev->md_io_mutex);
+ buffer = page_address(mdev->md_io_page);
+
+ /* Find the valid transaction in the log */
+ for (i = 0; i <= mx; i++) {
+ rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+ if (rv == 0)
+ continue;
+ if (rv == -1) {
+ mutex_unlock(&mdev->md_io_mutex);
+ return 0;
+ }
+ cnr = be32_to_cpu(buffer->tr_number);
+
+ if (++found_valid == 1) {
+ from = i;
+ to = i;
+ from_tnr = cnr;
+ to_tnr = cnr;
+ continue;
+ }
+ if ((int)cnr - (int)from_tnr < 0) {
+ D_ASSERT(from_tnr - cnr + i - from == mx+1);
+ from = i;
+ from_tnr = cnr;
+ }
+ if ((int)cnr - (int)to_tnr > 0) {
+ D_ASSERT(cnr - to_tnr == i - to);
+ to = i;
+ to_tnr = cnr;
+ }
+ }
+
+ if (!found_valid) {
+ dev_warn(DEV, "No usable activity log found.\n");
+ mutex_unlock(&mdev->md_io_mutex);
+ return 1;
+ }
+
+ /* Read the valid transactions.
+ * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
+ i = from;
+ while (1) {
+ int j, pos;
+ unsigned int extent_nr;
+ unsigned int trn;
+
+ rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+ ERR_IF(rv == 0) goto cancel;
+ if (rv == -1) {
+ mutex_unlock(&mdev->md_io_mutex);
+ return 0;
+ }
+
+ trn = be32_to_cpu(buffer->tr_number);
+
+ spin_lock_irq(&mdev->al_lock);
+
+ /* This loop runs backwards because in the cyclic
+ elements there might be an old version of the
+ updated element (in slot 0). So the element in slot 0
+ can overwrite old versions. */
+ for (j = AL_EXTENTS_PT; j >= 0; j--) {
+ pos = be32_to_cpu(buffer->updates[j].pos);
+ extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+ if (extent_nr == LC_FREE)
+ continue;
+
+ lc_set(mdev->act_log, extent_nr, pos);
+ active_extents++;
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
+ transactions++;
+
+cancel:
+ if (i == to)
+ break;
+ i++;
+ if (i > mx)
+ i = 0;
+ }
+
+ mdev->al_tr_number = to_tnr+1;
+ mdev->al_tr_pos = to;
+ if (++mdev->al_tr_pos >
+ div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+ mdev->al_tr_pos = 0;
+
+ /* ok, we are done with it */
+ mutex_unlock(&mdev->md_io_mutex);
+
+ dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
+ transactions, active_extents);
+
+ return 1;
+}
+
+static void atodb_endio(struct bio *bio, int error)
+{
+ struct drbd_atodb_wait *wc = bio->bi_private;
+ struct drbd_conf *mdev = wc->mdev;
+ struct page *page;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ if (!error && !uptodate)
+ error = -EIO;
+
+ drbd_chk_io_error(mdev, error, TRUE);
+ if (error && wc->error == 0)
+ wc->error = error;
+
+ if (atomic_dec_and_test(&wc->count))
+ complete(&wc->io_done);
+
+ page = bio->bi_io_vec[0].bv_page;
+ put_page(page);
+ bio_put(bio);
+ mdev->bm_writ_cnt++;
+ put_ldev(mdev);
+}
+
+#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* activity log to on disk bitmap -- prepare bio unless that sector
+ * is already covered by previously prepared bios */
+static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
+ struct bio **bios,
+ unsigned int enr,
+ struct drbd_atodb_wait *wc) __must_hold(local)
+{
+ struct bio *bio;
+ struct page *page;
+ sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+ + mdev->ldev->md.bm_offset;
+ unsigned int page_offset = PAGE_SIZE;
+ int offset;
+ int i = 0;
+ int err = -ENOMEM;
+
+ /* Check if that enr is already covered by an already created bio.
+ * Caution, bios[] is not NULL terminated,
+ * but only initialized to all NULL.
+ * For completely scattered activity log,
+ * the last invocation iterates over all bios,
+ * and finds the last NULL entry.
+ */
+ while ((bio = bios[i])) {
+ if (bio->bi_sector == on_disk_sector)
+ return 0;
+ i++;
+ }
+ /* bios[i] == NULL, the next not yet used slot */
+
+ /* GFP_KERNEL, we are not in the write-out path */
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ if (i > 0) {
+ const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
+ page_offset = prev_bv->bv_offset + prev_bv->bv_len;
+ page = prev_bv->bv_page;
+ }
+ if (page_offset == PAGE_SIZE) {
+ page = alloc_page(__GFP_HIGHMEM);
+ if (page == NULL)
+ goto out_bio_put;
+ page_offset = 0;
+ } else {
+ get_page(page);
+ }
+
+ offset = S2W(enr);
+ drbd_bm_get_lel(mdev, offset,
+ min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
+ kmap(page) + page_offset);
+ kunmap(page);
+
+ bio->bi_private = wc;
+ bio->bi_end_io = atodb_endio;
+ bio->bi_bdev = mdev->ldev->md_bdev;
+ bio->bi_sector = on_disk_sector;
+
+ if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
+ goto out_put_page;
+
+ atomic_inc(&wc->count);
+ /* we already know that we may do this...
+ * get_ldev_if_state(mdev,D_ATTACHING);
+ * just get the extra reference, so that the local_cnt reflects
+ * the number of pending IO requests DRBD at its backing device.
+ */
+ atomic_inc(&mdev->local_cnt);
+
+ bios[i] = bio;
+
+ return 0;
+
+out_put_page:
+ err = -EINVAL;
+ put_page(page);
+out_bio_put:
+ bio_put(bio);
+ return err;
+}
+
+/**
+ * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
+ * @mdev: DRBD device.
+ *
+ * Called when we detach (unconfigure) local storage,
+ * or when we go from R_PRIMARY to R_SECONDARY role.
+ */
+void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
+{
+ int i, nr_elements;
+ unsigned int enr;
+ struct bio **bios;
+ struct drbd_atodb_wait wc;
+
+ ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
+ return; /* sorry, I don't have any act_log etc... */
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ nr_elements = mdev->act_log->nr_elements;
+
+ /* GFP_KERNEL, we are not in anyone's write-out path */
+ bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
+ if (!bios)
+ goto submit_one_by_one;
+
+ atomic_set(&wc.count, 0);
+ init_completion(&wc.io_done);
+ wc.mdev = mdev;
+ wc.error = 0;
+
+ for (i = 0; i < nr_elements; i++) {
+ enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+ if (enr == LC_FREE)
+ continue;
+ /* next statement also does atomic_inc wc.count and local_cnt */
+ if (atodb_prepare_unless_covered(mdev, bios,
+ enr/AL_EXT_PER_BM_SECT,
+ &wc))
+ goto free_bios_submit_one_by_one;
+ }
+
+ /* unnecessary optimization? */
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ /* all prepared, submit them */
+ for (i = 0; i < nr_elements; i++) {
+ if (bios[i] == NULL)
+ break;
+ if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
+ bios[i]->bi_rw = WRITE;
+ bio_endio(bios[i], -EIO);
+ } else {
+ submit_bio(WRITE, bios[i]);
+ }
+ }
+
+ drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+
+ /* always (try to) flush bitmap to stable storage */
+ drbd_md_flush(mdev);
+
+ /* In case we did not submit a single IO do not wait for
+ * them to complete. ( Because we would wait forever here. )
+ *
+ * In case we had IOs and they are already complete, there
+ * is not point in waiting anyways.
+ * Therefore this if () ... */
+ if (atomic_read(&wc.count))
+ wait_for_completion(&wc.io_done);
+
+ put_ldev(mdev);
+
+ kfree(bios);
+ return;
+
+ free_bios_submit_one_by_one:
+ /* free everything by calling the endio callback directly. */
+ for (i = 0; i < nr_elements && bios[i]; i++)
+ bio_endio(bios[i], 0);
+
+ kfree(bios);
+
+ submit_one_by_one:
+ dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
+
+ for (i = 0; i < mdev->act_log->nr_elements; i++) {
+ enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+ if (enr == LC_FREE)
+ continue;
+ /* Really slow: if we have al-extents 16..19 active,
+ * sector 4 will be written four times! Synchronous! */
+ drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
+ }
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+ put_ldev(mdev);
+}
+
+/**
+ * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
+ * @mdev: DRBD device.
+ */
+void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+{
+ unsigned int enr;
+ unsigned long add = 0;
+ char ppb[10];
+ int i;
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ for (i = 0; i < mdev->act_log->nr_elements; i++) {
+ enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+ if (enr == LC_FREE)
+ continue;
+ add += drbd_bm_ALe_set_all(mdev, enr);
+ }
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
+ ppsize(ppb, Bit2KB(add)));
+}
+
+static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+{
+ int rv;
+
+ spin_lock_irq(&mdev->al_lock);
+ rv = (al_ext->refcnt == 0);
+ if (likely(rv))
+ lc_del(mdev->act_log, al_ext);
+ spin_unlock_irq(&mdev->al_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @mdev: DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_conf *mdev)
+{
+ struct lc_element *al_ext;
+ int i;
+
+ D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+
+ for (i = 0; i < mdev->act_log->nr_elements; i++) {
+ al_ext = lc_element_by_index(mdev->act_log, i);
+ if (al_ext->lc_number == LC_FREE)
+ continue;
+ wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+ }
+
+ wake_up(&mdev->al_wait);
+}
+
+static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+
+ if (!get_ldev(mdev)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+ kfree(udw);
+ return 1;
+ }
+
+ drbd_bm_write_sect(mdev, udw->enr);
+ put_ldev(mdev);
+
+ kfree(udw);
+
+ if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
+ switch (mdev->state.conn) {
+ case C_SYNC_SOURCE: case C_SYNC_TARGET:
+ case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+ drbd_resync_finished(mdev);
+ default:
+ /* nothing to do */
+ break;
+ }
+ }
+ drbd_bcast_sync_progress(mdev);
+
+ return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+ int count, int success)
+{
+ struct lc_element *e;
+ struct update_odbm_work *udw;
+
+ unsigned int enr;
+
+ D_ASSERT(atomic_read(&mdev->local_cnt));
+
+ /* I simply assume that a sector/size pair never crosses
+ * a 16 MB extent border. (Currently this is true...) */
+ enr = BM_SECT_TO_EXT(sector);
+
+ e = lc_get(mdev->resync, enr);
+ if (e) {
+ struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+ if (ext->lce.lc_number == enr) {
+ if (success)
+ ext->rs_left -= count;
+ else
+ ext->rs_failed += count;
+ if (ext->rs_left < ext->rs_failed) {
+ dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+ "rs_failed=%d count=%d\n",
+ (unsigned long long)sector,
+ ext->lce.lc_number, ext->rs_left,
+ ext->rs_failed, count);
+ dump_stack();
+
+ lc_put(mdev->resync, &ext->lce);
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return;
+ }
+ } else {
+ /* Normally this element should be in the cache,
+ * since drbd_rs_begin_io() pulled it already in.
+ *
+ * But maybe an application write finished, and we set
+ * something outside the resync lru_cache in sync.
+ */
+ int rs_left = drbd_bm_e_weight(mdev, enr);
+ if (ext->flags != 0) {
+ dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
+ " -> %d[%u;00]\n",
+ ext->lce.lc_number, ext->rs_left,
+ ext->flags, enr, rs_left);
+ ext->flags = 0;
+ }
+ if (ext->rs_failed) {
+ dev_warn(DEV, "Kicking resync_lru element enr=%u "
+ "out with rs_failed=%d\n",
+ ext->lce.lc_number, ext->rs_failed);
+ set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+ }
+ ext->rs_left = rs_left;
+ ext->rs_failed = success ? 0 : count;
+ lc_changed(mdev->resync, &ext->lce);
+ }
+ lc_put(mdev->resync, &ext->lce);
+ /* no race, we are within the al_lock! */
+
+ if (ext->rs_left == ext->rs_failed) {
+ ext->rs_failed = 0;
+
+ udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
+ if (udw) {
+ udw->enr = ext->lce.lc_number;
+ udw->w.cb = w_update_odbm;
+ drbd_queue_work_front(&mdev->data.work, &udw->w);
+ } else {
+ dev_warn(DEV, "Could not kmalloc an udw\n");
+ set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+ }
+ }
+ } else {
+ dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
+ mdev->resync_locked,
+ mdev->resync->nr_elements,
+ mdev->resync->flags);
+ }
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector. Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
+ const char *file, const unsigned int line)
+{
+ /* Is called from worker and receiver context _only_ */
+ unsigned long sbnr, ebnr, lbnr;
+ unsigned long count = 0;
+ sector_t esector, nr_sectors;
+ int wake_up = 0;
+ unsigned long flags;
+
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+ (unsigned long long)sector, size);
+ return;
+ }
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size >> 9) - 1;
+
+ ERR_IF(sector >= nr_sectors) return;
+ ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ /* we clear it (in sync).
+ * round up start sector, round down end sector. we make sure we only
+ * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+ if (unlikely(esector < BM_SECT_PER_BIT-1))
+ return;
+ if (unlikely(esector == (nr_sectors-1)))
+ ebnr = lbnr;
+ else
+ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+ sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+ if (sbnr > ebnr)
+ return;
+
+ /*
+ * ok, (capacity & 7) != 0 sometimes, but who cares...
+ * we count rs_{total,left} in bits, not sectors.
+ */
+ spin_lock_irqsave(&mdev->al_lock, flags);
+ count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
+ if (count) {
+ /* we need the lock for drbd_try_clear_on_disk_bm */
+ if (jiffies - mdev->rs_mark_time > HZ*10) {
+ /* should be rolling marks,
+ * but we estimate only anyways. */
+ if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+ mdev->state.conn != C_PAUSED_SYNC_T &&
+ mdev->state.conn != C_PAUSED_SYNC_S) {
+ mdev->rs_mark_time = jiffies;
+ mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+ }
+ }
+ if (get_ldev(mdev)) {
+ drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+ put_ldev(mdev);
+ }
+ /* just wake_up unconditional now, various lc_chaged(),
+ * lc_put() in drbd_try_clear_on_disk_bm(). */
+ wake_up = 1;
+ }
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+ if (wake_up)
+ wake_up(&mdev->al_wait);
+}
+
+/*
+ * this is intended to set one request worth of data out of sync.
+ * affects at least 1 bit,
+ * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+ const char *file, const unsigned int line)
+{
+ unsigned long sbnr, ebnr, lbnr, flags;
+ sector_t esector, nr_sectors;
+ unsigned int enr, count;
+ struct lc_element *e;
+
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ dev_err(DEV, "sector: %llus, size: %d\n",
+ (unsigned long long)sector, size);
+ return;
+ }
+
+ if (!get_ldev(mdev))
+ return; /* no disk, no metadata, no bitmap to set bits in */
+
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size >> 9) - 1;
+
+ ERR_IF(sector >= nr_sectors)
+ goto out;
+ ERR_IF(esector >= nr_sectors)
+ esector = (nr_sectors-1);
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ /* we set it out of sync,
+ * we do not need to round anything here */
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+
+ /* ok, (capacity & 7) != 0 sometimes, but who cares...
+ * we count rs_{total,left} in bits, not sectors. */
+ spin_lock_irqsave(&mdev->al_lock, flags);
+ count = drbd_bm_set_bits(mdev, sbnr, ebnr);
+
+ enr = BM_SECT_TO_EXT(sector);
+ e = lc_find(mdev->resync, enr);
+ if (e)
+ lc_entry(e, struct bm_extent, lce)->rs_left += count;
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+
+out:
+ put_ldev(mdev);
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
+{
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int wakeup = 0;
+ unsigned long rs_flags;
+
+ spin_lock_irq(&mdev->al_lock);
+ if (mdev->resync_locked > mdev->resync->nr_elements/2) {
+ spin_unlock_irq(&mdev->al_lock);
+ return NULL;
+ }
+ e = lc_get(mdev->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+ bm_ext->rs_failed = 0;
+ lc_changed(mdev->resync, &bm_ext->lce);
+ wakeup = 1;
+ }
+ if (bm_ext->lce.refcnt == 1)
+ mdev->resync_locked++;
+ set_bit(BME_NO_WRITES, &bm_ext->flags);
+ }
+ rs_flags = mdev->resync->flags;
+ spin_unlock_irq(&mdev->al_lock);
+ if (wakeup)
+ wake_up(&mdev->al_wait);
+
+ if (!bm_ext) {
+ if (rs_flags & LC_STARVING)
+ dev_warn(DEV, "Have to wait for element"
+ " (resync LRU too small?)\n");
+ BUG_ON(rs_flags & LC_DIRTY);
+ }
+
+ return bm_ext;
+}
+
+static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
+{
+ struct lc_element *al_ext;
+ int rv = 0;
+
+ spin_lock_irq(&mdev->al_lock);
+ if (unlikely(enr == mdev->act_log->new_number))
+ rv = 1;
+ else {
+ al_ext = lc_find(mdev->act_log, enr);
+ if (al_ext) {
+ if (al_ext->refcnt)
+ rv = 1;
+ }
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
+ /*
+ if (unlikely(rv)) {
+ dev_info(DEV, "Delaying sync read until app's write is done\n");
+ }
+ */
+ return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @mdev: DRBD device.
+ * @sector: The sector number.
+ *
+ * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct bm_extent *bm_ext;
+ int i, sig;
+
+ sig = wait_event_interruptible(mdev->al_wait,
+ (bm_ext = _bme_get(mdev, enr)));
+ if (sig)
+ return 0;
+
+ if (test_bit(BME_LOCKED, &bm_ext->flags))
+ return 1;
+
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+ sig = wait_event_interruptible(mdev->al_wait,
+ !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
+ if (sig) {
+ spin_lock_irq(&mdev->al_lock);
+ if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ mdev->resync_locked--;
+ wake_up(&mdev->al_wait);
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ return 0;
+ }
+ }
+
+ set_bit(BME_LOCKED, &bm_ext->flags);
+
+ return 1;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @mdev: DRBD device.
+ * @sector: The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int i;
+
+ spin_lock_irq(&mdev->al_lock);
+ if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+ /* in case you have very heavy scattered io, it may
+ * stall the syncer undefined if we give up the ref count
+ * when we try again and requeue.
+ *
+ * if we don't give up the refcount, but the next time
+ * we are scheduled this extent has been "synced" by new
+ * application writes, we'd miss the lc_put on the
+ * extent we keep the refcount on.
+ * so we remembered which extent we had to try again, and
+ * if the next requested one is something else, we do
+ * the lc_put here...
+ * we also have to wake_up
+ */
+ e = lc_find(mdev->resync, mdev->resync_wenr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ mdev->resync_wenr = LC_FREE;
+ if (lc_put(mdev->resync, &bm_ext->lce) == 0)
+ mdev->resync_locked--;
+ wake_up(&mdev->al_wait);
+ } else {
+ dev_alert(DEV, "LOGIC BUG\n");
+ }
+ }
+ /* TRY. */
+ e = lc_try_get(mdev->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ if (test_bit(BME_LOCKED, &bm_ext->flags))
+ goto proceed;
+ if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ mdev->resync_locked++;
+ } else {
+ /* we did set the BME_NO_WRITES,
+ * but then could not set BME_LOCKED,
+ * so we tried again.
+ * drop the extra reference. */
+ bm_ext->lce.refcnt--;
+ D_ASSERT(bm_ext->lce.refcnt > 0);
+ }
+ goto check_al;
+ } else {
+ /* do we rather want to try later? */
+ if (mdev->resync_locked > mdev->resync->nr_elements-3)
+ goto try_again;
+ /* Do or do not. There is no try. -- Yoda */
+ e = lc_get(mdev->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (!bm_ext) {
+ const unsigned long rs_flags = mdev->resync->flags;
+ if (rs_flags & LC_STARVING)
+ dev_warn(DEV, "Have to wait for element"
+ " (resync LRU too small?)\n");
+ BUG_ON(rs_flags & LC_DIRTY);
+ goto try_again;
+ }
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+ bm_ext->rs_failed = 0;
+ lc_changed(mdev->resync, &bm_ext->lce);
+ wake_up(&mdev->al_wait);
+ D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+ }
+ set_bit(BME_NO_WRITES, &bm_ext->flags);
+ D_ASSERT(bm_ext->lce.refcnt == 1);
+ mdev->resync_locked++;
+ goto check_al;
+ }
+check_al:
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+ if (unlikely(al_enr+i == mdev->act_log->new_number))
+ goto try_again;
+ if (lc_is_used(mdev->act_log, al_enr+i))
+ goto try_again;
+ }
+ set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+ mdev->resync_wenr = LC_FREE;
+ spin_unlock_irq(&mdev->al_lock);
+ return 0;
+
+try_again:
+ if (bm_ext)
+ mdev->resync_wenr = enr;
+ spin_unlock_irq(&mdev->al_lock);
+ return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mdev->al_lock, flags);
+ e = lc_find(mdev->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (!bm_ext) {
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
+ return;
+ }
+
+ if (bm_ext->lce.refcnt == 0) {
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+ dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
+ "but refcnt is 0!?\n",
+ (unsigned long long)sector, enr);
+ return;
+ }
+
+ if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+ clear_bit(BME_LOCKED, &bm_ext->flags);
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ mdev->resync_locked--;
+ wake_up(&mdev->al_wait);
+ }
+
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @mdev: DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_conf *mdev)
+{
+ spin_lock_irq(&mdev->al_lock);
+
+ if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
+ lc_reset(mdev->resync);
+ put_ldev(mdev);
+ }
+ mdev->resync_locked = 0;
+ mdev->resync_wenr = LC_FREE;
+ spin_unlock_irq(&mdev->al_lock);
+ wake_up(&mdev->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @mdev: DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_conf *mdev)
+{
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int i;
+
+ spin_lock_irq(&mdev->al_lock);
+
+ if (get_ldev_if_state(mdev, D_FAILED)) {
+ /* ok, ->resync is there. */
+ for (i = 0; i < mdev->resync->nr_elements; i++) {
+ e = lc_element_by_index(mdev->resync, i);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext->lce.lc_number == LC_FREE)
+ continue;
+ if (bm_ext->lce.lc_number == mdev->resync_wenr) {
+ dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
+ " got 'synced' by application io\n",
+ mdev->resync_wenr);
+ D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ mdev->resync_wenr = LC_FREE;
+ lc_put(mdev->resync, &bm_ext->lce);
+ }
+ if (bm_ext->lce.refcnt != 0) {
+ dev_info(DEV, "Retrying drbd_rs_del_all() later. "
+ "refcnt=%d\n", bm_ext->lce.refcnt);
+ put_ldev(mdev);
+ spin_unlock_irq(&mdev->al_lock);
+ return -EAGAIN;
+ }
+ D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
+ lc_del(mdev->resync, &bm_ext->lce);
+ }
+ D_ASSERT(mdev->resync->used == 0);
+ put_ldev(mdev);
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
+ return 0;
+}
+
+/**
+ * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
+ * @mdev: DRBD device.
+ * @sector: The sector number.
+ * @size: Size of failed IO operation, in byte.
+ */
+void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
+{
+ /* Is called from worker and receiver context _only_ */
+ unsigned long sbnr, ebnr, lbnr;
+ unsigned long count;
+ sector_t esector, nr_sectors;
+ int wake_up = 0;
+
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+ (unsigned long long)sector, size);
+ return;
+ }
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size >> 9) - 1;
+
+ ERR_IF(sector >= nr_sectors) return;
+ ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ /*
+ * round up start sector, round down end sector. we make sure we only
+ * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+ if (unlikely(esector < BM_SECT_PER_BIT-1))
+ return;
+ if (unlikely(esector == (nr_sectors-1)))
+ ebnr = lbnr;
+ else
+ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+ sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+ if (sbnr > ebnr)
+ return;
+
+ /*
+ * ok, (capacity & 7) != 0 sometimes, but who cares...
+ * we count rs_{total,left} in bits, not sectors.
+ */
+ spin_lock_irq(&mdev->al_lock);
+ count = drbd_bm_count_bits(mdev, sbnr, ebnr);
+ if (count) {
+ mdev->rs_failed += count;
+
+ if (get_ldev(mdev)) {
+ drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
+ put_ldev(mdev);
+ }
+
+ /* just wake_up unconditional now, various lc_chaged(),
+ * lc_put() in drbd_try_clear_on_disk_bm(). */
+ wake_up = 1;
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ if (wake_up)
+ wake_up(&mdev->al_wait);
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000000..b61057e77882
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1327 @@
+/*
+ drbd_bitmap.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <asm/kmap_types.h>
+#include "drbd_int.h"
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name bm_... => internal to implementation, "private".
+
+ * Note that since find_first_bit returns int, at the current granularity of
+ * the bitmap (4KB per byte), this implementation "only" supports up to
+ * 1<<(32+12) == 16 TB...
+ */
+
+/*
+ * NOTE
+ * Access to the *bm_pages is protected by bm_lock.
+ * It is safe to read the other members within the lock.
+ *
+ * drbd_bm_set_bits is called from bio_endio callbacks,
+ * We may be called with irq already disabled,
+ * so we need spin_lock_irqsave().
+ * And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+ struct page **bm_pages;
+ spinlock_t bm_lock;
+ /* WARNING unsigned long bm_*:
+ * 32bit number of bit offset is just enough for 512 MB bitmap.
+ * it will blow up if we make the bitmap bigger...
+ * not that it makes much sense to have a bitmap that large,
+ * rather change the granularity to 16k or 64k or something.
+ * (that implies other problems, however...)
+ */
+ unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
+ unsigned long bm_bits;
+ size_t bm_words;
+ size_t bm_number_of_pages;
+ sector_t bm_dev_capacity;
+ struct semaphore bm_change; /* serializes resize operations */
+
+ atomic_t bm_async_io;
+ wait_queue_head_t bm_io_wait;
+
+ unsigned long bm_flags;
+
+ /* debugging aid, in case we are still racy somewhere */
+ char *bm_why;
+ struct task_struct *bm_task;
+};
+
+/* definition of bits in bm_flags */
+#define BM_LOCKED 0
+#define BM_MD_IO_ERROR 1
+#define BM_P_VMALLOCED 2
+
+static int bm_is_locked(struct drbd_bitmap *b)
+{
+ return test_bit(BM_LOCKED, &b->bm_flags);
+}
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ if (!__ratelimit(&drbd_ratelimit_state))
+ return;
+ dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
+ current == mdev->receiver.task ? "receiver" :
+ current == mdev->asender.task ? "asender" :
+ current == mdev->worker.task ? "worker" : current->comm,
+ func, b->bm_why ?: "?",
+ b->bm_task == mdev->receiver.task ? "receiver" :
+ b->bm_task == mdev->asender.task ? "asender" :
+ b->bm_task == mdev->worker.task ? "worker" : "?");
+}
+
+void drbd_bm_lock(struct drbd_conf *mdev, char *why)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ int trylock_failed;
+
+ if (!b) {
+ dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
+ return;
+ }
+
+ trylock_failed = down_trylock(&b->bm_change);
+
+ if (trylock_failed) {
+ dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
+ current == mdev->receiver.task ? "receiver" :
+ current == mdev->asender.task ? "asender" :
+ current == mdev->worker.task ? "worker" : current->comm,
+ why, b->bm_why ?: "?",
+ b->bm_task == mdev->receiver.task ? "receiver" :
+ b->bm_task == mdev->asender.task ? "asender" :
+ b->bm_task == mdev->worker.task ? "worker" : "?");
+ down(&b->bm_change);
+ }
+ if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
+ dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
+
+ b->bm_why = why;
+ b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ if (!b) {
+ dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
+ return;
+ }
+
+ if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
+ dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
+
+ b->bm_why = NULL;
+ b->bm_task = NULL;
+ up(&b->bm_change);
+}
+
+/* word offset to long pointer */
+static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+{
+ struct page *page;
+ unsigned long page_nr;
+
+ /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+ page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+ BUG_ON(page_nr >= b->bm_number_of_pages);
+ page = b->bm_pages[page_nr];
+
+ return (unsigned long *) kmap_atomic(page, km);
+}
+
+static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
+{
+ return __bm_map_paddr(b, offset, KM_IRQ1);
+}
+
+static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+{
+ kunmap_atomic(p_addr, km);
+};
+
+static void bm_unmap(unsigned long *p_addr)
+{
+ return __bm_unmap(p_addr, KM_IRQ1);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimze the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_conf*, but for the debug macros I like to have the mdev around
+ * to be able to report device specific.
+ */
+
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+ unsigned long i;
+ if (!pages)
+ return;
+
+ for (i = 0; i < number; i++) {
+ if (!pages[i]) {
+ printk(KERN_ALERT "drbd: bm_free_pages tried to free "
+ "a NULL pointer; i=%lu n=%lu\n",
+ i, number);
+ continue;
+ }
+ __free_page(pages[i]);
+ pages[i] = NULL;
+ }
+}
+
+static void bm_vk_free(void *ptr, int v)
+{
+ if (v)
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+ struct page **old_pages = b->bm_pages;
+ struct page **new_pages, *page;
+ unsigned int i, bytes, vmalloced = 0;
+ unsigned long have = b->bm_number_of_pages;
+
+ BUG_ON(have == 0 && old_pages != NULL);
+ BUG_ON(have != 0 && old_pages == NULL);
+
+ if (have == want)
+ return old_pages;
+
+ /* Trying kmalloc first, falling back to vmalloc.
+ * GFP_KERNEL is ok, as this is done when a lower level disk is
+ * "attached" to the drbd. Context is receiver thread or cqueue
+ * thread. As we have no disk yet, we are not in the IO path,
+ * not even the IO path of the peer. */
+ bytes = sizeof(struct page *)*want;
+ new_pages = kmalloc(bytes, GFP_KERNEL);
+ if (!new_pages) {
+ new_pages = vmalloc(bytes);
+ if (!new_pages)
+ return NULL;
+ vmalloced = 1;
+ }
+
+ memset(new_pages, 0, bytes);
+ if (want >= have) {
+ for (i = 0; i < have; i++)
+ new_pages[i] = old_pages[i];
+ for (; i < want; i++) {
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page) {
+ bm_free_pages(new_pages + have, i - have);
+ bm_vk_free(new_pages, vmalloced);
+ return NULL;
+ }
+ new_pages[i] = page;
+ }
+ } else {
+ for (i = 0; i < want; i++)
+ new_pages[i] = old_pages[i];
+ /* NOT HERE, we are outside the spinlock!
+ bm_free_pages(old_pages + want, have - want);
+ */
+ }
+
+ if (vmalloced)
+ set_bit(BM_P_VMALLOCED, &b->bm_flags);
+ else
+ clear_bit(BM_P_VMALLOCED, &b->bm_flags);
+
+ return new_pages;
+}
+
+/*
+ * called on driver init only. TODO call when a device is created.
+ * allocates the drbd_bitmap, and stores it in mdev->bitmap.
+ */
+int drbd_bm_init(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ WARN_ON(b != NULL);
+ b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+ spin_lock_init(&b->bm_lock);
+ init_MUTEX(&b->bm_change);
+ init_waitqueue_head(&b->bm_io_wait);
+
+ mdev->bitmap = b;
+
+ return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_conf *mdev)
+{
+ ERR_IF(!mdev->bitmap) return 0;
+ return mdev->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_conf *mdev)
+{
+ ERR_IF (!mdev->bitmap) return;
+ bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
+ bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
+ kfree(mdev->bitmap);
+ mdev->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ */
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+ const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
+ size_t w = b->bm_bits >> LN2_BPL;
+ int cleared = 0;
+ unsigned long *p_addr, *bm;
+
+ p_addr = bm_map_paddr(b, w);
+ bm = p_addr + MLPP(w);
+ if (w < b->bm_words) {
+ cleared = hweight_long(*bm & ~mask);
+ *bm &= mask;
+ w++; bm++;
+ }
+
+ if (w < b->bm_words) {
+ cleared += hweight_long(*bm);
+ *bm = 0;
+ }
+ bm_unmap(p_addr);
+ return cleared;
+}
+
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+ const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
+ size_t w = b->bm_bits >> LN2_BPL;
+ unsigned long *p_addr, *bm;
+
+ p_addr = bm_map_paddr(b, w);
+ bm = p_addr + MLPP(w);
+ if (w < b->bm_words) {
+ *bm |= ~mask;
+ bm++; w++;
+ }
+
+ if (w < b->bm_words) {
+ *bm = ~(0UL);
+ }
+ bm_unmap(p_addr);
+}
+
+static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
+{
+ unsigned long *p_addr, *bm, offset = 0;
+ unsigned long bits = 0;
+ unsigned long i, do_now;
+
+ while (offset < b->bm_words) {
+ i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
+ p_addr = __bm_map_paddr(b, offset, KM_USER0);
+ bm = p_addr + MLPP(offset);
+ while (i--) {
+#ifndef __LITTLE_ENDIAN
+ if (swap_endian)
+ *bm = lel_to_cpu(*bm);
+#endif
+ bits += hweight_long(*bm++);
+ }
+ __bm_unmap(p_addr, KM_USER0);
+ offset += do_now;
+ cond_resched();
+ }
+
+ return bits;
+}
+
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+ return __bm_count_bits(b, 0);
+}
+
+static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
+{
+ return __bm_count_bits(b, 1);
+}
+
+/* offset and len in long words.*/
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+ unsigned long *p_addr, *bm;
+ size_t do_now, end;
+
+#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
+
+ end = offset + len;
+
+ if (end > b->bm_words) {
+ printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+ return;
+ }
+
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+ p_addr = bm_map_paddr(b, offset);
+ bm = p_addr + MLPP(offset);
+ if (bm+do_now > p_addr + LWPP) {
+ printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+ p_addr, bm, (int)do_now);
+ break; /* breaks to after catch_oob_access_end() only! */
+ }
+ memset(bm, c, do_now * sizeof(long));
+ bm_unmap(p_addr);
+ offset += do_now;
+ }
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long bits, words, owords, obits, *p_addr, *bm;
+ unsigned long want, have, onpages; /* number of pages */
+ struct page **npages, **opages = NULL;
+ int err = 0, growing;
+ int opages_vmalloced;
+
+ ERR_IF(!b) return -ENOMEM;
+
+ drbd_bm_lock(mdev, "resize");
+
+ dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
+ (unsigned long long)capacity);
+
+ if (capacity == b->bm_dev_capacity)
+ goto out;
+
+ opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
+
+ if (capacity == 0) {
+ spin_lock_irq(&b->bm_lock);
+ opages = b->bm_pages;
+ onpages = b->bm_number_of_pages;
+ owords = b->bm_words;
+ b->bm_pages = NULL;
+ b->bm_number_of_pages =
+ b->bm_set =
+ b->bm_bits =
+ b->bm_words =
+ b->bm_dev_capacity = 0;
+ spin_unlock_irq(&b->bm_lock);
+ bm_free_pages(opages, onpages);
+ bm_vk_free(opages, opages_vmalloced);
+ goto out;
+ }
+ bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+ /* if we would use
+ words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+ a 32bit host could present the wrong number of words
+ to a 64bit host.
+ */
+ words = ALIGN(bits, 64) >> LN2_BPL;
+
+ if (get_ldev(mdev)) {
+ D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
+ put_ldev(mdev);
+ }
+
+ /* one extra long to catch off by one errors */
+ want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+ have = b->bm_number_of_pages;
+ if (want == have) {
+ D_ASSERT(b->bm_pages != NULL);
+ npages = b->bm_pages;
+ } else {
+ if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
+ npages = NULL;
+ else
+ npages = bm_realloc_pages(b, want);
+ }
+
+ if (!npages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock_irq(&b->bm_lock);
+ opages = b->bm_pages;
+ owords = b->bm_words;
+ obits = b->bm_bits;
+
+ growing = bits > obits;
+ if (opages)
+ bm_set_surplus(b);
+
+ b->bm_pages = npages;
+ b->bm_number_of_pages = want;
+ b->bm_bits = bits;
+ b->bm_words = words;
+ b->bm_dev_capacity = capacity;
+
+ if (growing) {
+ bm_memset(b, owords, 0xff, words-owords);
+ b->bm_set += bits - obits;
+ }
+
+ if (want < have) {
+ /* implicit: (opages != NULL) && (opages != npages) */
+ bm_free_pages(opages + want, have - want);
+ }
+
+ p_addr = bm_map_paddr(b, words);
+ bm = p_addr + MLPP(words);
+ *bm = DRBD_MAGIC;
+ bm_unmap(p_addr);
+
+ (void)bm_clear_surplus(b);
+
+ spin_unlock_irq(&b->bm_lock);
+ if (opages != npages)
+ bm_vk_free(opages, opages_vmalloced);
+ if (!growing)
+ b->bm_set = bm_count_bits(b);
+ dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+
+ out:
+ drbd_bm_unlock(mdev);
+ return err;
+}
+
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long s;
+ unsigned long flags;
+
+ ERR_IF(!b) return 0;
+ ERR_IF(!b->bm_pages) return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ s = b->bm_set;
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+
+ return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+ unsigned long s;
+ /* if I don't have a disk, I don't know about out-of-sync status */
+ if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+ return 0;
+ s = _drbd_bm_total_weight(mdev);
+ put_ldev(mdev);
+ return s;
+}
+
+size_t drbd_bm_words(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ ERR_IF(!b) return 0;
+ ERR_IF(!b->bm_pages) return 0;
+
+ return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ ERR_IF(!b) return 0;
+
+ return b->bm_bits;
+}
+
+/* merge number words from buffer into the bitmap starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+ unsigned long *buffer)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long *p_addr, *bm;
+ unsigned long word, bits;
+ size_t end, do_now;
+
+ end = offset + number;
+
+ ERR_IF(!b) return;
+ ERR_IF(!b->bm_pages) return;
+ if (number == 0)
+ return;
+ WARN_ON(offset >= b->bm_words);
+ WARN_ON(end > b->bm_words);
+
+ spin_lock_irq(&b->bm_lock);
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+ p_addr = bm_map_paddr(b, offset);
+ bm = p_addr + MLPP(offset);
+ offset += do_now;
+ while (do_now--) {
+ bits = hweight_long(*bm);
+ word = *bm | lel_to_cpu(*buffer++);
+ *bm++ = word;
+ b->bm_set += hweight_long(word) - bits;
+ }
+ bm_unmap(p_addr);
+ }
+ /* with 32bit <-> 64bit cross-platform connect
+ * this is only correct for current usage,
+ * where we _know_ that we are 64 bit aligned,
+ * and know that this function is used in this way, too...
+ */
+ if (end == b->bm_words)
+ b->bm_set -= bm_clear_surplus(b);
+
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at offset into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+ unsigned long *buffer)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long *p_addr, *bm;
+ size_t end, do_now;
+
+ end = offset + number;
+
+ ERR_IF(!b) return;
+ ERR_IF(!b->bm_pages) return;
+
+ spin_lock_irq(&b->bm_lock);
+ if ((offset >= b->bm_words) ||
+ (end > b->bm_words) ||
+ (number <= 0))
+ dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
+ (unsigned long) offset,
+ (unsigned long) number,
+ (unsigned long) b->bm_words);
+ else {
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+ p_addr = bm_map_paddr(b, offset);
+ bm = p_addr + MLPP(offset);
+ offset += do_now;
+ while (do_now--)
+ *buffer++ = cpu_to_lel(*bm++);
+ bm_unmap(p_addr);
+ }
+ }
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ ERR_IF(!b) return;
+ ERR_IF(!b->bm_pages) return;
+
+ spin_lock_irq(&b->bm_lock);
+ bm_memset(b, 0, 0xff, b->bm_words);
+ (void)bm_clear_surplus(b);
+ b->bm_set = b->bm_bits;
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_conf *mdev)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ ERR_IF(!b) return;
+ ERR_IF(!b->bm_pages) return;
+
+ spin_lock_irq(&b->bm_lock);
+ bm_memset(b, 0, 0, b->bm_words);
+ b->bm_set = 0;
+ spin_unlock_irq(&b->bm_lock);
+}
+
+static void bm_async_io_complete(struct bio *bio, int error)
+{
+ struct drbd_bitmap *b = bio->bi_private;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?!
+ * do we want to WARN() on this? */
+ if (!error && !uptodate)
+ error = -EIO;
+
+ if (error) {
+ /* doh. what now?
+ * for now, set all bits, and flag MD_IO_ERROR */
+ __set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+ }
+ if (atomic_dec_and_test(&b->bm_async_io))
+ wake_up(&b->bm_io_wait);
+
+ bio_put(bio);
+}
+
+static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+{
+ /* we are process context. we always get a bio */
+ struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+ unsigned int len;
+ sector_t on_disk_sector =
+ mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
+ on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+
+ /* this might happen with very small
+ * flexible external meta data device */
+ len = min_t(unsigned int, PAGE_SIZE,
+ (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
+
+ bio->bi_bdev = mdev->ldev->md_bdev;
+ bio->bi_sector = on_disk_sector;
+ bio_add_page(bio, b->bm_pages[page_nr], len, 0);
+ bio->bi_private = b;
+ bio->bi_end_io = bm_async_io_complete;
+
+ if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+ bio->bi_rw |= rw;
+ bio_endio(bio, -EIO);
+ } else {
+ submit_bio(rw, bio);
+ }
+}
+
+# if defined(__LITTLE_ENDIAN)
+ /* nothing to do, on disk == in memory */
+# define bm_cpu_to_lel(x) ((void)0)
+# else
+void bm_cpu_to_lel(struct drbd_bitmap *b)
+{
+ /* need to cpu_to_lel all the pages ...
+ * this may be optimized by using
+ * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
+ * the following is still not optimal, but better than nothing */
+ unsigned int i;
+ unsigned long *p_addr, *bm;
+ if (b->bm_set == 0) {
+ /* no page at all; avoid swap if all is 0 */
+ i = b->bm_number_of_pages;
+ } else if (b->bm_set == b->bm_bits) {
+ /* only the last page */
+ i = b->bm_number_of_pages - 1;
+ } else {
+ /* all pages */
+ i = 0;
+ }
+ for (; i < b->bm_number_of_pages; i++) {
+ p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
+ for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
+ *bm = cpu_to_lel(*bm);
+ kunmap_atomic(p_addr, KM_USER0);
+ }
+}
+# endif
+/* lel_to_cpu == cpu_to_lel */
+# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
+
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ /* sector_t sector; */
+ int bm_words, num_pages, i;
+ unsigned long now;
+ char ppb[10];
+ int err = 0;
+
+ WARN_ON(!bm_is_locked(b));
+
+ /* no spinlock here, the drbd_bm_lock should be enough! */
+
+ bm_words = drbd_bm_words(mdev);
+ num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ /* on disk bitmap is little endian */
+ if (rw == WRITE)
+ bm_cpu_to_lel(b);
+
+ now = jiffies;
+ atomic_set(&b->bm_async_io, num_pages);
+ __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
+
+ /* let the layers below us try to merge these bios... */
+ for (i = 0; i < num_pages; i++)
+ bm_page_io_async(mdev, b, i, rw);
+
+ drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+ wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+
+ if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+ dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
+ drbd_chk_io_error(mdev, 1, TRUE);
+ err = -EIO;
+ }
+
+ now = jiffies;
+ if (rw == WRITE) {
+ /* swap back endianness */
+ bm_lel_to_cpu(b);
+ /* flush bitmap to stable storage */
+ drbd_md_flush(mdev);
+ } else /* rw == READ */ {
+ /* just read, if necessary adjust endianness */
+ b->bm_set = bm_count_bits_swap_endian(b);
+ dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
+ jiffies - now);
+ }
+ now = b->bm_set;
+
+ dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+ ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+ return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @mdev: DRBD device.
+ */
+int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
+{
+ return bm_rw(mdev, READ);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @mdev: DRBD device.
+ */
+int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
+{
+ return bm_rw(mdev, WRITE);
+}
+
+/**
+ * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * @mdev: DRBD device.
+ * @enr: Extent number in the resync lru (happens to be sector offset)
+ *
+ * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
+ * by a single sector write. Therefore enr == sector offset from the
+ * start of the bitmap.
+ */
+int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+{
+ sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+ + mdev->ldev->md.bm_offset;
+ int bm_words, num_words, offset;
+ int err = 0;
+
+ mutex_lock(&mdev->md_io_mutex);
+ bm_words = drbd_bm_words(mdev);
+ offset = S2W(enr); /* word offset into bitmap */
+ num_words = min(S2W(1), bm_words - offset);
+ if (num_words < S2W(1))
+ memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
+ drbd_bm_get_lel(mdev, offset, num_words,
+ page_address(mdev->md_io_page));
+ if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
+ int i;
+ err = -EIO;
+ dev_err(DEV, "IO ERROR writing bitmap sector %lu "
+ "(meta-disk sector %llus)\n",
+ enr, (unsigned long long)on_disk_sector);
+ drbd_chk_io_error(mdev, 1, TRUE);
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
+ drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+ }
+ mdev->bm_writ_cnt++;
+ mutex_unlock(&mdev->md_io_mutex);
+ return err;
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * should not make much difference anyways, but ...
+ *
+ * this returns a bit number, NOT a sector!
+ */
+#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
+static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
+ const int find_zero_bit, const enum km_type km)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long i = -1UL;
+ unsigned long *p_addr;
+ unsigned long bit_offset; /* bit offset of the mapped page. */
+
+ if (bm_fo > b->bm_bits) {
+ dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+ } else {
+ while (bm_fo < b->bm_bits) {
+ unsigned long offset;
+ bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
+ offset = bit_offset >> LN2_BPL; /* word offset of the page */
+ p_addr = __bm_map_paddr(b, offset, km);
+
+ if (find_zero_bit)
+ i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+ else
+ i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+
+ __bm_unmap(p_addr, km);
+ if (i < PAGE_SIZE*8) {
+ i = bit_offset + i;
+ if (i >= b->bm_bits)
+ break;
+ goto found;
+ }
+ bm_fo = bit_offset + PAGE_SIZE*8;
+ }
+ i = -1UL;
+ }
+ found:
+ return i;
+}
+
+static unsigned long bm_find_next(struct drbd_conf *mdev,
+ unsigned long bm_fo, const int find_zero_bit)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long i = -1UL;
+
+ ERR_IF(!b) return i;
+ ERR_IF(!b->bm_pages) return i;
+
+ spin_lock_irq(&b->bm_lock);
+ if (bm_is_locked(b))
+ bm_print_lock_info(mdev);
+
+ i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+
+ spin_unlock_irq(&b->bm_lock);
+ return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+ return bm_find_next(mdev, bm_fo, 0);
+}
+
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+ return bm_find_next(mdev, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+ /* WARN_ON(!bm_is_locked(mdev)); */
+ return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+ /* WARN_ON(!bm_is_locked(mdev)); */
+ return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+ unsigned long e, int val, const enum km_type km)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long *p_addr = NULL;
+ unsigned long bitnr;
+ unsigned long last_page_nr = -1UL;
+ int c = 0;
+
+ if (e >= b->bm_bits) {
+ dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+ s, e, b->bm_bits);
+ e = b->bm_bits ? b->bm_bits -1 : 0;
+ }
+ for (bitnr = s; bitnr <= e; bitnr++) {
+ unsigned long offset = bitnr>>LN2_BPL;
+ unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+ if (page_nr != last_page_nr) {
+ if (p_addr)
+ __bm_unmap(p_addr, km);
+ p_addr = __bm_map_paddr(b, offset, km);
+ last_page_nr = page_nr;
+ }
+ if (val)
+ c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
+ else
+ c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
+ }
+ if (p_addr)
+ __bm_unmap(p_addr, km);
+ b->bm_set += c;
+ return c;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+ const unsigned long e, int val)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = mdev->bitmap;
+ int c = 0;
+
+ ERR_IF(!b) return 1;
+ ERR_IF(!b->bm_pages) return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (bm_is_locked(b))
+ bm_print_lock_info(mdev);
+
+ c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
+
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+ return bm_change_bits_to(mdev, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+ return -bm_change_bits_to(mdev, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+ int page_nr, int first_word, int last_word)
+{
+ int i;
+ int bits;
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
+ for (i = first_word; i < last_word; i++) {
+ bits = hweight_long(paddr[i]);
+ paddr[i] = ~0UL;
+ b->bm_set += BITS_PER_LONG - bits;
+ }
+ kunmap_atomic(paddr, KM_USER0);
+}
+
+/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+ /* First set_bit from the first bit (s)
+ * up to the next long boundary (sl),
+ * then assign full words up to the last long boundary (el),
+ * then set_bit up to and including the last bit (e).
+ *
+ * Do not use memset, because we must account for changes,
+ * so we need to loop over the words with hweight() anyways.
+ */
+ unsigned long sl = ALIGN(s,BITS_PER_LONG);
+ unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+ int first_page;
+ int last_page;
+ int page_nr;
+ int first_word;
+ int last_word;
+
+ if (e - s <= 3*BITS_PER_LONG) {
+ /* don't bother; el and sl may even be wrong. */
+ __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
+ return;
+ }
+
+ /* difference is large enough that we can trust sl and el */
+
+ /* bits filling the current long */
+ if (sl)
+ __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
+
+ first_page = sl >> (3 + PAGE_SHIFT);
+ last_page = el >> (3 + PAGE_SHIFT);
+
+ /* MLPP: modulo longs per page */
+ /* LWPP: long words per page */
+ first_word = MLPP(sl >> LN2_BPL);
+ last_word = LWPP;
+
+ /* first and full pages, unless first page == last page */
+ for (page_nr = first_page; page_nr < last_page; page_nr++) {
+ bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
+ cond_resched();
+ first_word = 0;
+ }
+
+ /* last page (respectively only page, for first page == last page) */
+ last_word = MLPP(el >> LN2_BPL);
+ bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+ /* possibly trailing bits.
+ * example: (e & 63) == 63, el will be e+1.
+ * if that even was the very last bit,
+ * it would trigger an assert in __bm_change_bits_to()
+ */
+ if (el <= e)
+ __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ * 1 ... bit set
+ * 0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long *p_addr;
+ int i;
+
+ ERR_IF(!b) return 0;
+ ERR_IF(!b->bm_pages) return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (bm_is_locked(b))
+ bm_print_lock_info(mdev);
+ if (bitnr < b->bm_bits) {
+ unsigned long offset = bitnr>>LN2_BPL;
+ p_addr = bm_map_paddr(b, offset);
+ i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
+ bm_unmap(p_addr);
+ } else if (bitnr == b->bm_bits) {
+ i = -1;
+ } else { /* (bitnr > b->bm_bits) */
+ dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+ i = 0;
+ }
+
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long *p_addr = NULL, page_nr = -1;
+ unsigned long bitnr;
+ int c = 0;
+ size_t w;
+
+ /* If this is called without a bitmap, that is a bug. But just to be
+ * robust in case we screwed up elsewhere, in that case pretend there
+ * was one dirty bit in the requested area, so we won't try to do a
+ * local read there (no bitmap probably implies no disk) */
+ ERR_IF(!b) return 1;
+ ERR_IF(!b->bm_pages) return 1;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (bm_is_locked(b))
+ bm_print_lock_info(mdev);
+ for (bitnr = s; bitnr <= e; bitnr++) {
+ w = bitnr >> LN2_BPL;
+ if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
+ page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+ if (p_addr)
+ bm_unmap(p_addr);
+ p_addr = bm_map_paddr(b, w);
+ }
+ ERR_IF (bitnr >= b->bm_bits) {
+ dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+ } else {
+ c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+ }
+ }
+ if (p_addr)
+ bm_unmap(p_addr);
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return c;
+}
+
+
+/* inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only use during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ int count, s, e;
+ unsigned long flags;
+ unsigned long *p_addr, *bm;
+
+ ERR_IF(!b) return 0;
+ ERR_IF(!b->bm_pages) return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (bm_is_locked(b))
+ bm_print_lock_info(mdev);
+
+ s = S2W(enr);
+ e = min((size_t)S2W(enr+1), b->bm_words);
+ count = 0;
+ if (s < b->bm_words) {
+ int n = e-s;
+ p_addr = bm_map_paddr(b, s);
+ bm = p_addr + MLPP(s);
+ while (n--)
+ count += hweight_long(*bm++);
+ bm_unmap(p_addr);
+ } else {
+ dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+ }
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return count;
+}
+
+/* set all bits covered by the AL-extent al_enr */
+unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
+{
+ struct drbd_bitmap *b = mdev->bitmap;
+ unsigned long *p_addr, *bm;
+ unsigned long weight;
+ int count, s, e, i, do_now;
+ ERR_IF(!b) return 0;
+ ERR_IF(!b->bm_pages) return 0;
+
+ spin_lock_irq(&b->bm_lock);
+ if (bm_is_locked(b))
+ bm_print_lock_info(mdev);
+ weight = b->bm_set;
+
+ s = al_enr * BM_WORDS_PER_AL_EXT;
+ e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
+ /* assert that s and e are on the same page */
+ D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
+ == s >> (PAGE_SHIFT - LN2_BPL + 3));
+ count = 0;
+ if (s < b->bm_words) {
+ i = do_now = e-s;
+ p_addr = bm_map_paddr(b, s);
+ bm = p_addr + MLPP(s);
+ while (i--) {
+ count += hweight_long(*bm);
+ *bm = -1UL;
+ bm++;
+ }
+ bm_unmap(p_addr);
+ b->bm_set += do_now*BITS_PER_LONG - count;
+ if (e == b->bm_words)
+ b->bm_set -= bm_clear_surplus(b);
+ } else {
+ dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
+ }
+ weight = b->bm_set - weight;
+ spin_unlock_irq(&b->bm_lock);
+ return weight;
+}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000000..2312d782fe99
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2252 @@
+/*
+ drbd_int.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/ratelimit.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+
+#ifdef __CHECKER__
+# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern int disable_sendpage;
+extern int allow_oos;
+extern unsigned int cn_idx;
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt sending the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+/* All EEs on the free list should have ID_VACANT (== 0)
+ * freshly allocated EEs get !ID_VACANT (== 1)
+ * so if it says "cannot dereference null pointer at adress 0x00000001",
+ * it is most likely one of these :( */
+
+#define ID_IN_SYNC (4711ULL)
+#define ID_OUT_OF_SYNC (4712ULL)
+
+#define ID_SYNCER (-1ULL)
+#define ID_VACANT 0
+#define is_syncer_block_id(id) ((id) == ID_SYNCER)
+
+struct drbd_conf;
+
+
+/* to shorten dev_warn(DEV, "msg"); and relatives statements */
+#define DEV (disk_to_dev(mdev->vdisk))
+
+#define D_ASSERT(exp) if (!(exp)) \
+ dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
+
+#define ERR_IF(exp) if (({ \
+ int _b = (exp) != 0; \
+ if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \
+ __func__, #exp, __FILE__, __LINE__); \
+ _b; \
+ }))
+
+/* Defines to control fault insertion */
+enum {
+ DRBD_FAULT_MD_WR = 0, /* meta data write */
+ DRBD_FAULT_MD_RD = 1, /* read */
+ DRBD_FAULT_RS_WR = 2, /* resync */
+ DRBD_FAULT_RS_RD = 3,
+ DRBD_FAULT_DT_WR = 4, /* data */
+ DRBD_FAULT_DT_RD = 5,
+ DRBD_FAULT_DT_RA = 6, /* data read ahead */
+ DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
+ DRBD_FAULT_AL_EE = 8, /* alloc ee */
+
+ DRBD_FAULT_MAX,
+};
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
+static inline int
+drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
+ return fault_rate &&
+ (enable_faults & (1<<type)) &&
+ _drbd_insert_fault(mdev, type);
+}
+#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
+
+#else
+#define FAULT_ACTIVE(_m, _t) (0)
+#endif
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+/* 4th incarnation of the disk layout. */
+#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
+
+extern struct drbd_conf **minor_table;
+extern struct ratelimit_state drbd_ratelimit_state;
+
+/* on the wire */
+enum drbd_packets {
+ /* receiver (data socket) */
+ P_DATA = 0x00,
+ P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
+ P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
+ P_BARRIER = 0x03,
+ P_BITMAP = 0x04,
+ P_BECOME_SYNC_TARGET = 0x05,
+ P_BECOME_SYNC_SOURCE = 0x06,
+ P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
+ P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
+ P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
+ P_SYNC_PARAM = 0x0a,
+ P_PROTOCOL = 0x0b,
+ P_UUIDS = 0x0c,
+ P_SIZES = 0x0d,
+ P_STATE = 0x0e,
+ P_SYNC_UUID = 0x0f,
+ P_AUTH_CHALLENGE = 0x10,
+ P_AUTH_RESPONSE = 0x11,
+ P_STATE_CHG_REQ = 0x12,
+
+ /* asender (meta socket */
+ P_PING = 0x13,
+ P_PING_ACK = 0x14,
+ P_RECV_ACK = 0x15, /* Used in protocol B */
+ P_WRITE_ACK = 0x16, /* Used in protocol C */
+ P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+ P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
+ P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
+ P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
+ P_BARRIER_ACK = 0x1c,
+ P_STATE_CHG_REPLY = 0x1d,
+
+ /* "new" commands, no longer fitting into the ordering scheme above */
+
+ P_OV_REQUEST = 0x1e, /* data socket */
+ P_OV_REPLY = 0x1f,
+ P_OV_RESULT = 0x20, /* meta socket */
+ P_CSUM_RS_REQUEST = 0x21, /* data socket */
+ P_RS_IS_IN_SYNC = 0x22, /* meta socket */
+ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
+
+ P_MAX_CMD = 0x25,
+ P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+ P_MAX_OPT_CMD = 0x101,
+
+ /* special command ids for handshake */
+
+ P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
+ P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
+
+ P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
+};
+
+static inline const char *cmdname(enum drbd_packets cmd)
+{
+ /* THINK may need to become several global tables
+ * when we want to support more than
+ * one PRO_VERSION */
+ static const char *cmdnames[] = {
+ [P_DATA] = "Data",
+ [P_DATA_REPLY] = "DataReply",
+ [P_RS_DATA_REPLY] = "RSDataReply",
+ [P_BARRIER] = "Barrier",
+ [P_BITMAP] = "ReportBitMap",
+ [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
+ [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
+ [P_UNPLUG_REMOTE] = "UnplugRemote",
+ [P_DATA_REQUEST] = "DataRequest",
+ [P_RS_DATA_REQUEST] = "RSDataRequest",
+ [P_SYNC_PARAM] = "SyncParam",
+ [P_SYNC_PARAM89] = "SyncParam89",
+ [P_PROTOCOL] = "ReportProtocol",
+ [P_UUIDS] = "ReportUUIDs",
+ [P_SIZES] = "ReportSizes",
+ [P_STATE] = "ReportState",
+ [P_SYNC_UUID] = "ReportSyncUUID",
+ [P_AUTH_CHALLENGE] = "AuthChallenge",
+ [P_AUTH_RESPONSE] = "AuthResponse",
+ [P_PING] = "Ping",
+ [P_PING_ACK] = "PingAck",
+ [P_RECV_ACK] = "RecvAck",
+ [P_WRITE_ACK] = "WriteAck",
+ [P_RS_WRITE_ACK] = "RSWriteAck",
+ [P_DISCARD_ACK] = "DiscardAck",
+ [P_NEG_ACK] = "NegAck",
+ [P_NEG_DREPLY] = "NegDReply",
+ [P_NEG_RS_DREPLY] = "NegRSDReply",
+ [P_BARRIER_ACK] = "BarrierAck",
+ [P_STATE_CHG_REQ] = "StateChgRequest",
+ [P_STATE_CHG_REPLY] = "StateChgReply",
+ [P_OV_REQUEST] = "OVRequest",
+ [P_OV_REPLY] = "OVReply",
+ [P_OV_RESULT] = "OVResult",
+ [P_MAX_CMD] = NULL,
+ };
+
+ if (cmd == P_HAND_SHAKE_M)
+ return "HandShakeM";
+ if (cmd == P_HAND_SHAKE_S)
+ return "HandShakeS";
+ if (cmd == P_HAND_SHAKE)
+ return "HandShake";
+ if (cmd >= P_MAX_CMD)
+ return "Unknown";
+ return cmdnames[cmd];
+}
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+ /* "const"
+ * stores total bits and long words
+ * of the bitmap, so we don't need to
+ * call the accessor functions over and again. */
+ unsigned long bm_bits;
+ unsigned long bm_words;
+ /* during xfer, current position within the bitmap */
+ unsigned long bit_offset;
+ unsigned long word_offset;
+
+ /* statistics; index: (h->command == P_BITMAP) */
+ unsigned packets[2];
+ unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+ const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+ /* word_offset counts "native long words" (32 or 64 bit),
+ * aligned at 64 bit.
+ * Encoded packet may end at an unaligned bit offset.
+ * In case a fallback clear text packet is transmitted in
+ * between, we adjust this offset back to the last 64bit
+ * aligned "native long word", which makes coding and decoding
+ * the plain text bitmap much more convenient. */
+#if BITS_PER_LONG == 64
+ c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+ c->word_offset = c->bit_offset >> 5;
+ c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ * (except block_id and barrier fields.
+ * these are pointers to local structs
+ * and have no relevance for the partner,
+ * which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header {
+ u32 magic;
+ u16 command;
+ u16 length; /* bytes of data after this header */
+ u8 payload[0];
+} __packed;
+/* 8 bytes. packet FIXED for the next century! */
+
+/*
+ * short commands, packets without payload, plain p_header:
+ * P_PING
+ * P_PING_ACK
+ * P_BECOME_SYNC_TARGET
+ * P_BECOME_SYNC_SOURCE
+ * P_UNPLUG_REMOTE
+ */
+
+/*
+ * commands with out-of-struct payload:
+ * P_BITMAP (no additional fields)
+ * P_DATA, P_DATA_REPLY (see p_data)
+ * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
+ */
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER 1
+#define DP_RW_SYNC 2
+#define DP_MAY_SET_IN_SYNC 4
+
+struct p_data {
+ struct p_header head;
+ u64 sector; /* 64 bits sector number */
+ u64 block_id; /* to identify the request in protocol B&C */
+ u32 seq_num;
+ u32 dp_flags;
+} __packed;
+
+/*
+ * commands which share a struct:
+ * p_block_ack:
+ * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ * P_DISCARD_ACK (proto C, two-primaries conflict detection)
+ * p_block_req:
+ * P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+ struct p_header head;
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 seq_num;
+} __packed;
+
+
+struct p_block_req {
+ struct p_header head;
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ * P_HAND_SHAKE
+ * P_BARRIER
+ * P_BARRIER_ACK
+ * P_SYNC_PARAM
+ * ReportParams
+ */
+
+struct p_handshake {
+ struct p_header head; /* 8 bytes */
+ u32 protocol_min;
+ u32 feature_flags;
+ u32 protocol_max;
+
+ /* should be more than enough for future enhancements
+ * for now, feature_flags and the reserverd array shall be zero.
+ */
+
+ u32 _pad;
+ u64 reserverd[7];
+} __packed;
+/* 80 bytes, FIXED for the next century */
+
+struct p_barrier {
+ struct p_header head;
+ u32 barrier; /* barrier number _handle_ only */
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+ struct p_header head;
+ u32 barrier;
+ u32 set_size;
+} __packed;
+
+struct p_rs_param {
+ struct p_header head;
+ u32 rate;
+
+ /* Since protocol version 88 and higher. */
+ char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+ struct p_header head;
+ u32 rate;
+ /* protocol version 89: */
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_protocol {
+ struct p_header head;
+ u32 protocol;
+ u32 after_sb_0p;
+ u32 after_sb_1p;
+ u32 after_sb_2p;
+ u32 want_lose;
+ u32 two_primaries;
+
+ /* Since protocol version 87 and higher. */
+ char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+ struct p_header head;
+ u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+ struct p_header head;
+ u64 uuid;
+} __packed;
+
+struct p_sizes {
+ struct p_header head;
+ u64 d_size; /* size of disk */
+ u64 u_size; /* user requested size */
+ u64 c_size; /* current exported size */
+ u32 max_segment_size; /* Maximal size of a BIO */
+ u32 queue_order_type;
+} __packed;
+
+struct p_state {
+ struct p_header head;
+ u32 state;
+} __packed;
+
+struct p_req_state {
+ struct p_header head;
+ u32 mask;
+ u32 val;
+} __packed;
+
+struct p_req_state_reply {
+ struct p_header head;
+ u32 retcode;
+} __packed;
+
+struct p_drbd06_param {
+ u64 size;
+ u32 state;
+ u32 blksize;
+ u32 protocol;
+ u32 version;
+ u32 gen_cnt[5];
+ u32 bit_map_gen[5];
+} __packed;
+
+struct p_discard {
+ struct p_header head;
+ u64 block_id;
+ u32 seq_num;
+ u32 pad;
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+ /* RLE_VLI_Bytes = 0,
+ * and other bit variants had been defined during
+ * algorithm evaluation. */
+ RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+ struct p_header head;
+ /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+ * (encoding & 0x80): polarity (set/unset) of first runlength
+ * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+ * used to pad up to head.length bytes
+ */
+ u8 encoding;
+
+ u8 code[0];
+} __packed;
+
+/* DCBP: Drbd Compressed Bitmap Packet ... */
+static inline enum drbd_bitmap_code
+DCBP_get_code(struct p_compressed_bm *p)
+{
+ return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static inline void
+DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+ BUG_ON(code & ~0xf);
+ p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static inline int
+DCBP_get_start(struct p_compressed_bm *p)
+{
+ return (p->encoding & 0x80) != 0;
+}
+
+static inline void
+DCBP_set_start(struct p_compressed_bm *p, int set)
+{
+ p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static inline int
+DCBP_get_pad_bits(struct p_compressed_bm *p)
+{
+ return (p->encoding >> 4) & 0x7;
+}
+
+static inline void
+DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+ BUG_ON(n & ~0x7);
+ p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
+
+/* one bitmap packet, including the p_header,
+ * should fit within one _architecture independend_ page.
+ * so we need to use the fixed size 4KiB page size
+ * most architechtures have used for a long time.
+ */
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
+#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
+#if (PAGE_SIZE < 4096)
+/* drbd_send_bitmap / receive_bitmap would break horribly */
+#error "PAGE_SIZE too small"
+#endif
+
+union p_polymorph {
+ struct p_header header;
+ struct p_handshake handshake;
+ struct p_data data;
+ struct p_block_ack block_ack;
+ struct p_barrier barrier;
+ struct p_barrier_ack barrier_ack;
+ struct p_rs_param_89 rs_param_89;
+ struct p_protocol protocol;
+ struct p_sizes sizes;
+ struct p_uuids uuids;
+ struct p_state state;
+ struct p_req_state req_state;
+ struct p_req_state_reply req_state_reply;
+ struct p_block_req block_req;
+} __packed;
+
+/**********************************************************************/
+enum drbd_thread_state {
+ None,
+ Running,
+ Exiting,
+ Restarting
+};
+
+struct drbd_thread {
+ spinlock_t t_lock;
+ struct task_struct *task;
+ struct completion stop;
+ enum drbd_thread_state t_state;
+ int (*function) (struct drbd_thread *);
+ struct drbd_conf *mdev;
+ int reset_cpu_mask;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+ /* THINK testing the t_state seems to be uncritical in all cases
+ * (but thread_{start,stop}), so we can read it *without* the lock.
+ * --lge */
+
+ smp_rmb();
+ return thi->t_state;
+}
+
+
+/*
+ * Having this as the first member of a struct provides sort of "inheritance".
+ * "derived" structs can be "drbd_queue_work()"ed.
+ * The callback should know and cast back to the descendant struct.
+ * drbd_request and drbd_epoch_entry are descendants of drbd_work.
+ */
+struct drbd_work;
+typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
+struct drbd_work {
+ struct list_head list;
+ drbd_work_cb cb;
+};
+
+struct drbd_tl_epoch;
+struct drbd_request {
+ struct drbd_work w;
+ struct drbd_conf *mdev;
+
+ /* if local IO is not allowed, will be NULL.
+ * if local IO _is_ allowed, holds the locally submitted bio clone,
+ * or, after local IO completion, the ERR_PTR(error).
+ * see drbd_endio_pri(). */
+ struct bio *private_bio;
+
+ struct hlist_node colision;
+ sector_t sector;
+ unsigned int size;
+ unsigned int epoch; /* barrier_nr */
+
+ /* barrier_nr: used to check on "completion" whether this req was in
+ * the current epoch, and we therefore have to close it,
+ * starting a new epoch...
+ */
+
+ /* up to here, the struct layout is identical to drbd_epoch_entry;
+ * we might be able to use that to our advantage... */
+
+ struct list_head tl_requests; /* ring list in the transfer log */
+ struct bio *master_bio; /* master bio pointer */
+ unsigned long rq_state; /* see comments above _req_mod() */
+ int seq_num;
+ unsigned long start_time;
+};
+
+struct drbd_tl_epoch {
+ struct drbd_work w;
+ struct list_head requests; /* requests before */
+ struct drbd_tl_epoch *next; /* pointer to the next barrier */
+ unsigned int br_number; /* the barriers identifier. */
+ int n_req; /* number of requests attached before this barrier */
+};
+
+struct drbd_request;
+
+/* These Tl_epoch_entries may be in one of 6 lists:
+ active_ee .. data packet being written
+ sync_ee .. syncer block being written
+ done_ee .. block written, need to send P_WRITE_ACK
+ read_ee .. [RS]P_DATA_REQUEST being read
+*/
+
+struct drbd_epoch {
+ struct list_head list;
+ unsigned int barrier_nr;
+ atomic_t epoch_size; /* increased on every request added. */
+ atomic_t active; /* increased on every req. added, and dec on every finished. */
+ unsigned long flags;
+};
+
+/* drbd_epoch flag bits */
+enum {
+ DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+ DE_BARRIER_IN_NEXT_EPOCH_DONE,
+ DE_CONTAINS_A_BARRIER,
+ DE_HAVE_BARRIER_NUMBER,
+ DE_IS_FINISHING,
+};
+
+enum epoch_event {
+ EV_PUT,
+ EV_GOT_BARRIER_NR,
+ EV_BARRIER_DONE,
+ EV_BECAME_LAST,
+ EV_CLEANUP = 32, /* used as flag */
+};
+
+struct drbd_epoch_entry {
+ struct drbd_work w;
+ struct drbd_conf *mdev;
+ struct bio *private_bio;
+ struct hlist_node colision;
+ sector_t sector;
+ unsigned int size;
+ struct drbd_epoch *epoch;
+
+ /* up to here, the struct layout is identical to drbd_request;
+ * we might be able to use that to our advantage... */
+
+ unsigned int flags;
+ u64 block_id;
+};
+
+struct drbd_wq_barrier {
+ struct drbd_work w;
+ struct completion done;
+};
+
+struct digest_info {
+ int digest_size;
+ void *digest;
+};
+
+/* ee flag bits */
+enum {
+ __EE_CALL_AL_COMPLETE_IO,
+ __EE_CONFLICT_PENDING,
+ __EE_MAY_SET_IN_SYNC,
+ __EE_IS_BARRIER,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
+#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
+
+/* global flag bits */
+enum {
+ CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */
+ SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+ SEND_PING, /* whether asender should send a ping asap */
+
+ STOP_SYNC_TIMER, /* tell timer to cancel itself */
+ UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
+ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
+ MD_DIRTY, /* current uuids and flags not yet on disk */
+ DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
+ USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
+ CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
+ CL_ST_CHG_SUCCESS,
+ CL_ST_CHG_FAIL,
+ CRASHED_PRIMARY, /* This node was a crashed primary.
+ * Gets cleared when the state.conn
+ * goes into C_CONNECTED state. */
+ WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
+ NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
+ CONSIDER_RESYNC,
+
+ MD_NO_BARRIER, /* meta data device does not support barriers,
+ so don't even try */
+ SUSPEND_IO, /* suspend application io */
+ BITMAP_IO, /* suspend application io;
+ once no more io in flight, start bitmap io */
+ BITMAP_IO_QUEUED, /* Started bitmap IO */
+ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
+ NET_CONGESTED, /* The data socket is congested */
+
+ CONFIG_PENDING, /* serialization of (re)configuration requests.
+ * if set, also prevents the device from dying */
+ DEVICE_DYING, /* device became unconfigured,
+ * but worker thread is still handling the cleanup.
+ * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
+ * while this is set. */
+ RESIZE_PENDING, /* Size change detected locally, waiting for the response from
+ * the peer, if it changed there as well. */
+};
+
+struct drbd_bitmap; /* opaque for drbd_conf */
+
+/* TODO sort members for performance
+ * MAYBE group them further */
+
+/* THINK maybe we actually want to use the default "event/%s" worker threads
+ * or similar in linux 2.6, which uses per cpu data and threads.
+ *
+ * To be general, this might need a spin_lock member.
+ * For now, please use the mdev->req_lock to protect list_head,
+ * see drbd_queue_work below.
+ */
+struct drbd_work_queue {
+ struct list_head q;
+ struct semaphore s; /* producers up it, worker down()s it */
+ spinlock_t q_lock; /* to protect the list. */
+};
+
+struct drbd_socket {
+ struct drbd_work_queue work;
+ struct mutex mutex;
+ struct socket *socket;
+ /* this way we get our
+ * send/receive buffers off the stack */
+ union p_polymorph sbuf;
+ union p_polymorph rbuf;
+};
+
+struct drbd_md {
+ u64 md_offset; /* sector offset to 'super' block */
+
+ u64 la_size_sect; /* last agreed size, unit sectors */
+ u64 uuid[UI_SIZE];
+ u64 device_uuid;
+ u32 flags;
+ u32 md_size_sect;
+
+ s32 al_offset; /* signed relative sector offset to al area */
+ s32 bm_offset; /* signed relative sector offset to bitmap */
+
+ /* u32 al_nr_extents; important for restoring the AL
+ * is stored into sync_conf.al_extents, which in turn
+ * gets applied to act_log->nr_elements
+ */
+};
+
+/* for sync_conf and other types... */
+#define NL_PACKET(name, number, fields) struct name { fields };
+#define NL_INTEGER(pn,pr,member) int member;
+#define NL_INT64(pn,pr,member) __u64 member;
+#define NL_BIT(pn,pr,member) unsigned member:1;
+#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
+#include "linux/drbd_nl.h"
+
+struct drbd_backing_dev {
+ struct block_device *backing_bdev;
+ struct block_device *md_bdev;
+ struct file *lo_file;
+ struct file *md_file;
+ struct drbd_md md;
+ struct disk_conf dc; /* The user provided config... */
+ sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+ struct drbd_conf *mdev;
+ struct completion event;
+ int error;
+};
+
+struct bm_io_work {
+ struct drbd_work w;
+ char *why;
+ int (*io_fn)(struct drbd_conf *mdev);
+ void (*done)(struct drbd_conf *mdev, int rv);
+};
+
+enum write_ordering_e {
+ WO_none,
+ WO_drain_io,
+ WO_bdev_flush,
+ WO_bio_barrier
+};
+
+struct drbd_conf {
+ /* things that are stored as / read from meta data on disk */
+ unsigned long flags;
+
+ /* configured by drbdsetup */
+ struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
+ struct syncer_conf sync_conf;
+ struct drbd_backing_dev *ldev __protected_by(local);
+
+ sector_t p_size; /* partner's disk size */
+ struct request_queue *rq_queue;
+ struct block_device *this_bdev;
+ struct gendisk *vdisk;
+
+ struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+ struct drbd_socket meta; /* ping/ack (metadata) packets */
+ int agreed_pro_version; /* actually used protocol version */
+ unsigned long last_received; /* in jiffies, either socket */
+ unsigned int ko_count;
+ struct drbd_work resync_work,
+ unplug_work,
+ md_sync_work;
+ struct timer_list resync_timer;
+ struct timer_list md_sync_timer;
+
+ /* Used after attach while negotiating new disk state. */
+ union drbd_state new_state_tmp;
+
+ union drbd_state state;
+ wait_queue_head_t misc_wait;
+ wait_queue_head_t state_wait; /* upon each state change. */
+ unsigned int send_cnt;
+ unsigned int recv_cnt;
+ unsigned int read_cnt;
+ unsigned int writ_cnt;
+ unsigned int al_writ_cnt;
+ unsigned int bm_writ_cnt;
+ atomic_t ap_bio_cnt; /* Requests we need to complete */
+ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+ atomic_t unacked_cnt; /* Need to send replys for */
+ atomic_t local_cnt; /* Waiting for local completion */
+ atomic_t net_cnt; /* Users of net_conf */
+ spinlock_t req_lock;
+ struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
+ struct drbd_tl_epoch *newest_tle;
+ struct drbd_tl_epoch *oldest_tle;
+ struct list_head out_of_sequence_requests;
+ struct hlist_head *tl_hash;
+ unsigned int tl_hash_s;
+
+ /* blocks to sync in this run [unit BM_BLOCK_SIZE] */
+ unsigned long rs_total;
+ /* number of sync IOs that failed in this run */
+ unsigned long rs_failed;
+ /* Syncer's start time [unit jiffies] */
+ unsigned long rs_start;
+ /* cumulated time in PausedSyncX state [unit jiffies] */
+ unsigned long rs_paused;
+ /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+ unsigned long rs_mark_left;
+ /* marks's time [unit jiffies] */
+ unsigned long rs_mark_time;
+ /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+
+ /* where does the admin want us to start? (sector) */
+ sector_t ov_start_sector;
+ /* where are we now? (sector) */
+ sector_t ov_position;
+ /* Start sector of out of sync range (to merge printk reporting). */
+ sector_t ov_last_oos_start;
+ /* size of out-of-sync range in sectors. */
+ sector_t ov_last_oos_size;
+ unsigned long ov_left; /* in bits */
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *verify_tfm;
+
+ struct drbd_thread receiver;
+ struct drbd_thread worker;
+ struct drbd_thread asender;
+ struct drbd_bitmap *bitmap;
+ unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+ /* Used to track operations of resync... */
+ struct lru_cache *resync;
+ /* Number of locked elements in resync LRU */
+ unsigned int resync_locked;
+ /* resync extent number waiting for application requests */
+ unsigned int resync_wenr;
+
+ int open_cnt;
+ u64 *p_uuid;
+ struct drbd_epoch *current_epoch;
+ spinlock_t epoch_lock;
+ unsigned int epochs;
+ enum write_ordering_e write_ordering;
+ struct list_head active_ee; /* IO in progress */
+ struct list_head sync_ee; /* IO in progress */
+ struct list_head done_ee; /* send ack */
+ struct list_head read_ee; /* IO in progress */
+ struct list_head net_ee; /* zero-copy network send in progress */
+ struct hlist_head *ee_hash; /* is proteced by req_lock! */
+ unsigned int ee_hash_s;
+
+ /* this one is protected by ee_lock, single thread */
+ struct drbd_epoch_entry *last_write_w_barrier;
+
+ int next_barrier_nr;
+ struct hlist_head *app_reads_hash; /* is proteced by req_lock */
+ struct list_head resync_reads;
+ atomic_t pp_in_use;
+ wait_queue_head_t ee_wait;
+ struct page *md_io_page; /* one page buffer for md_io */
+ struct page *md_io_tmpp; /* for logical_block_size != 512 */
+ struct mutex md_io_mutex; /* protects the md_io_buffer */
+ spinlock_t al_lock;
+ wait_queue_head_t al_wait;
+ struct lru_cache *act_log; /* activity log */
+ unsigned int al_tr_number;
+ int al_tr_cycle;
+ int al_tr_pos; /* position of the next transaction in the journal */
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
+ struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
+ void *int_dig_out;
+ void *int_dig_in;
+ void *int_dig_vv;
+ wait_queue_head_t seq_wait;
+ atomic_t packet_seq;
+ unsigned int peer_seq;
+ spinlock_t peer_seq_lock;
+ unsigned int minor;
+ unsigned long comm_bm_set; /* communicated number of set bits. */
+ cpumask_var_t cpu_mask;
+ struct bm_io_work bm_io_work;
+ u64 ed_uuid; /* UUID of the exposed data */
+ struct mutex state_mutex;
+ char congestion_reason; /* Why we where congested... */
+};
+
+static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
+{
+ struct drbd_conf *mdev;
+
+ mdev = minor < minor_count ? minor_table[minor] : NULL;
+
+ return mdev;
+}
+
+static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
+{
+ return mdev->minor;
+}
+
+/* returns 1 if it was successfull,
+ * returns 0 if there was no data socket.
+ * so wherever you are going to use the data.socket, e.g. do
+ * if (!drbd_get_data_sock(mdev))
+ * return 0;
+ * CODE();
+ * drbd_put_data_sock(mdev);
+ */
+static inline int drbd_get_data_sock(struct drbd_conf *mdev)
+{
+ mutex_lock(&mdev->data.mutex);
+ /* drbd_disconnect() could have called drbd_free_sock()
+ * while we were waiting in down()... */
+ if (unlikely(mdev->data.socket == NULL)) {
+ mutex_unlock(&mdev->data.mutex);
+ return 0;
+ }
+ return 1;
+}
+
+static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+{
+ mutex_unlock(&mdev->data.mutex);
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum chg_state_flags {
+ CS_HARD = 1,
+ CS_VERBOSE = 2,
+ CS_WAIT_COMPLETE = 4,
+ CS_SERIALIZE = 8,
+ CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
+};
+
+extern void drbd_init_set_defaults(struct drbd_conf *mdev);
+extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+ union drbd_state mask, union drbd_state val);
+extern void drbd_force_state(struct drbd_conf *, union drbd_state,
+ union drbd_state);
+extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
+ union drbd_state, enum chg_state_flags);
+extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
+ enum chg_state_flags, struct completion *done);
+extern void print_st_err(struct drbd_conf *, union drbd_state,
+ union drbd_state, int);
+extern int drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
+extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+extern void drbd_free_resources(struct drbd_conf *mdev);
+extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+ unsigned int set_size);
+extern void tl_clear(struct drbd_conf *mdev);
+extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
+extern void drbd_free_sock(struct drbd_conf *mdev);
+extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_protocol(struct drbd_conf *mdev);
+extern int drbd_send_uuids(struct drbd_conf *mdev);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
+extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
+extern int _drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev);
+extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+ enum drbd_packets cmd, struct p_header *h,
+ size_t size, unsigned msg_flags);
+#define USE_DATA_SOCKET 1
+#define USE_META_SOCKET 0
+extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+ enum drbd_packets cmd, struct p_header *h,
+ size_t size);
+extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
+ char *data, size_t size);
+extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
+extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
+ u32 set_size);
+extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct drbd_epoch_entry *e);
+extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct p_block_req *rp);
+extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct p_data *dp);
+extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+ sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct drbd_epoch_entry *e);
+extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
+extern int _drbd_send_barrier(struct drbd_conf *mdev,
+ struct drbd_tl_epoch *barrier);
+extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+ sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
+ sector_t sector,int size,
+ void *digest, int digest_size,
+ enum drbd_packets cmd);
+extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
+
+extern int drbd_send_bitmap(struct drbd_conf *mdev);
+extern int _drbd_send_bitmap(struct drbd_conf *mdev);
+extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
+extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+extern void drbd_md_sync(struct drbd_conf *mdev);
+extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
+/* maybe define them below as inline? */
+extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+ int (*io_fn)(struct drbd_conf *),
+ void (*done)(struct drbd_conf *, int),
+ char *why);
+extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
+extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
+extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+
+
+/* Meta data layout
+ We reserve a 128MB Block (4k aligned)
+ * either at the end of the backing device
+ * or on a seperate meta data device. */
+
+#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
+/* The following numbers are sectors */
+#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
+#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
+/* Allows up to about 3.8TB */
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
+
+/* Since the smalles IO unit is usually 512 byte */
+#define MD_SECTOR_SHIFT 9
+#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
+
+/* activity log */
+#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
+#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+ int rs_left; /* number of bits set (out of sync) in this extent. */
+ int rs_failed; /* number of failed resync requests in this extent. */
+ unsigned long flags;
+ struct lc_element lce;
+};
+
+#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define BM_BLOCK_SHIFT 12 /* 4k per bit */
+#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
+/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
+ * per sector of on disk bitmap */
+#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
+#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
+
+/* how much _storage_ sectors we have per bitmap sector */
+#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
+#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+
+#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
+#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ * of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit 0 bit 37 bit 38 bit (512*8)-1
+ * ...|........|........|.. // ..|........|
+ * sect. 0 `296 `304 ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
+#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+#define DRBD_MAX_SECTORS_BM \
+ ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
+#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
+#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
+#endif
+#endif
+
+/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
+ * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * hash table. */
+#define HT_SHIFT 6
+#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+
+/* Number of elements in the app_reads_hash */
+#define APP_R_HSIZE 15
+
+extern int drbd_bm_init(struct drbd_conf *mdev);
+extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
+extern void drbd_bm_cleanup(struct drbd_conf *mdev);
+extern void drbd_bm_set_all(struct drbd_conf *mdev);
+extern void drbd_bm_clear_all(struct drbd_conf *mdev);
+extern int drbd_bm_set_bits(
+ struct drbd_conf *mdev, unsigned long s, unsigned long e);
+extern int drbd_bm_clear_bits(
+ struct drbd_conf *mdev, unsigned long s, unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock */
+extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
+ const unsigned long s, const unsigned long e);
+extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
+extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
+extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
+extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
+ unsigned long al_enr);
+extern size_t drbd_bm_words(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
+extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
+extern int drbd_bm_rs_done(struct drbd_conf *mdev);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
+ size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap and drbd_bm_write_sect */
+extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
+ size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
+extern void drbd_bm_unlock(struct drbd_conf *mdev);
+
+extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
+extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+extern struct page *drbd_pp_pool; /* drbd's page pool */
+extern spinlock_t drbd_pp_lock;
+extern int drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+extern rwlock_t global_state_lock;
+
+extern struct drbd_conf *drbd_new_device(unsigned int minor);
+extern void drbd_free_mdev(struct drbd_conf *mdev);
+
+extern int proc_details;
+
+/* drbd_req */
+extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_conf *mdev);
+extern void drbd_resume_io(struct drbd_conf *mdev);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_conf *,
+ struct drbd_backing_dev *);
+enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
+extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_conf *);
+extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
+extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
+ int force);
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
+
+/* drbd_worker.c */
+extern int drbd_worker(struct drbd_thread *thi);
+extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_conf *mdev);
+extern void suspend_other_sg(struct drbd_conf *mdev);
+extern int drbd_resync_finished(struct drbd_conf *mdev);
+/* maybe rather drbd_main.c ? */
+extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
+
+static inline void ov_oos_print(struct drbd_conf *mdev)
+{
+ if (mdev->ov_last_oos_size) {
+ dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
+ (unsigned long long)mdev->ov_last_oos_start,
+ (unsigned long)mdev->ov_last_oos_size);
+ }
+ mdev->ov_last_oos_size=0;
+}
+
+
+extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+/* worker callbacks */
+extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
+extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
+extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
+extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+
+/* drbd_receiver.c */
+extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
+extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+ u64 id,
+ sector_t sector,
+ unsigned int data_size,
+ gfp_t gfp_mask) __must_hold(local);
+extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+ struct list_head *head);
+extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+ struct list_head *head);
+extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
+extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+
+/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
+ * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int optlen)
+{
+ int err;
+ if (level == SOL_SOCKET)
+ err = sock_setsockopt(sock, level, optname, optval, optlen);
+ else
+ err = sock->ops->setsockopt(sock, level, optname, optval,
+ optlen);
+ return err;
+}
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+ int __user val = 1;
+ (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+ (char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+ int __user val = 0;
+ (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+ (char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+ int __user val = 1;
+ (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+ int __user val = 1;
+ (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+ (char __user *)&val, sizeof(val));
+}
+
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
+extern int drbd_rs_del_all(struct drbd_conf *mdev);
+extern void drbd_rs_failed_io(struct drbd_conf *mdev,
+ sector_t sector, int size);
+extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
+extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
+ int size, const char *file, const unsigned int line);
+#define drbd_set_in_sync(mdev, sector, size) \
+ __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
+ int size, const char *file, const unsigned int line);
+#define drbd_set_out_of_sync(mdev, sector, size) \
+ __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
+extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
+extern void drbd_al_shrink(struct drbd_conf *mdev);
+
+
+/* drbd_nl.c */
+
+void drbd_nl_cleanup(void);
+int __init drbd_nl_init(void);
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
+void drbd_bcast_sync_progress(struct drbd_conf *mdev);
+void drbd_bcast_ee(struct drbd_conf *mdev,
+ const char *reason, const int dgs,
+ const char* seen_hash, const char* calc_hash,
+ const struct drbd_epoch_entry* e);
+
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+
+#define NS(T, S) \
+ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+ D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+ D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+ D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+/*
+ * inline helper functions
+ *************************/
+
+static inline void drbd_state_lock(struct drbd_conf *mdev)
+{
+ wait_event(mdev->misc_wait,
+ !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+}
+
+static inline void drbd_state_unlock(struct drbd_conf *mdev)
+{
+ clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
+ wake_up(&mdev->misc_wait);
+}
+
+static inline int _drbd_set_state(struct drbd_conf *mdev,
+ union drbd_state ns, enum chg_state_flags flags,
+ struct completion *done)
+{
+ int rv;
+
+ read_lock(&global_state_lock);
+ rv = __drbd_set_state(mdev, ns, flags, done);
+ read_unlock(&global_state_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_conf *mdev,
+ union drbd_state mask,
+ union drbd_state val)
+{
+ return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+{
+ switch (mdev->ldev->dc.on_io_error) {
+ case EP_PASS_ON:
+ if (!forcedetach) {
+ if (printk_ratelimit())
+ dev_err(DEV, "Local IO failed in %s."
+ "Passing error on...\n", where);
+ break;
+ }
+ /* NOTE fall through to detach case if forcedetach set */
+ case EP_DETACH:
+ case EP_CALL_HELPER:
+ if (mdev->state.disk > D_FAILED) {
+ _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
+ dev_err(DEV, "Local IO failed in %s."
+ "Detaching...\n", where);
+ }
+ break;
+ }
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @mdev: DRBD device.
+ * @error: Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
+ int error, int forcedetach, const char *where)
+{
+ if (error) {
+ unsigned long flags;
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ __drbd_chk_io_error_(mdev, forcedetach, where);
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+ }
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev: Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->dc.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + bdev->md.bm_offset;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset;
+ }
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev: Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->dc.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + MD_AL_OFFSET - 1;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset + bdev->md.md_size_sect;
+ }
+}
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+ /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+ return bdev ? bdev->bd_inode->i_size >> 9 : 0;
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to out peer
+ * @bdev: Meta data block device.
+ *
+ * returns the capacity we announce to out peer. we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+ sector_t s;
+ switch (bdev->dc.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ s = drbd_get_capacity(bdev->backing_bdev)
+ ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+ drbd_md_first_sector(bdev))
+ : 0;
+ break;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+ drbd_get_capacity(bdev->backing_bdev));
+ /* clip at maximum size the meta device can support */
+ s = min_t(sector_t, s,
+ BM_EXT_TO_SECT(bdev->md.md_size_sect
+ - bdev->md.bm_offset));
+ break;
+ default:
+ s = min_t(sector_t, DRBD_MAX_SECTORS,
+ drbd_get_capacity(bdev->backing_bdev));
+ }
+ return s;
+}
+
+/**
+ * drbd_md_ss__() - Return the sector number of our meta data super block
+ * @mdev: DRBD device.
+ * @bdev: Meta data block device.
+ */
+static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev)
+{
+ switch (bdev->dc.meta_dev_idx) {
+ default: /* external, some index */
+ return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+ case DRBD_MD_INDEX_INTERNAL:
+ /* with drbd08, internal meta data is always "flexible" */
+ case DRBD_MD_INDEX_FLEX_INT:
+ /* sizeof(struct md_on_disk_07) == 4k
+ * position: last 4k aligned block of 4k size */
+ if (!bdev->backing_bdev) {
+ if (__ratelimit(&drbd_ratelimit_state)) {
+ dev_err(DEV, "bdev->backing_bdev==NULL\n");
+ dump_stack();
+ }
+ return 0;
+ }
+ return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
+ - MD_AL_OFFSET;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ return 0;
+ }
+}
+
+static inline void
+_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ list_add_tail(&w->list, &q->q);
+ up(&q->s);
+}
+
+static inline void
+drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&q->q_lock, flags);
+ list_add(&w->list, &q->q);
+ up(&q->s); /* within the spinlock,
+ see comment near end of drbd_worker() */
+ spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&q->q_lock, flags);
+ list_add_tail(&w->list, &q->q);
+ up(&q->s); /* within the spinlock,
+ see comment near end of drbd_worker() */
+ spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void wake_asender(struct drbd_conf *mdev)
+{
+ if (test_bit(SIGNAL_ASENDER, &mdev->flags))
+ force_sig(DRBD_SIG, mdev->asender.task);
+}
+
+static inline void request_ping(struct drbd_conf *mdev)
+{
+ set_bit(SEND_PING, &mdev->flags);
+ wake_asender(mdev);
+}
+
+static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
+ enum drbd_packets cmd)
+{
+ struct p_header h;
+ return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping(struct drbd_conf *mdev)
+{
+ struct p_header h;
+ return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
+{
+ struct p_header h;
+ return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
+}
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, FALSE, TRUE);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, FALSE, FALSE);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, TRUE, FALSE);
+}
+
+/* counts how many answer packets packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ * w_send_barrier
+ * _req_mod(req, queue_for_net_write or queue_for_net_read);
+ * it is much easier and equally valid to count what we queue for the
+ * worker, even before it actually was queued or send.
+ * (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ * got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ * _req_mod(req, data_received)
+ * [from receive_DataReply]
+ * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ * for some reason it is NOT decreased in got_NegAck,
+ * but in the resulting cleanup code from report_params.
+ * we should try to remember the reason for that...
+ * _req_mod(req, send_failed or send_canceled)
+ * _req_mod(req, connection_lost_while_pending)
+ * [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_conf *mdev)
+{
+ atomic_inc(&mdev->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which) \
+ if (atomic_read(&mdev->which) < 0) \
+ dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
+ __func__ , __LINE__ , \
+ atomic_read(&mdev->which))
+
+#define dec_ap_pending(mdev) do { \
+ typecheck(struct drbd_conf *, mdev); \
+ if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
+ wake_up(&mdev->misc_wait); \
+ ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+
+/* counts how many resync-related answers we still expect from the peer
+ * increase decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER)
+ * (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_conf *mdev)
+{
+ atomic_inc(&mdev->rs_pending_cnt);
+}
+
+#define dec_rs_pending(mdev) do { \
+ typecheck(struct drbd_conf *, mdev); \
+ atomic_dec(&mdev->rs_pending_cnt); \
+ ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ * receive_Data unless protocol A;
+ * we need to send a P_RECV_ACK (proto B)
+ * or P_WRITE_ACK (proto C)
+ * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ * receive_Barrier_* we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_conf *mdev)
+{
+ atomic_inc(&mdev->unacked_cnt);
+}
+
+#define dec_unacked(mdev) do { \
+ typecheck(struct drbd_conf *, mdev); \
+ atomic_dec(&mdev->unacked_cnt); \
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+#define sub_unacked(mdev, n) do { \
+ typecheck(struct drbd_conf *, mdev); \
+ atomic_sub(n, &mdev->unacked_cnt); \
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+
+static inline void put_net_conf(struct drbd_conf *mdev)
+{
+ if (atomic_dec_and_test(&mdev->net_cnt))
+ wake_up(&mdev->misc_wait);
+}
+
+/**
+ * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
+ * @mdev: DRBD device.
+ *
+ * You have to call put_net_conf() when finished working with mdev->net_conf.
+ */
+static inline int get_net_conf(struct drbd_conf *mdev)
+{
+ int have_net_conf;
+
+ atomic_inc(&mdev->net_cnt);
+ have_net_conf = mdev->state.conn >= C_UNCONNECTED;
+ if (!have_net_conf)
+ put_net_conf(mdev);
+ return have_net_conf;
+}
+
+/**
+ * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
+ * @M: DRBD device.
+ *
+ * You have to call put_ldev() when finished working with mdev->ldev.
+ */
+#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
+#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
+
+static inline void put_ldev(struct drbd_conf *mdev)
+{
+ __release(local);
+ if (atomic_dec_and_test(&mdev->local_cnt))
+ wake_up(&mdev->misc_wait);
+ D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+ int io_allowed;
+
+ atomic_inc(&mdev->local_cnt);
+ io_allowed = (mdev->state.disk >= mins);
+ if (!io_allowed)
+ put_ldev(mdev);
+ return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
+#endif
+
+/* you must have an "get_ldev" reference */
+static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
+ unsigned long *bits_left, unsigned int *per_mil_done)
+{
+ /*
+ * this is to break it at compile time when we change that
+ * (we may feel 4TB maximum storage per drbd is not enough)
+ */
+ typecheck(unsigned long, mdev->rs_total);
+
+ /* note: both rs_total and rs_left are in bits, i.e. in
+ * units of BM_BLOCK_SIZE.
+ * for the percentage, we don't care. */
+
+ *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+ /* >> 10 to prevent overflow,
+ * +1 to prevent division by zero */
+ if (*bits_left > mdev->rs_total) {
+ /* doh. maybe a logic bug somewhere.
+ * may also be just a race condition
+ * between this and a disconnect during sync.
+ * for now, just prevent in-kernel buffer overflow.
+ */
+ smp_rmb();
+ dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+ drbd_conn_str(mdev->state.conn),
+ *bits_left, mdev->rs_total, mdev->rs_failed);
+ *per_mil_done = 0;
+ } else {
+ /* make sure the calculation happens in long context */
+ unsigned long tmp = 1000UL -
+ (*bits_left >> 10)*1000UL
+ / ((mdev->rs_total >> 10) + 1UL);
+ *per_mil_done = tmp;
+ }
+}
+
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
+{
+ int mxb = 1000000; /* arbitrary limit on open requests */
+ if (get_net_conf(mdev)) {
+ mxb = mdev->net_conf->max_buffers;
+ put_net_conf(mdev);
+ }
+ return mxb;
+}
+
+static inline int drbd_state_is_stable(union drbd_state s)
+{
+
+ /* DO NOT add a default clause, we want the compiler to warn us
+ * for any newly introduced state we may have forgotten to add here */
+
+ switch ((enum drbd_conns)s.conn) {
+ /* new io only accepted when there is no connection, ... */
+ case C_STANDALONE:
+ case C_WF_CONNECTION:
+ /* ... or there is a well established connection. */
+ case C_CONNECTED:
+ case C_SYNC_SOURCE:
+ case C_SYNC_TARGET:
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ case C_PAUSED_SYNC_S:
+ case C_PAUSED_SYNC_T:
+ /* maybe stable, look at the disk state */
+ break;
+
+ /* no new io accepted during tansitional states
+ * like handshake or teardown */
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_REPORT_PARAMS:
+ case C_STARTING_SYNC_S:
+ case C_STARTING_SYNC_T:
+ case C_WF_BITMAP_S:
+ case C_WF_BITMAP_T:
+ case C_WF_SYNC_UUID:
+ case C_MASK:
+ /* not "stable" */
+ return 0;
+ }
+
+ switch ((enum drbd_disk_state)s.disk) {
+ case D_DISKLESS:
+ case D_INCONSISTENT:
+ case D_OUTDATED:
+ case D_CONSISTENT:
+ case D_UP_TO_DATE:
+ /* disk state is stable as well. */
+ break;
+
+ /* no new io accepted during tansitional states */
+ case D_ATTACHING:
+ case D_FAILED:
+ case D_NEGOTIATING:
+ case D_UNKNOWN:
+ case D_MASK:
+ /* not "stable" */
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
+{
+ int mxb = drbd_get_max_buffers(mdev);
+
+ if (mdev->state.susp)
+ return 0;
+ if (test_bit(SUSPEND_IO, &mdev->flags))
+ return 0;
+
+ /* to avoid potential deadlock or bitmap corruption,
+ * in various places, we only allow new application io
+ * to start during "stable" states. */
+
+ /* no new io accepted when attaching or detaching the disk */
+ if (!drbd_state_is_stable(mdev->state))
+ return 0;
+
+ /* since some older kernels don't have atomic_add_unless,
+ * and we are within the spinlock anyways, we have this workaround. */
+ if (atomic_read(&mdev->ap_bio_cnt) > mxb)
+ return 0;
+ if (test_bit(BITMAP_IO, &mdev->flags))
+ return 0;
+ return 1;
+}
+
+/* I'd like to use wait_event_lock_irq,
+ * but I'm not sure when it got introduced,
+ * and not sure when it has 3 or 4 arguments */
+static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
+{
+ /* compare with after_state_ch,
+ * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
+ DEFINE_WAIT(wait);
+
+ /* we wait here
+ * as long as the device is suspended
+ * until the bitmap is no longer on the fly during connection
+ * handshake as long as we would exeed the max_buffer limit.
+ *
+ * to avoid races with the reconnect code,
+ * we need to atomic_inc within the spinlock. */
+
+ spin_lock_irq(&mdev->req_lock);
+ while (!__inc_ap_bio_cond(mdev)) {
+ prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&mdev->req_lock);
+ schedule();
+ finish_wait(&mdev->misc_wait, &wait);
+ spin_lock_irq(&mdev->req_lock);
+ }
+ atomic_add(one_or_two, &mdev->ap_bio_cnt);
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+static inline void dec_ap_bio(struct drbd_conf *mdev)
+{
+ int mxb = drbd_get_max_buffers(mdev);
+ int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
+
+ D_ASSERT(ap_bio >= 0);
+ /* this currently does wake_up for every dec_ap_bio!
+ * maybe rather introduce some type of hysteresis?
+ * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+ if (ap_bio < mxb)
+ wake_up(&mdev->misc_wait);
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ }
+}
+
+static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
+{
+ mdev->ed_uuid = val;
+}
+
+static inline int seq_cmp(u32 a, u32 b)
+{
+ /* we assume wrap around at 32bit.
+ * for wrap around at 24bit (old atomic_t),
+ * we'd have to
+ * a <<= 8; b <<= 8;
+ */
+ return (s32)(a) - (s32)(b);
+}
+#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
+#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
+#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
+#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
+/* CAUTION: please no side effects in arguments! */
+#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
+
+static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
+{
+ unsigned int m;
+ spin_lock(&mdev->peer_seq_lock);
+ m = seq_max(mdev->peer_seq, new_seq);
+ mdev->peer_seq = m;
+ spin_unlock(&mdev->peer_seq_lock);
+ if (m == new_seq)
+ wake_up(&mdev->seq_wait);
+}
+
+static inline void drbd_update_congested(struct drbd_conf *mdev)
+{
+ struct sock *sk = mdev->data.socket->sk;
+ if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+ set_bit(NET_CONGESTED, &mdev->flags);
+}
+
+static inline int drbd_queue_order_type(struct drbd_conf *mdev)
+{
+ /* sorry, we currently have no working implementation
+ * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+ return QUEUE_ORDERED_NONE;
+}
+
+static inline void drbd_blk_run_queue(struct request_queue *q)
+{
+ if (q && q->unplug_fn)
+ q->unplug_fn(q);
+}
+
+static inline void drbd_kick_lo(struct drbd_conf *mdev)
+{
+ if (get_ldev(mdev)) {
+ drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
+ put_ldev(mdev);
+ }
+}
+
+static inline void drbd_md_flush(struct drbd_conf *mdev)
+{
+ int r;
+
+ if (test_bit(MD_NO_BARRIER, &mdev->flags))
+ return;
+
+ r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+ if (r) {
+ set_bit(MD_NO_BARRIER, &mdev->flags);
+ dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
+ }
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000000..157d1e4343c2
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3699 @@
+/*
+ drbd.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+
+#include "drbd_vli.h"
+
+struct after_state_chg_work {
+ struct drbd_work w;
+ union drbd_state os;
+ union drbd_state ns;
+ enum chg_state_flags flags;
+ struct completion *done;
+};
+
+int drbdd_init(struct drbd_thread *);
+int drbd_worker(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+int drbd_init(void);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static int drbd_release(struct gendisk *gd, fmode_t mode);
+static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags);
+static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void md_sync_timer_fn(unsigned long data);
+static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+ "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(cn_idx, uint, 0444);
+module_param(proc_details, int, 0644);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = 32;
+int disable_sendpage;
+int allow_oos;
+unsigned int cn_idx = CN_IDX_DRBD;
+int proc_details; /* Detail level in proc drbd*/
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct drbd_conf **minor_table;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache; /* epoch entries */
+struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+
+/* I do not use a standard mempool, because:
+ 1) I want to hand out the pre-allocated objects first.
+ 2) I want to be able to interrupt sleeping allocation with a signal.
+ Note: This is a single linked list, the next pointer is the private
+ member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t drbd_pp_lock;
+int drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static struct block_device_operations drbd_ops = {
+ .owner = THIS_MODULE,
+ .open = drbd_open,
+ .release = drbd_release,
+};
+
+#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+ give tons of false positives. When this is a real functions sparse works.
+ */
+int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+ int io_allowed;
+
+ atomic_inc(&mdev->local_cnt);
+ io_allowed = (mdev->state.disk >= mins);
+ if (!io_allowed) {
+ if (atomic_dec_and_test(&mdev->local_cnt))
+ wake_up(&mdev->misc_wait);
+ }
+ return io_allowed;
+}
+
+#endif
+
+/**
+ * DOC: The transfer log
+ *
+ * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
+ * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
+ * of the list. There is always at least one &struct drbd_tl_epoch object.
+ *
+ * Each &struct drbd_tl_epoch has a circular double linked list of requests
+ * attached.
+ */
+static int tl_init(struct drbd_conf *mdev)
+{
+ struct drbd_tl_epoch *b;
+
+ /* during device minor initialization, we may well use GFP_KERNEL */
+ b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
+ if (!b)
+ return 0;
+ INIT_LIST_HEAD(&b->requests);
+ INIT_LIST_HEAD(&b->w.list);
+ b->next = NULL;
+ b->br_number = 4711;
+ b->n_req = 0;
+ b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+
+ mdev->oldest_tle = b;
+ mdev->newest_tle = b;
+ INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+
+ mdev->tl_hash = NULL;
+ mdev->tl_hash_s = 0;
+
+ return 1;
+}
+
+static void tl_cleanup(struct drbd_conf *mdev)
+{
+ D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
+ D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+ kfree(mdev->oldest_tle);
+ mdev->oldest_tle = NULL;
+ kfree(mdev->unused_spare_tle);
+ mdev->unused_spare_tle = NULL;
+ kfree(mdev->tl_hash);
+ mdev->tl_hash = NULL;
+ mdev->tl_hash_s = 0;
+}
+
+/**
+ * _tl_add_barrier() - Adds a barrier to the transfer log
+ * @mdev: DRBD device.
+ * @new: Barrier to be added before the current head of the TL.
+ *
+ * The caller must hold the req_lock.
+ */
+void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
+{
+ struct drbd_tl_epoch *newest_before;
+
+ INIT_LIST_HEAD(&new->requests);
+ INIT_LIST_HEAD(&new->w.list);
+ new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+ new->next = NULL;
+ new->n_req = 0;
+
+ newest_before = mdev->newest_tle;
+ /* never send a barrier number == 0, because that is special-cased
+ * when using TCQ for our write ordering code */
+ new->br_number = (newest_before->br_number+1) ?: 1;
+ if (mdev->newest_tle != new) {
+ mdev->newest_tle->next = new;
+ mdev->newest_tle = new;
+ }
+}
+
+/**
+ * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
+ * @mdev: DRBD device.
+ * @barrier_nr: Expected identifier of the DRBD write barrier packet.
+ * @set_size: Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * &struct drbd_tl_epoch objects this function will cause a termination
+ * of the connection.
+ */
+void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+ unsigned int set_size)
+{
+ struct drbd_tl_epoch *b, *nob; /* next old barrier */
+ struct list_head *le, *tle;
+ struct drbd_request *r;
+
+ spin_lock_irq(&mdev->req_lock);
+
+ b = mdev->oldest_tle;
+
+ /* first some paranoia code */
+ if (b == NULL) {
+ dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
+ goto bail;
+ }
+ if (b->br_number != barrier_nr) {
+ dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, b->br_number);
+ goto bail;
+ }
+ if (b->n_req != set_size) {
+ dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
+ barrier_nr, set_size, b->n_req);
+ goto bail;
+ }
+
+ /* Clean up list of requests processed during current epoch */
+ list_for_each_safe(le, tle, &b->requests) {
+ r = list_entry(le, struct drbd_request, tl_requests);
+ _req_mod(r, barrier_acked);
+ }
+ /* There could be requests on the list waiting for completion
+ of the write to the local disk. To avoid corruptions of
+ slab's data structures we have to remove the lists head.
+
+ Also there could have been a barrier ack out of sequence, overtaking
+ the write acks - which would be a bug and violating write ordering.
+ To not deadlock in case we lose connection while such requests are
+ still pending, we need some way to find them for the
+ _req_mode(connection_lost_while_pending).
+
+ These have been list_move'd to the out_of_sequence_requests list in
+ _req_mod(, barrier_acked) above.
+ */
+ list_del_init(&b->requests);
+
+ nob = b->next;
+ if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+ _tl_add_barrier(mdev, b);
+ if (nob)
+ mdev->oldest_tle = nob;
+ /* if nob == NULL b was the only barrier, and becomes the new
+ barrier. Therefore mdev->oldest_tle points already to b */
+ } else {
+ D_ASSERT(nob != NULL);
+ mdev->oldest_tle = nob;
+ kfree(b);
+ }
+
+ spin_unlock_irq(&mdev->req_lock);
+ dec_ap_pending(mdev);
+
+ return;
+
+bail:
+ spin_unlock_irq(&mdev->req_lock);
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+}
+
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @mdev: DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_conf *mdev)
+{
+ struct drbd_tl_epoch *b, *tmp;
+ struct list_head *le, *tle;
+ struct drbd_request *r;
+ int new_initial_bnr = net_random();
+
+ spin_lock_irq(&mdev->req_lock);
+
+ b = mdev->oldest_tle;
+ while (b) {
+ list_for_each_safe(le, tle, &b->requests) {
+ r = list_entry(le, struct drbd_request, tl_requests);
+ /* It would be nice to complete outside of spinlock.
+ * But this is easier for now. */
+ _req_mod(r, connection_lost_while_pending);
+ }
+ tmp = b->next;
+
+ /* there could still be requests on that ring list,
+ * in case local io is still pending */
+ list_del(&b->requests);
+
+ /* dec_ap_pending corresponding to queue_barrier.
+ * the newest barrier may not have been queued yet,
+ * in which case w.cb is still NULL. */
+ if (b->w.cb != NULL)
+ dec_ap_pending(mdev);
+
+ if (b == mdev->newest_tle) {
+ /* recycle, but reinit! */
+ D_ASSERT(tmp == NULL);
+ INIT_LIST_HEAD(&b->requests);
+ INIT_LIST_HEAD(&b->w.list);
+ b->w.cb = NULL;
+ b->br_number = new_initial_bnr;
+ b->n_req = 0;
+
+ mdev->oldest_tle = b;
+ break;
+ }
+ kfree(b);
+ b = tmp;
+ }
+
+ /* we expect this list to be empty. */
+ D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+
+ /* but just in case, clean it up anyways! */
+ list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
+ r = list_entry(le, struct drbd_request, tl_requests);
+ /* It would be nice to complete outside of spinlock.
+ * But this is easier for now. */
+ _req_mod(r, connection_lost_while_pending);
+ }
+
+ /* ensure bit indicating barrier is required is clear */
+ clear_bit(CREATE_BARRIER, &mdev->flags);
+
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
+ * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
+ * @mdev: DRBD device.
+ * @os: old (current) state.
+ * @ns: new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_conf *mdev,
+ union drbd_state os, union drbd_state ns)
+{
+ return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+ ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+ (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+ (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+ (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+ (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
+}
+
+int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+ union drbd_state mask, union drbd_state val)
+{
+ unsigned long flags;
+ union drbd_state os, ns;
+ int rv;
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i;
+ rv = _drbd_set_state(mdev, ns, f, NULL);
+ ns = mdev->state;
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ */
+void drbd_force_state(struct drbd_conf *mdev,
+ union drbd_state mask, union drbd_state val)
+{
+ drbd_change_state(mdev, CS_HARD, mask, val);
+}
+
+static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
+static int is_valid_state_transition(struct drbd_conf *,
+ union drbd_state, union drbd_state);
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, int *warn_sync_abort);
+int drbd_send_state_req(struct drbd_conf *,
+ union drbd_state, union drbd_state);
+
+static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
+ union drbd_state mask, union drbd_state val)
+{
+ union drbd_state os, ns;
+ unsigned long flags;
+ int rv;
+
+ if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
+ return SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
+ return SS_CW_FAILED_BY_PEER;
+
+ rv = 0;
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i;
+ ns = sanitize_state(mdev, os, ns, NULL);
+
+ if (!cl_wide_st_chg(mdev, os, ns))
+ rv = SS_CW_NO_NEED;
+ if (!rv) {
+ rv = is_valid_state(mdev, ns);
+ if (rv == SS_SUCCESS) {
+ rv = is_valid_state_transition(mdev, ns, os);
+ if (rv == SS_SUCCESS)
+ rv = 0; /* cont waiting, otherwise fail. */
+ }
+ }
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static int drbd_req_state(struct drbd_conf *mdev,
+ union drbd_state mask, union drbd_state val,
+ enum chg_state_flags f)
+{
+ struct completion done;
+ unsigned long flags;
+ union drbd_state os, ns;
+ int rv;
+
+ init_completion(&done);
+
+ if (f & CS_SERIALIZE)
+ mutex_lock(&mdev->state_mutex);
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i;
+ ns = sanitize_state(mdev, os, ns, NULL);
+
+ if (cl_wide_st_chg(mdev, os, ns)) {
+ rv = is_valid_state(mdev, ns);
+ if (rv == SS_SUCCESS)
+ rv = is_valid_state_transition(mdev, ns, os);
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ goto abort;
+ }
+
+ drbd_state_lock(mdev);
+ if (!drbd_send_state_req(mdev, mask, val)) {
+ drbd_state_unlock(mdev);
+ rv = SS_CW_FAILED_BY_PEER;
+ if (f & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ goto abort;
+ }
+
+ wait_event(mdev->state_wait,
+ (rv = _req_st_cond(mdev, mask, val)));
+
+ if (rv < SS_SUCCESS) {
+ drbd_state_unlock(mdev);
+ if (f & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ goto abort;
+ }
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i;
+ rv = _drbd_set_state(mdev, ns, f, &done);
+ drbd_state_unlock(mdev);
+ } else {
+ rv = _drbd_set_state(mdev, ns, f, &done);
+ }
+
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+ D_ASSERT(current != mdev->worker.task);
+ wait_for_completion(&done);
+ }
+
+abort:
+ if (f & CS_SERIALIZE)
+ mutex_unlock(&mdev->state_mutex);
+
+ return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ int rv;
+
+ wait_event(mdev->state_wait,
+ (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+ return rv;
+}
+
+static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
+{
+ dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
+ name,
+ drbd_conn_str(ns.conn),
+ drbd_role_str(ns.role),
+ drbd_role_str(ns.peer),
+ drbd_disk_str(ns.disk),
+ drbd_disk_str(ns.pdsk),
+ ns.susp ? 's' : 'r',
+ ns.aftr_isp ? 'a' : '-',
+ ns.peer_isp ? 'p' : '-',
+ ns.user_isp ? 'u' : '-'
+ );
+}
+
+void print_st_err(struct drbd_conf *mdev,
+ union drbd_state os, union drbd_state ns, int err)
+{
+ if (err == SS_IN_TRANSIENT_STATE)
+ return;
+ dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
+ print_st(mdev, " state", os);
+ print_st(mdev, "wanted", ns);
+}
+
+
+#define drbd_peer_str drbd_role_str
+#define drbd_pdsk_str drbd_disk_str
+
+#define drbd_susp_str(A) ((A) ? "1" : "0")
+#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
+#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
+#define drbd_user_isp_str(A) ((A) ? "1" : "0")
+
+#define PSC(A) \
+ ({ if (ns.A != os.A) { \
+ pbp += sprintf(pbp, #A "( %s -> %s ) ", \
+ drbd_##A##_str(os.A), \
+ drbd_##A##_str(ns.A)); \
+ } })
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @mdev: DRBD device.
+ * @ns: State to consider.
+ */
+static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
+{
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ enum drbd_fencing_p fp;
+ int rv = SS_SUCCESS;
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(mdev)) {
+ fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ }
+
+ if (get_net_conf(mdev)) {
+ if (!mdev->net_conf->two_primaries &&
+ ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
+ rv = SS_TWO_PRIMARIES;
+ put_net_conf(mdev);
+ }
+
+ if (rv <= 0)
+ /* already found a reason to abort */;
+ else if (ns.role == R_SECONDARY && mdev->open_cnt)
+ rv = SS_DEVICE_IN_USE;
+
+ else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (fp >= FP_RESOURCE &&
+ ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+ rv = SS_PRIMARY_NOP;
+
+ else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+ rv = SS_NO_LOCAL_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+ rv = SS_NO_REMOTE_DISK;
+
+ else if ((ns.conn == C_CONNECTED ||
+ ns.conn == C_WF_BITMAP_S ||
+ ns.conn == C_SYNC_SOURCE ||
+ ns.conn == C_PAUSED_SYNC_S) &&
+ ns.disk == D_OUTDATED)
+ rv = SS_CONNECTED_OUTDATES;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ (mdev->sync_conf.verify_alg[0] == 0))
+ rv = SS_NO_VERIFY_ALG;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ mdev->agreed_pro_version < 88)
+ rv = SS_NOT_SUPPORTED;
+
+ return rv;
+}
+
+/**
+ * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
+ * @mdev: DRBD device.
+ * @ns: new state.
+ * @os: old state.
+ */
+static int is_valid_state_transition(struct drbd_conf *mdev,
+ union drbd_state ns, union drbd_state os)
+{
+ int rv = SS_SUCCESS;
+
+ if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+ os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+ rv = SS_ALREADY_STANDALONE;
+
+ if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+ rv = SS_NO_NET_CONFIG;
+
+ if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+ rv = SS_LOWER_THAN_OUTDATED;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+ rv = SS_IN_TRANSIENT_STATE;
+
+ if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+ rv = SS_IN_TRANSIENT_STATE;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ ns.conn != os.conn && os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ return rv;
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @mdev: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, int *warn_sync_abort)
+{
+ enum drbd_fencing_p fp;
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(mdev)) {
+ fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ }
+
+ /* Disallow Network errors to configure a device's network part */
+ if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
+ os.conn <= C_DISCONNECTING)
+ ns.conn = os.conn;
+
+ /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
+ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
+ ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
+ ns.conn = os.conn;
+
+ /* After C_DISCONNECTING only C_STANDALONE may follow */
+ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
+ ns.conn = os.conn;
+
+ if (ns.conn < C_CONNECTED) {
+ ns.peer_isp = 0;
+ ns.peer = R_UNKNOWN;
+ if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+ ns.pdsk = D_UNKNOWN;
+ }
+
+ /* Clear the aftr_isp when becoming unconfigured */
+ if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+ ns.aftr_isp = 0;
+
+ if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
+ ns.pdsk = D_UNKNOWN;
+
+ /* Abort resync if a disk fails/detaches */
+ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
+ (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+ if (warn_sync_abort)
+ *warn_sync_abort = 1;
+ ns.conn = C_CONNECTED;
+ }
+
+ if (ns.conn >= C_CONNECTED &&
+ ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
+ (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
+ switch (ns.conn) {
+ case C_WF_BITMAP_T:
+ case C_PAUSED_SYNC_T:
+ ns.disk = D_OUTDATED;
+ break;
+ case C_CONNECTED:
+ case C_WF_BITMAP_S:
+ case C_SYNC_SOURCE:
+ case C_PAUSED_SYNC_S:
+ ns.disk = D_UP_TO_DATE;
+ break;
+ case C_SYNC_TARGET:
+ ns.disk = D_INCONSISTENT;
+ dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
+ break;
+ }
+ if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
+ dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
+ }
+
+ if (ns.conn >= C_CONNECTED &&
+ (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
+ switch (ns.conn) {
+ case C_CONNECTED:
+ case C_WF_BITMAP_T:
+ case C_PAUSED_SYNC_T:
+ case C_SYNC_TARGET:
+ ns.pdsk = D_UP_TO_DATE;
+ break;
+ case C_WF_BITMAP_S:
+ case C_PAUSED_SYNC_S:
+ ns.pdsk = D_OUTDATED;
+ break;
+ case C_SYNC_SOURCE:
+ ns.pdsk = D_INCONSISTENT;
+ dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
+ break;
+ }
+ if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
+ dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
+ }
+
+ /* Connection breaks down before we finished "Negotiating" */
+ if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+ get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
+ ns.disk = mdev->new_state_tmp.disk;
+ ns.pdsk = mdev->new_state_tmp.pdsk;
+ } else {
+ dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+ ns.disk = D_DISKLESS;
+ ns.pdsk = D_UNKNOWN;
+ }
+ put_ldev(mdev);
+ }
+
+ if (fp == FP_STONITH &&
+ (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+ !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+ ns.susp = 1;
+
+ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+ if (ns.conn == C_SYNC_SOURCE)
+ ns.conn = C_PAUSED_SYNC_S;
+ if (ns.conn == C_SYNC_TARGET)
+ ns.conn = C_PAUSED_SYNC_T;
+ } else {
+ if (ns.conn == C_PAUSED_SYNC_S)
+ ns.conn = C_SYNC_SOURCE;
+ if (ns.conn == C_PAUSED_SYNC_T)
+ ns.conn = C_SYNC_TARGET;
+ }
+
+ return ns;
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
+{
+ if (cs == C_VERIFY_T) {
+ /* starting online verify from an arbitrary position
+ * does not fit well into the existing protocol.
+ * on C_VERIFY_T, we initialize ov_left and friends
+ * implicitly in receive_DataRequest once the
+ * first P_OV_REQUEST is received */
+ mdev->ov_start_sector = ~(sector_t)0;
+ } else {
+ unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
+ if (bit >= mdev->rs_total)
+ mdev->ov_start_sector =
+ BM_BIT_TO_SECT(mdev->rs_total - 1);
+ mdev->ov_position = mdev->ov_start_sector;
+ }
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @mdev: DRBD device.
+ * @ns: new state.
+ * @flags: Flags
+ * @done: Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+int __drbd_set_state(struct drbd_conf *mdev,
+ union drbd_state ns, enum chg_state_flags flags,
+ struct completion *done)
+{
+ union drbd_state os;
+ int rv = SS_SUCCESS;
+ int warn_sync_abort = 0;
+ struct after_state_chg_work *ascw;
+
+ os = mdev->state;
+
+ ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+
+ if (ns.i == os.i)
+ return SS_NOTHING_TO_DO;
+
+ if (!(flags & CS_HARD)) {
+ /* pre-state-change checks ; only look at ns */
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ rv = is_valid_state(mdev, ns);
+ if (rv < SS_SUCCESS) {
+ /* If the old state was illegal as well, then let
+ this happen...*/
+
+ if (is_valid_state(mdev, os) == rv) {
+ dev_err(DEV, "Considering state change from bad state. "
+ "Error would be: '%s'\n",
+ drbd_set_st_err_str(rv));
+ print_st(mdev, "old", os);
+ print_st(mdev, "new", ns);
+ rv = is_valid_state_transition(mdev, ns, os);
+ }
+ } else
+ rv = is_valid_state_transition(mdev, ns, os);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ return rv;
+ }
+
+ if (warn_sync_abort)
+ dev_warn(DEV, "Resync aborted.\n");
+
+ {
+ char *pbp, pb[300];
+ pbp = pb;
+ *pbp = 0;
+ PSC(role);
+ PSC(peer);
+ PSC(conn);
+ PSC(disk);
+ PSC(pdsk);
+ PSC(susp);
+ PSC(aftr_isp);
+ PSC(peer_isp);
+ PSC(user_isp);
+ dev_info(DEV, "%s\n", pb);
+ }
+
+ /* solve the race between becoming unconfigured,
+ * worker doing the cleanup, and
+ * admin reconfiguring us:
+ * on (re)configure, first set CONFIG_PENDING,
+ * then wait for a potentially exiting worker,
+ * start the worker, and schedule one no_op.
+ * then proceed with configuration.
+ */
+ if (ns.disk == D_DISKLESS &&
+ ns.conn == C_STANDALONE &&
+ ns.role == R_SECONDARY &&
+ !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
+ set_bit(DEVICE_DYING, &mdev->flags);
+
+ mdev->state.i = ns.i;
+ wake_up(&mdev->misc_wait);
+ wake_up(&mdev->state_wait);
+
+ /* post-state-change actions */
+ if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
+ set_bit(STOP_SYNC_TIMER, &mdev->flags);
+ mod_timer(&mdev->resync_timer, jiffies);
+ }
+
+ /* aborted verify run. log the last position */
+ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+ ns.conn < C_CONNECTED) {
+ mdev->ov_start_sector =
+ BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
+ dev_info(DEV, "Online Verify reached sector %llu\n",
+ (unsigned long long)mdev->ov_start_sector);
+ }
+
+ if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
+ dev_info(DEV, "Syncer continues.\n");
+ mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
+ if (ns.conn == C_SYNC_TARGET) {
+ if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
+ mod_timer(&mdev->resync_timer, jiffies);
+ /* This if (!test_bit) is only needed for the case
+ that a device that has ceased to used its timer,
+ i.e. it is already in drbd_resync_finished() gets
+ paused and resumed. */
+ }
+ }
+
+ if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
+ (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+ dev_info(DEV, "Resync suspended\n");
+ mdev->rs_mark_time = jiffies;
+ if (ns.conn == C_PAUSED_SYNC_T)
+ set_bit(STOP_SYNC_TIMER, &mdev->flags);
+ }
+
+ if (os.conn == C_CONNECTED &&
+ (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ mdev->ov_position = 0;
+ mdev->rs_total =
+ mdev->rs_mark_left = drbd_bm_bits(mdev);
+ if (mdev->agreed_pro_version >= 90)
+ set_ov_position(mdev, ns.conn);
+ else
+ mdev->ov_start_sector = 0;
+ mdev->ov_left = mdev->rs_total
+ - BM_SECT_TO_BIT(mdev->ov_position);
+ mdev->rs_start =
+ mdev->rs_mark_time = jiffies;
+ mdev->ov_last_oos_size = 0;
+ mdev->ov_last_oos_start = 0;
+
+ if (ns.conn == C_VERIFY_S) {
+ dev_info(DEV, "Starting Online Verify from sector %llu\n",
+ (unsigned long long)mdev->ov_position);
+ mod_timer(&mdev->resync_timer, jiffies);
+ }
+ }
+
+ if (get_ldev(mdev)) {
+ u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+ MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+ MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+ if (test_bit(CRASHED_PRIMARY, &mdev->flags))
+ mdf |= MDF_CRASHED_PRIMARY;
+ if (mdev->state.role == R_PRIMARY ||
+ (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
+ mdf |= MDF_PRIMARY_IND;
+ if (mdev->state.conn > C_WF_REPORT_PARAMS)
+ mdf |= MDF_CONNECTED_IND;
+ if (mdev->state.disk > D_INCONSISTENT)
+ mdf |= MDF_CONSISTENT;
+ if (mdev->state.disk > D_OUTDATED)
+ mdf |= MDF_WAS_UP_TO_DATE;
+ if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
+ mdf |= MDF_PEER_OUT_DATED;
+ if (mdf != mdev->ldev->md.flags) {
+ mdev->ldev->md.flags = mdf;
+ drbd_md_mark_dirty(mdev);
+ }
+ if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+ drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
+ put_ldev(mdev);
+ }
+
+ /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+ if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+ os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+ set_bit(CONSIDER_RESYNC, &mdev->flags);
+
+ /* Receiver should clean up itself */
+ if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+ drbd_thread_stop_nowait(&mdev->receiver);
+
+ /* Now the receiver finished cleaning up itself, it should die */
+ if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+ drbd_thread_stop_nowait(&mdev->receiver);
+
+ /* Upon network failure, we need to restart the receiver. */
+ if (os.conn > C_TEAR_DOWN &&
+ ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+ drbd_thread_restart_nowait(&mdev->receiver);
+
+ ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+ if (ascw) {
+ ascw->os = os;
+ ascw->ns = ns;
+ ascw->flags = flags;
+ ascw->w.cb = w_after_state_ch;
+ ascw->done = done;
+ drbd_queue_work(&mdev->data.work, &ascw->w);
+ } else {
+ dev_warn(DEV, "Could not kmalloc an ascw\n");
+ }
+
+ return rv;
+}
+
+static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct after_state_chg_work *ascw =
+ container_of(w, struct after_state_chg_work, w);
+ after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
+ if (ascw->flags & CS_WAIT_COMPLETE) {
+ D_ASSERT(ascw->done != NULL);
+ complete(ascw->done);
+ }
+ kfree(ascw);
+
+ return 1;
+}
+
+static void abw_start_sync(struct drbd_conf *mdev, int rv)
+{
+ if (rv) {
+ dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
+ _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
+ return;
+ }
+
+ switch (mdev->state.conn) {
+ case C_STARTING_SYNC_T:
+ _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ break;
+ case C_STARTING_SYNC_S:
+ drbd_start_resync(mdev, C_SYNC_SOURCE);
+ break;
+ }
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @mdev: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @flags: Flags
+ */
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags)
+{
+ enum drbd_fencing_p fp;
+
+ if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+ clear_bit(CRASHED_PRIMARY, &mdev->flags);
+ if (mdev->p_uuid)
+ mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
+ }
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(mdev)) {
+ fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ }
+
+ /* Inform userspace about the change... */
+ drbd_bcast_state(mdev, ns);
+
+ if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+ drbd_khelper(mdev, "pri-on-incon-degr");
+
+ /* Here we have the actions that are performed after a
+ state change. This function might sleep */
+
+ if (fp == FP_STONITH && ns.susp) {
+ /* case1: The outdate peer handler is successful:
+ * case2: The connection was established again: */
+ if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
+ (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
+ tl_clear(mdev);
+ spin_lock_irq(&mdev->req_lock);
+ _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+ spin_unlock_irq(&mdev->req_lock);
+ }
+ }
+ /* Do not change the order of the if above and the two below... */
+ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev);
+ }
+ if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
+ drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
+
+ /* Lost contact to peer's copy of the data */
+ if ((os.pdsk >= D_INCONSISTENT &&
+ os.pdsk != D_UNKNOWN &&
+ os.pdsk != D_OUTDATED)
+ && (ns.pdsk < D_INCONSISTENT ||
+ ns.pdsk == D_UNKNOWN ||
+ ns.pdsk == D_OUTDATED)) {
+ kfree(mdev->p_uuid);
+ mdev->p_uuid = NULL;
+ if (get_ldev(mdev)) {
+ if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+ mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
+ put_ldev(mdev);
+ }
+ }
+
+ if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
+ if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+ drbd_uuid_new_current(mdev);
+
+ /* D_DISKLESS Peer becomes secondary */
+ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+ drbd_al_to_on_disk_bm(mdev);
+ put_ldev(mdev);
+ }
+
+ /* Last part of the attaching process ... */
+ if (ns.conn >= C_CONNECTED &&
+ os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+ kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
+ mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
+ drbd_send_sizes(mdev, 0); /* to start sync... */
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev);
+ }
+
+ /* We want to pause/continue resync, tell peer. */
+ if (ns.conn >= C_CONNECTED &&
+ ((os.aftr_isp != ns.aftr_isp) ||
+ (os.user_isp != ns.user_isp)))
+ drbd_send_state(mdev);
+
+ /* In case one of the isp bits got set, suspend other devices. */
+ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+ (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+ suspend_other_sg(mdev);
+
+ /* Make sure the peer gets informed about eventual state
+ changes (ISP bits) while we were in WFReportParams. */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev);
+
+ /* We are in the progress to start a full sync... */
+ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+ drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
+
+ /* We are invalidating our self... */
+ if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
+ os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
+ drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
+
+ if (os.disk > D_FAILED && ns.disk == D_FAILED) {
+ enum drbd_io_error_p eh;
+
+ eh = EP_PASS_ON;
+ if (get_ldev_if_state(mdev, D_FAILED)) {
+ eh = mdev->ldev->dc.on_io_error;
+ put_ldev(mdev);
+ }
+
+ drbd_rs_cancel_all(mdev);
+ /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
+ and it is D_DISKLESS here, local_cnt can only go down, it can
+ not increase... It will reach zero */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
+
+ spin_lock_irq(&mdev->req_lock);
+ _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (eh == EP_CALL_HELPER)
+ drbd_khelper(mdev, "local-io-error");
+ }
+
+ if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
+
+ if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
+ if (drbd_send_state(mdev))
+ dev_warn(DEV, "Notified peer that my disk is broken.\n");
+ else
+ dev_err(DEV, "Sending state in drbd_io_error() failed\n");
+ }
+
+ lc_destroy(mdev->resync);
+ mdev->resync = NULL;
+ lc_destroy(mdev->act_log);
+ mdev->act_log = NULL;
+ __no_warn(local,
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;);
+
+ if (mdev->md_io_tmpp)
+ __free_page(mdev->md_io_tmpp);
+ }
+
+ /* Disks got bigger while they were detached */
+ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+ test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
+ if (ns.conn == C_CONNECTED)
+ resync_after_online_grow(mdev);
+ }
+
+ /* A resync finished or aborted, wake paused devices... */
+ if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+ (os.peer_isp && !ns.peer_isp) ||
+ (os.user_isp && !ns.user_isp))
+ resume_next_sg(mdev);
+
+ /* Upon network connection, we need to start the receiver */
+ if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
+ drbd_thread_start(&mdev->receiver);
+
+ /* Terminate worker thread if we are unconfigured - it will be
+ restarted as needed... */
+ if (ns.disk == D_DISKLESS &&
+ ns.conn == C_STANDALONE &&
+ ns.role == R_SECONDARY) {
+ if (os.aftr_isp != ns.aftr_isp)
+ resume_next_sg(mdev);
+ /* set in __drbd_set_state, unless CONFIG_PENDING was set */
+ if (test_bit(DEVICE_DYING, &mdev->flags))
+ drbd_thread_stop_nowait(&mdev->worker);
+ }
+
+ drbd_md_sync(mdev);
+}
+
+
+static int drbd_thread_setup(void *arg)
+{
+ struct drbd_thread *thi = (struct drbd_thread *) arg;
+ struct drbd_conf *mdev = thi->mdev;
+ unsigned long flags;
+ int retval;
+
+restart:
+ retval = thi->function(thi);
+
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ /* if the receiver has been "Exiting", the last thing it did
+ * was set the conn state to "StandAlone",
+ * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+ * and receiver thread will be "started".
+ * drbd_thread_start needs to set "Restarting" in that case.
+ * t_state check and assignment needs to be within the same spinlock,
+ * so either thread_start sees Exiting, and can remap to Restarting,
+ * or thread_start see None, and can proceed as normal.
+ */
+
+ if (thi->t_state == Restarting) {
+ dev_info(DEV, "Restarting %s\n", current->comm);
+ thi->t_state = Running;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ goto restart;
+ }
+
+ thi->task = NULL;
+ thi->t_state = None;
+ smp_mb();
+ complete(&thi->stop);
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+
+ dev_info(DEV, "Terminating %s\n", current->comm);
+
+ /* Release mod reference taken when thread was started */
+ module_put(THIS_MODULE);
+ return retval;
+}
+
+static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
+ int (*func) (struct drbd_thread *))
+{
+ spin_lock_init(&thi->t_lock);
+ thi->task = NULL;
+ thi->t_state = None;
+ thi->function = func;
+ thi->mdev = mdev;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+ struct drbd_conf *mdev = thi->mdev;
+ struct task_struct *nt;
+ unsigned long flags;
+
+ const char *me =
+ thi == &mdev->receiver ? "receiver" :
+ thi == &mdev->asender ? "asender" :
+ thi == &mdev->worker ? "worker" : "NONSENSE";
+
+ /* is used from state engine doing drbd_thread_stop_nowait,
+ * while holding the req lock irqsave */
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ switch (thi->t_state) {
+ case None:
+ dev_info(DEV, "Starting %s thread (from %s [%d])\n",
+ me, current->comm, current->pid);
+
+ /* Get ref on module for thread - this is released when thread exits */
+ if (!try_module_get(THIS_MODULE)) {
+ dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ return FALSE;
+ }
+
+ init_completion(&thi->stop);
+ D_ASSERT(thi->task == NULL);
+ thi->reset_cpu_mask = 1;
+ thi->t_state = Running;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+ nt = kthread_create(drbd_thread_setup, (void *) thi,
+ "drbd%d_%s", mdev_to_minor(mdev), me);
+
+ if (IS_ERR(nt)) {
+ dev_err(DEV, "Couldn't start thread\n");
+
+ module_put(THIS_MODULE);
+ return FALSE;
+ }
+ spin_lock_irqsave(&thi->t_lock, flags);
+ thi->task = nt;
+ thi->t_state = Running;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ wake_up_process(nt);
+ break;
+ case Exiting:
+ thi->t_state = Restarting;
+ dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
+ me, current->comm, current->pid);
+ /* fall through */
+ case Running:
+ case Restarting:
+ default:
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ break;
+ }
+
+ return TRUE;
+}
+
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+ unsigned long flags;
+
+ enum drbd_thread_state ns = restart ? Restarting : Exiting;
+
+ /* may be called from state engine, holding the req lock irqsave */
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ if (thi->t_state == None) {
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ if (restart)
+ drbd_thread_start(thi);
+ return;
+ }
+
+ if (thi->t_state != ns) {
+ if (thi->task == NULL) {
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ return;
+ }
+
+ thi->t_state = ns;
+ smp_mb();
+ init_completion(&thi->stop);
+ if (thi->task != current)
+ force_sig(DRBD_SIGKILL, thi->task);
+
+ }
+
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+
+ if (wait)
+ wait_for_completion(&thi->stop);
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ * @mdev: DRBD device.
+ *
+ * Forces all threads of a device onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+{
+ int ord, cpu;
+
+ /* user override. */
+ if (cpumask_weight(mdev->cpu_mask))
+ return;
+
+ ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+ for_each_online_cpu(cpu) {
+ if (ord-- == 0) {
+ cpumask_set_cpu(cpu, mdev->cpu_mask);
+ return;
+ }
+ }
+ /* should not be reached */
+ cpumask_setall(mdev->cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @mdev: DRBD device.
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+{
+ struct task_struct *p = current;
+ struct drbd_thread *thi =
+ p == mdev->asender.task ? &mdev->asender :
+ p == mdev->receiver.task ? &mdev->receiver :
+ p == mdev->worker.task ? &mdev->worker :
+ NULL;
+ ERR_IF(thi == NULL)
+ return;
+ if (!thi->reset_cpu_mask)
+ return;
+ thi->reset_cpu_mask = 0;
+ set_cpus_allowed_ptr(p, mdev->cpu_mask);
+}
+#endif
+
+/* the appropriate socket mutex must be held already */
+int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+ enum drbd_packets cmd, struct p_header *h,
+ size_t size, unsigned msg_flags)
+{
+ int sent, ok;
+
+ ERR_IF(!h) return FALSE;
+ ERR_IF(!size) return FALSE;
+
+ h->magic = BE_DRBD_MAGIC;
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be16(size-sizeof(struct p_header));
+
+ sent = drbd_send(mdev, sock, h, size, msg_flags);
+
+ ok = (sent == size);
+ if (!ok)
+ dev_err(DEV, "short sent %s size=%d sent=%d\n",
+ cmdname(cmd), (int)size, sent);
+ return ok;
+}
+
+/* don't pass the socket. we may only look at it
+ * when we hold the appropriate socket mutex.
+ */
+int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+ enum drbd_packets cmd, struct p_header *h, size_t size)
+{
+ int ok = 0;
+ struct socket *sock;
+
+ if (use_data_socket) {
+ mutex_lock(&mdev->data.mutex);
+ sock = mdev->data.socket;
+ } else {
+ mutex_lock(&mdev->meta.mutex);
+ sock = mdev->meta.socket;
+ }
+
+ /* drbd_disconnect() could have called drbd_free_sock()
+ * while we were waiting in down()... */
+ if (likely(sock != NULL))
+ ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+
+ if (use_data_socket)
+ mutex_unlock(&mdev->data.mutex);
+ else
+ mutex_unlock(&mdev->meta.mutex);
+ return ok;
+}
+
+int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
+ size_t size)
+{
+ struct p_header h;
+ int ok;
+
+ h.magic = BE_DRBD_MAGIC;
+ h.command = cpu_to_be16(cmd);
+ h.length = cpu_to_be16(size);
+
+ if (!drbd_get_data_sock(mdev))
+ return 0;
+
+ ok = (sizeof(h) ==
+ drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
+ ok = ok && (size ==
+ drbd_send(mdev, mdev->data.socket, data, size, 0));
+
+ drbd_put_data_sock(mdev);
+
+ return ok;
+}
+
+int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+{
+ struct p_rs_param_89 *p;
+ struct socket *sock;
+ int size, rv;
+ const int apv = mdev->agreed_pro_version;
+
+ size = apv <= 87 ? sizeof(struct p_rs_param)
+ : apv == 88 ? sizeof(struct p_rs_param)
+ + strlen(mdev->sync_conf.verify_alg) + 1
+ : /* 89 */ sizeof(struct p_rs_param_89);
+
+ /* used from admin command context and receiver/worker context.
+ * to avoid kmalloc, grab the socket right here,
+ * then use the pre-allocated sbuf there */
+ mutex_lock(&mdev->data.mutex);
+ sock = mdev->data.socket;
+
+ if (likely(sock != NULL)) {
+ enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+ p = &mdev->data.sbuf.rs_param_89;
+
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+ p->rate = cpu_to_be32(sc->rate);
+
+ if (apv >= 88)
+ strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
+ if (apv >= 89)
+ strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
+
+ rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
+ } else
+ rv = 0; /* not ok */
+
+ mutex_unlock(&mdev->data.mutex);
+
+ return rv;
+}
+
+int drbd_send_protocol(struct drbd_conf *mdev)
+{
+ struct p_protocol *p;
+ int size, rv;
+
+ size = sizeof(struct p_protocol);
+
+ if (mdev->agreed_pro_version >= 87)
+ size += strlen(mdev->net_conf->integrity_alg) + 1;
+
+ /* we must not recurse into our own queue,
+ * as that is blocked during handshake */
+ p = kmalloc(size, GFP_NOIO);
+ if (p == NULL)
+ return 0;
+
+ p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
+ p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
+ p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
+ p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
+ p->want_lose = cpu_to_be32(mdev->net_conf->want_lose);
+ p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+
+ if (mdev->agreed_pro_version >= 87)
+ strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+
+ rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
+ (struct p_header *)p, size);
+ kfree(p);
+ return rv;
+}
+
+int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
+{
+ struct p_uuids p;
+ int i;
+
+ if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+ return 1;
+
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+
+ mdev->comm_bm_set = drbd_bm_total_weight(mdev);
+ p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+ uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
+ uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
+ uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+ p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+ put_ldev(mdev);
+
+ return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
+ (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_uuids(struct drbd_conf *mdev)
+{
+ return _drbd_send_uuids(mdev, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
+{
+ return _drbd_send_uuids(mdev, 8);
+}
+
+
+int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
+{
+ struct p_rs_uuid p;
+
+ p.uuid = cpu_to_be64(val);
+
+ return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
+ (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
+{
+ struct p_sizes p;
+ sector_t d_size, u_size;
+ int q_order_type;
+ int ok;
+
+ if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ D_ASSERT(mdev->ldev->backing_bdev);
+ d_size = drbd_get_max_capacity(mdev->ldev);
+ u_size = mdev->ldev->dc.disk_size;
+ q_order_type = drbd_queue_order_type(mdev);
+ p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
+ put_ldev(mdev);
+ } else {
+ d_size = 0;
+ u_size = 0;
+ q_order_type = QUEUE_ORDERED_NONE;
+ }
+
+ p.d_size = cpu_to_be64(d_size);
+ p.u_size = cpu_to_be64(u_size);
+ p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+ p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
+ p.queue_order_type = cpu_to_be32(q_order_type);
+
+ ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
+ (struct p_header *)&p, sizeof(p));
+ return ok;
+}
+
+/**
+ * drbd_send_state() - Sends the drbd state to the peer
+ * @mdev: DRBD device.
+ */
+int drbd_send_state(struct drbd_conf *mdev)
+{
+ struct socket *sock;
+ struct p_state p;
+ int ok = 0;
+
+ /* Grab state lock so we wont send state if we're in the middle
+ * of a cluster wide state change on another thread */
+ drbd_state_lock(mdev);
+
+ mutex_lock(&mdev->data.mutex);
+
+ p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+ sock = mdev->data.socket;
+
+ if (likely(sock != NULL)) {
+ ok = _drbd_send_cmd(mdev, sock, P_STATE,
+ (struct p_header *)&p, sizeof(p), 0);
+ }
+
+ mutex_unlock(&mdev->data.mutex);
+
+ drbd_state_unlock(mdev);
+ return ok;
+}
+
+int drbd_send_state_req(struct drbd_conf *mdev,
+ union drbd_state mask, union drbd_state val)
+{
+ struct p_req_state p;
+
+ p.mask = cpu_to_be32(mask.i);
+ p.val = cpu_to_be32(val.i);
+
+ return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
+ (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
+{
+ struct p_req_state_reply p;
+
+ p.retcode = cpu_to_be32(retcode);
+
+ return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
+ (struct p_header *)&p, sizeof(p));
+}
+
+int fill_bitmap_rle_bits(struct drbd_conf *mdev,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c)
+{
+ struct bitstream bs;
+ unsigned long plain_bits;
+ unsigned long tmp;
+ unsigned long rl;
+ unsigned len;
+ unsigned toggle;
+ int bits;
+
+ /* may we use this feature? */
+ if ((mdev->sync_conf.use_rle == 0) ||
+ (mdev->agreed_pro_version < 90))
+ return 0;
+
+ if (c->bit_offset >= c->bm_bits)
+ return 0; /* nothing to do. */
+
+ /* use at most thus many bytes */
+ bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
+ memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+ /* plain bits covered in this code string */
+ plain_bits = 0;
+
+ /* p->encoding & 0x80 stores whether the first run length is set.
+ * bit offset is implicit.
+ * start with toggle == 2 to be able to tell the first iteration */
+ toggle = 2;
+
+ /* see how much plain bits we can stuff into one packet
+ * using RLE and VLI. */
+ do {
+ tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
+ : _drbd_bm_find_next(mdev, c->bit_offset);
+ if (tmp == -1UL)
+ tmp = c->bm_bits;
+ rl = tmp - c->bit_offset;
+
+ if (toggle == 2) { /* first iteration */
+ if (rl == 0) {
+ /* the first checked bit was set,
+ * store start value, */
+ DCBP_set_start(p, 1);
+ /* but skip encoding of zero run length */
+ toggle = !toggle;
+ continue;
+ }
+ DCBP_set_start(p, 0);
+ }
+
+ /* paranoia: catch zero runlength.
+ * can only happen if bitmap is modified while we scan it. */
+ if (rl == 0) {
+ dev_err(DEV, "unexpected zero runlength while encoding bitmap "
+ "t:%u bo:%lu\n", toggle, c->bit_offset);
+ return -1;
+ }
+
+ bits = vli_encode_bits(&bs, rl);
+ if (bits == -ENOBUFS) /* buffer full */
+ break;
+ if (bits <= 0) {
+ dev_err(DEV, "error while encoding bitmap: %d\n", bits);
+ return 0;
+ }
+
+ toggle = !toggle;
+ plain_bits += rl;
+ c->bit_offset = tmp;
+ } while (c->bit_offset < c->bm_bits);
+
+ len = bs.cur.b - p->code + !!bs.cur.bit;
+
+ if (plain_bits < (len << 3)) {
+ /* incompressible with this method.
+ * we need to rewind both word and bit position. */
+ c->bit_offset -= plain_bits;
+ bm_xfer_ctx_bit_to_word_offset(c);
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+ return 0;
+ }
+
+ /* RLE + VLI was able to compress it just fine.
+ * update c->word_offset. */
+ bm_xfer_ctx_bit_to_word_offset(c);
+
+ /* store pad_bits */
+ DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+ return len;
+}
+
+enum { OK, FAILED, DONE }
+send_bitmap_rle_or_plain(struct drbd_conf *mdev,
+ struct p_header *h, struct bm_xfer_ctx *c)
+{
+ struct p_compressed_bm *p = (void*)h;
+ unsigned long num_words;
+ int len;
+ int ok;
+
+ len = fill_bitmap_rle_bits(mdev, p, c);
+
+ if (len < 0)
+ return FAILED;
+
+ if (len) {
+ DCBP_set_code(p, RLE_VLI_Bits);
+ ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
+ sizeof(*p) + len, 0);
+
+ c->packets[0]++;
+ c->bytes[0] += sizeof(*p) + len;
+
+ if (c->bit_offset >= c->bm_bits)
+ len = 0; /* DONE */
+ } else {
+ /* was not compressible.
+ * send a buffer full of plain text bits instead. */
+ num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+ len = num_words * sizeof(long);
+ if (len)
+ drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
+ ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
+ h, sizeof(struct p_header) + len, 0);
+ c->word_offset += num_words;
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+ c->packets[1]++;
+ c->bytes[1] += sizeof(struct p_header) + len;
+
+ if (c->bit_offset > c->bm_bits)
+ c->bit_offset = c->bm_bits;
+ }
+ ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
+
+ if (ok == DONE)
+ INFO_bm_xfer_stats(mdev, "send", c);
+ return ok;
+}
+
+/* See the comment at receive_bitmap() */
+int _drbd_send_bitmap(struct drbd_conf *mdev)
+{
+ struct bm_xfer_ctx c;
+ struct p_header *p;
+ int ret;
+
+ ERR_IF(!mdev->bitmap) return FALSE;
+
+ /* maybe we should use some per thread scratch page,
+ * and allocate that during initial device creation? */
+ p = (struct p_header *) __get_free_page(GFP_NOIO);
+ if (!p) {
+ dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+ return FALSE;
+ }
+
+ if (get_ldev(mdev)) {
+ if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+ dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
+ drbd_bm_set_all(mdev);
+ if (drbd_bm_write(mdev)) {
+ /* write_bm did fail! Leave full sync flag set in Meta P_DATA
+ * but otherwise process as per normal - need to tell other
+ * side that a full resync is required! */
+ dev_err(DEV, "Failed to write bitmap to disk!\n");
+ } else {
+ drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+ drbd_md_sync(mdev);
+ }
+ }
+ put_ldev(mdev);
+ }
+
+ c = (struct bm_xfer_ctx) {
+ .bm_bits = drbd_bm_bits(mdev),
+ .bm_words = drbd_bm_words(mdev),
+ };
+
+ do {
+ ret = send_bitmap_rle_or_plain(mdev, p, &c);
+ } while (ret == OK);
+
+ free_page((unsigned long) p);
+ return (ret == DONE);
+}
+
+int drbd_send_bitmap(struct drbd_conf *mdev)
+{
+ int err;
+
+ if (!drbd_get_data_sock(mdev))
+ return -1;
+ err = !_drbd_send_bitmap(mdev);
+ drbd_put_data_sock(mdev);
+ return err;
+}
+
+int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+{
+ int ok;
+ struct p_barrier_ack p;
+
+ p.barrier = barrier_nr;
+ p.set_size = cpu_to_be32(set_size);
+
+ if (mdev->state.conn < C_CONNECTED)
+ return FALSE;
+ ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
+ (struct p_header *)&p, sizeof(p));
+ return ok;
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @mdev: DRBD device.
+ * @cmd: Packet command code.
+ * @sector: sector, needs to be in big endian byte order
+ * @blksize: size in byte, needs to be in big endian byte order
+ * @block_id: Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+ u64 sector,
+ u32 blksize,
+ u64 block_id)
+{
+ int ok;
+ struct p_block_ack p;
+
+ p.sector = sector;
+ p.block_id = block_id;
+ p.blksize = blksize;
+ p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+
+ if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
+ return FALSE;
+ ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
+ (struct p_header *)&p, sizeof(p));
+ return ok;
+}
+
+int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct p_data *dp)
+{
+ const int header_size = sizeof(struct p_data)
+ - sizeof(struct p_header);
+ int data_size = ((struct p_header *)dp)->length - header_size;
+
+ return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+ dp->block_id);
+}
+
+int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct p_block_req *rp)
+{
+ return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @mdev: DRBD device.
+ * @cmd: Packet command code.
+ * @e: Epoch entry.
+ */
+int drbd_send_ack(struct drbd_conf *mdev,
+ enum drbd_packets cmd, struct drbd_epoch_entry *e)
+{
+ return _drbd_send_ack(mdev, cmd,
+ cpu_to_be64(e->sector),
+ cpu_to_be32(e->size),
+ e->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are is sync or not. */
+int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+ sector_t sector, int blksize, u64 block_id)
+{
+ return _drbd_send_ack(mdev, cmd,
+ cpu_to_be64(sector),
+ cpu_to_be32(blksize),
+ cpu_to_be64(block_id));
+}
+
+int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+ sector_t sector, int size, u64 block_id)
+{
+ int ok;
+ struct p_block_req p;
+
+ p.sector = cpu_to_be64(sector);
+ p.block_id = block_id;
+ p.blksize = cpu_to_be32(size);
+
+ ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
+ (struct p_header *)&p, sizeof(p));
+ return ok;
+}
+
+int drbd_send_drequest_csum(struct drbd_conf *mdev,
+ sector_t sector, int size,
+ void *digest, int digest_size,
+ enum drbd_packets cmd)
+{
+ int ok;
+ struct p_block_req p;
+
+ p.sector = cpu_to_be64(sector);
+ p.block_id = BE_DRBD_MAGIC + 0xbeef;
+ p.blksize = cpu_to_be32(size);
+
+ p.head.magic = BE_DRBD_MAGIC;
+ p.head.command = cpu_to_be16(cmd);
+ p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
+
+ mutex_lock(&mdev->data.mutex);
+
+ ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
+ ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
+
+ mutex_unlock(&mdev->data.mutex);
+
+ return ok;
+}
+
+int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
+{
+ int ok;
+ struct p_block_req p;
+
+ p.sector = cpu_to_be64(sector);
+ p.block_id = BE_DRBD_MAGIC + 0xbabe;
+ p.blksize = cpu_to_be32(size);
+
+ ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
+ (struct p_header *)&p, sizeof(p));
+ return ok;
+}
+
+/* called on sndtimeo
+ * returns FALSE if we should retry,
+ * TRUE if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+{
+ int drop_it;
+ /* long elapsed = (long)(jiffies - mdev->last_received); */
+
+ drop_it = mdev->meta.socket == sock
+ || !mdev->asender.task
+ || get_t_state(&mdev->asender) != Running
+ || mdev->state.conn < C_CONNECTED;
+
+ if (drop_it)
+ return TRUE;
+
+ drop_it = !--mdev->ko_count;
+ if (!drop_it) {
+ dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+ current->comm, current->pid, mdev->ko_count);
+ request_ping(mdev);
+ }
+
+ return drop_it; /* && (mdev->state == R_PRIMARY) */;
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ * reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) to early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
+ int offset, size_t size)
+{
+ int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
+ kunmap(page);
+ if (sent == size)
+ mdev->send_cnt += size>>9;
+ return sent == size;
+}
+
+static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
+ int offset, size_t size)
+{
+ mm_segment_t oldfs = get_fs();
+ int sent, ok;
+ int len = size;
+
+ /* e.g. XFS meta- & log-data is in slab pages, which have a
+ * page_count of 0 and/or have PageSlab() set.
+ * we cannot use send_page for those, as that does get_page();
+ * put_page(); and would cause either a VM_BUG directly, or
+ * __page_cache_release a page that would actually still be referenced
+ * by someone, leading to some obscure delayed Oops somewhere else. */
+ if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+ return _drbd_no_send_page(mdev, page, offset, size);
+
+ drbd_update_congested(mdev);
+ set_fs(KERNEL_DS);
+ do {
+ sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
+ offset, len,
+ MSG_NOSIGNAL);
+ if (sent == -EAGAIN) {
+ if (we_should_drop_the_connection(mdev,
+ mdev->data.socket))
+ break;
+ else
+ continue;
+ }
+ if (sent <= 0) {
+ dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
+ __func__, (int)size, len, sent);
+ break;
+ }
+ len -= sent;
+ offset += sent;
+ } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
+ set_fs(oldfs);
+ clear_bit(NET_CONGESTED, &mdev->flags);
+
+ ok = (len == 0);
+ if (likely(ok))
+ mdev->send_cnt += size>>9;
+ return ok;
+}
+
+static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ if (!_drbd_no_send_page(mdev, bvec->bv_page,
+ bvec->bv_offset, bvec->bv_len))
+ return 0;
+ }
+ return 1;
+}
+
+static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ if (!_drbd_send_page(mdev, bvec->bv_page,
+ bvec->bv_offset, bvec->bv_len))
+ return 0;
+ }
+
+ return 1;
+}
+
+/* Used to send write requests
+ * R_PRIMARY -> Peer (P_DATA)
+ */
+int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
+{
+ int ok = 1;
+ struct p_data p;
+ unsigned int dp_flags = 0;
+ void *dgb;
+ int dgs;
+
+ if (!drbd_get_data_sock(mdev))
+ return 0;
+
+ dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+ crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+ p.head.magic = BE_DRBD_MAGIC;
+ p.head.command = cpu_to_be16(P_DATA);
+ p.head.length =
+ cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
+
+ p.sector = cpu_to_be64(req->sector);
+ p.block_id = (unsigned long)req;
+ p.seq_num = cpu_to_be32(req->seq_num =
+ atomic_add_return(1, &mdev->packet_seq));
+ dp_flags = 0;
+
+ /* NOTE: no need to check if barriers supported here as we would
+ * not pass the test in make_request_common in that case
+ */
+ if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
+ dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
+ /* dp_flags |= DP_HARDBARRIER; */
+ }
+ if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
+ dp_flags |= DP_RW_SYNC;
+ /* for now handle SYNCIO and UNPLUG
+ * as if they still were one and the same flag */
+ if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
+ dp_flags |= DP_RW_SYNC;
+ if (mdev->state.conn >= C_SYNC_SOURCE &&
+ mdev->state.conn <= C_PAUSED_SYNC_T)
+ dp_flags |= DP_MAY_SET_IN_SYNC;
+
+ p.dp_flags = cpu_to_be32(dp_flags);
+ set_bit(UNPLUG_REMOTE, &mdev->flags);
+ ok = (sizeof(p) ==
+ drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
+ if (ok && dgs) {
+ dgb = mdev->int_dig_out;
+ drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+ ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+ }
+ if (ok) {
+ if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+ ok = _drbd_send_bio(mdev, req->master_bio);
+ else
+ ok = _drbd_send_zc_bio(mdev, req->master_bio);
+ }
+
+ drbd_put_data_sock(mdev);
+ return ok;
+}
+
+/* answer packet, used to send data back for read requests:
+ * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
+ * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+ struct drbd_epoch_entry *e)
+{
+ int ok;
+ struct p_data p;
+ void *dgb;
+ int dgs;
+
+ dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+ crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+ p.head.magic = BE_DRBD_MAGIC;
+ p.head.command = cpu_to_be16(cmd);
+ p.head.length =
+ cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
+
+ p.sector = cpu_to_be64(e->sector);
+ p.block_id = e->block_id;
+ /* p.seq_num = 0; No sequence numbers here.. */
+
+ /* Only called by our kernel thread.
+ * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
+ * in response to admin command or module unload.
+ */
+ if (!drbd_get_data_sock(mdev))
+ return 0;
+
+ ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
+ sizeof(p), MSG_MORE);
+ if (ok && dgs) {
+ dgb = mdev->int_dig_out;
+ drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
+ ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+ }
+ if (ok)
+ ok = _drbd_send_zc_bio(mdev, e->private_bio);
+
+ drbd_put_data_sock(mdev);
+ return ok;
+}
+
+/*
+ drbd_send distinguishes two cases:
+
+ Packets sent via the data socket "sock"
+ and packets sent via the meta data socket "msock"
+
+ sock msock
+ -----------------+-------------------------+------------------------------
+ timeout conf.timeout / 2 conf.timeout / 2
+ timeout action send a ping via msock Abort communication
+ and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags)
+{
+ struct kvec iov;
+ struct msghdr msg;
+ int rv, sent = 0;
+
+ if (!sock)
+ return -1000;
+
+ /* THINK if (signal_pending) return ... ? */
+
+ iov.iov_base = buf;
+ iov.iov_len = size;
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (sock == mdev->data.socket) {
+ mdev->ko_count = mdev->net_conf->ko_count;
+ drbd_update_congested(mdev);
+ }
+ do {
+ /* STRANGE
+ * tcp_sendmsg does _not_ use its size parameter at all ?
+ *
+ * -EAGAIN on timeout, -EINTR on signal.
+ */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+ rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ if (rv == -EAGAIN) {
+ if (we_should_drop_the_connection(mdev, sock))
+ break;
+ else
+ continue;
+ }
+ D_ASSERT(rv != 0);
+ if (rv == -EINTR) {
+ flush_signals(current);
+ rv = 0;
+ }
+ if (rv < 0)
+ break;
+ sent += rv;
+ iov.iov_base += rv;
+ iov.iov_len -= rv;
+ } while (sent < size);
+
+ if (sock == mdev->data.socket)
+ clear_bit(NET_CONGESTED, &mdev->flags);
+
+ if (rv <= 0) {
+ if (rv != -EAGAIN) {
+ dev_err(DEV, "%s_sendmsg returned %d\n",
+ sock == mdev->meta.socket ? "msock" : "sock",
+ rv);
+ drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ } else
+ drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+ }
+
+ return sent;
+}
+
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct drbd_conf *mdev = bdev->bd_disk->private_data;
+ unsigned long flags;
+ int rv = 0;
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ /* to have a stable mdev->state.role
+ * and no race with updating open_cnt */
+
+ if (mdev->state.role != R_PRIMARY) {
+ if (mode & FMODE_WRITE)
+ rv = -EROFS;
+ else if (!allow_oos)
+ rv = -EMEDIUMTYPE;
+ }
+
+ if (!rv)
+ mdev->open_cnt++;
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ return rv;
+}
+
+static int drbd_release(struct gendisk *gd, fmode_t mode)
+{
+ struct drbd_conf *mdev = gd->private_data;
+ mdev->open_cnt--;
+ return 0;
+}
+
+static void drbd_unplug_fn(struct request_queue *q)
+{
+ struct drbd_conf *mdev = q->queuedata;
+
+ /* unplug FIRST */
+ spin_lock_irq(q->queue_lock);
+ blk_remove_plug(q);
+ spin_unlock_irq(q->queue_lock);
+
+ /* only if connected */
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
+ D_ASSERT(mdev->state.role == R_PRIMARY);
+ if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
+ /* add to the data.work queue,
+ * unless already queued.
+ * XXX this might be a good addition to drbd_queue_work
+ * anyways, to detect "double queuing" ... */
+ if (list_empty(&mdev->unplug_work.list))
+ drbd_queue_work(&mdev->data.work,
+ &mdev->unplug_work);
+ }
+ }
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (mdev->state.disk >= D_INCONSISTENT)
+ drbd_kick_lo(mdev);
+}
+
+static void drbd_set_defaults(struct drbd_conf *mdev)
+{
+ mdev->sync_conf.after = DRBD_AFTER_DEF;
+ mdev->sync_conf.rate = DRBD_RATE_DEF;
+ mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
+ mdev->state = (union drbd_state) {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = C_STANDALONE,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ .susp = 0
+ } };
+}
+
+void drbd_init_set_defaults(struct drbd_conf *mdev)
+{
+ /* the memset(,0,) did most of this.
+ * note: only assignments, no allocation in here */
+
+ drbd_set_defaults(mdev);
+
+ /* for now, we do NOT yet support it,
+ * even though we start some framework
+ * to eventually support barriers */
+ set_bit(NO_BARRIER_SUPP, &mdev->flags);
+
+ atomic_set(&mdev->ap_bio_cnt, 0);
+ atomic_set(&mdev->ap_pending_cnt, 0);
+ atomic_set(&mdev->rs_pending_cnt, 0);
+ atomic_set(&mdev->unacked_cnt, 0);
+ atomic_set(&mdev->local_cnt, 0);
+ atomic_set(&mdev->net_cnt, 0);
+ atomic_set(&mdev->packet_seq, 0);
+ atomic_set(&mdev->pp_in_use, 0);
+
+ mutex_init(&mdev->md_io_mutex);
+ mutex_init(&mdev->data.mutex);
+ mutex_init(&mdev->meta.mutex);
+ sema_init(&mdev->data.work.s, 0);
+ sema_init(&mdev->meta.work.s, 0);
+ mutex_init(&mdev->state_mutex);
+
+ spin_lock_init(&mdev->data.work.q_lock);
+ spin_lock_init(&mdev->meta.work.q_lock);
+
+ spin_lock_init(&mdev->al_lock);
+ spin_lock_init(&mdev->req_lock);
+ spin_lock_init(&mdev->peer_seq_lock);
+ spin_lock_init(&mdev->epoch_lock);
+
+ INIT_LIST_HEAD(&mdev->active_ee);
+ INIT_LIST_HEAD(&mdev->sync_ee);
+ INIT_LIST_HEAD(&mdev->done_ee);
+ INIT_LIST_HEAD(&mdev->read_ee);
+ INIT_LIST_HEAD(&mdev->net_ee);
+ INIT_LIST_HEAD(&mdev->resync_reads);
+ INIT_LIST_HEAD(&mdev->data.work.q);
+ INIT_LIST_HEAD(&mdev->meta.work.q);
+ INIT_LIST_HEAD(&mdev->resync_work.list);
+ INIT_LIST_HEAD(&mdev->unplug_work.list);
+ INIT_LIST_HEAD(&mdev->md_sync_work.list);
+ INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+ mdev->resync_work.cb = w_resync_inactive;
+ mdev->unplug_work.cb = w_send_write_hint;
+ mdev->md_sync_work.cb = w_md_sync;
+ mdev->bm_io_work.w.cb = w_bitmap_io;
+ init_timer(&mdev->resync_timer);
+ init_timer(&mdev->md_sync_timer);
+ mdev->resync_timer.function = resync_timer_fn;
+ mdev->resync_timer.data = (unsigned long) mdev;
+ mdev->md_sync_timer.function = md_sync_timer_fn;
+ mdev->md_sync_timer.data = (unsigned long) mdev;
+
+ init_waitqueue_head(&mdev->misc_wait);
+ init_waitqueue_head(&mdev->state_wait);
+ init_waitqueue_head(&mdev->ee_wait);
+ init_waitqueue_head(&mdev->al_wait);
+ init_waitqueue_head(&mdev->seq_wait);
+
+ drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
+ drbd_thread_init(mdev, &mdev->worker, drbd_worker);
+ drbd_thread_init(mdev, &mdev->asender, drbd_asender);
+
+ mdev->agreed_pro_version = PRO_VERSION_MAX;
+ mdev->write_ordering = WO_bio_barrier;
+ mdev->resync_wenr = LC_FREE;
+}
+
+void drbd_mdev_cleanup(struct drbd_conf *mdev)
+{
+ if (mdev->receiver.t_state != None)
+ dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+ mdev->receiver.t_state);
+
+ /* no need to lock it, I'm the only thread alive */
+ if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
+ dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
+ mdev->al_writ_cnt =
+ mdev->bm_writ_cnt =
+ mdev->read_cnt =
+ mdev->recv_cnt =
+ mdev->send_cnt =
+ mdev->writ_cnt =
+ mdev->p_size =
+ mdev->rs_start =
+ mdev->rs_total =
+ mdev->rs_failed =
+ mdev->rs_mark_left =
+ mdev->rs_mark_time = 0;
+ D_ASSERT(mdev->net_conf == NULL);
+
+ drbd_set_my_capacity(mdev, 0);
+ if (mdev->bitmap) {
+ /* maybe never allocated. */
+ drbd_bm_resize(mdev, 0);
+ drbd_bm_cleanup(mdev);
+ }
+
+ drbd_free_resources(mdev);
+
+ /*
+ * currently we drbd_init_ee only on module load, so
+ * we may do drbd_release_ee only on module unload!
+ */
+ D_ASSERT(list_empty(&mdev->active_ee));
+ D_ASSERT(list_empty(&mdev->sync_ee));
+ D_ASSERT(list_empty(&mdev->done_ee));
+ D_ASSERT(list_empty(&mdev->read_ee));
+ D_ASSERT(list_empty(&mdev->net_ee));
+ D_ASSERT(list_empty(&mdev->resync_reads));
+ D_ASSERT(list_empty(&mdev->data.work.q));
+ D_ASSERT(list_empty(&mdev->meta.work.q));
+ D_ASSERT(list_empty(&mdev->resync_work.list));
+ D_ASSERT(list_empty(&mdev->unplug_work.list));
+
+}
+
+
+static void drbd_destroy_mempools(void)
+{
+ struct page *page;
+
+ while (drbd_pp_pool) {
+ page = drbd_pp_pool;
+ drbd_pp_pool = (struct page *)page_private(page);
+ __free_page(page);
+ drbd_pp_vacant--;
+ }
+
+ /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+
+ if (drbd_ee_mempool)
+ mempool_destroy(drbd_ee_mempool);
+ if (drbd_request_mempool)
+ mempool_destroy(drbd_request_mempool);
+ if (drbd_ee_cache)
+ kmem_cache_destroy(drbd_ee_cache);
+ if (drbd_request_cache)
+ kmem_cache_destroy(drbd_request_cache);
+ if (drbd_bm_ext_cache)
+ kmem_cache_destroy(drbd_bm_ext_cache);
+ if (drbd_al_ext_cache)
+ kmem_cache_destroy(drbd_al_ext_cache);
+
+ drbd_ee_mempool = NULL;
+ drbd_request_mempool = NULL;
+ drbd_ee_cache = NULL;
+ drbd_request_cache = NULL;
+ drbd_bm_ext_cache = NULL;
+ drbd_al_ext_cache = NULL;
+
+ return;
+}
+
+static int drbd_create_mempools(void)
+{
+ struct page *page;
+ const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
+ int i;
+
+ /* prepare our caches and mempools */
+ drbd_request_mempool = NULL;
+ drbd_ee_cache = NULL;
+ drbd_request_cache = NULL;
+ drbd_bm_ext_cache = NULL;
+ drbd_al_ext_cache = NULL;
+ drbd_pp_pool = NULL;
+
+ /* caches */
+ drbd_request_cache = kmem_cache_create(
+ "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+ if (drbd_request_cache == NULL)
+ goto Enomem;
+
+ drbd_ee_cache = kmem_cache_create(
+ "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+ if (drbd_ee_cache == NULL)
+ goto Enomem;
+
+ drbd_bm_ext_cache = kmem_cache_create(
+ "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+ if (drbd_bm_ext_cache == NULL)
+ goto Enomem;
+
+ drbd_al_ext_cache = kmem_cache_create(
+ "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+ if (drbd_al_ext_cache == NULL)
+ goto Enomem;
+
+ /* mempools */
+ drbd_request_mempool = mempool_create(number,
+ mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
+ if (drbd_request_mempool == NULL)
+ goto Enomem;
+
+ drbd_ee_mempool = mempool_create(number,
+ mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
+ if (drbd_request_mempool == NULL)
+ goto Enomem;
+
+ /* drbd's page pool */
+ spin_lock_init(&drbd_pp_lock);
+
+ for (i = 0; i < number; i++) {
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ goto Enomem;
+ set_page_private(page, (unsigned long)drbd_pp_pool);
+ drbd_pp_pool = page;
+ }
+ drbd_pp_vacant = number;
+
+ return 0;
+
+Enomem:
+ drbd_destroy_mempools(); /* in case we allocated some */
+ return -ENOMEM;
+}
+
+static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
+ void *unused)
+{
+ /* just so we have it. you never know what interesting things we
+ * might want to do here some day...
+ */
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block drbd_notifier = {
+ .notifier_call = drbd_notify_sys,
+};
+
+static void drbd_release_ee_lists(struct drbd_conf *mdev)
+{
+ int rr;
+
+ rr = drbd_release_ee(mdev, &mdev->active_ee);
+ if (rr)
+ dev_err(DEV, "%d EEs in active list found!\n", rr);
+
+ rr = drbd_release_ee(mdev, &mdev->sync_ee);
+ if (rr)
+ dev_err(DEV, "%d EEs in sync list found!\n", rr);
+
+ rr = drbd_release_ee(mdev, &mdev->read_ee);
+ if (rr)
+ dev_err(DEV, "%d EEs in read list found!\n", rr);
+
+ rr = drbd_release_ee(mdev, &mdev->done_ee);
+ if (rr)
+ dev_err(DEV, "%d EEs in done list found!\n", rr);
+
+ rr = drbd_release_ee(mdev, &mdev->net_ee);
+ if (rr)
+ dev_err(DEV, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking.
+ * currently only used from module cleanup code. */
+static void drbd_delete_device(unsigned int minor)
+{
+ struct drbd_conf *mdev = minor_to_mdev(minor);
+
+ if (!mdev)
+ return;
+
+ /* paranoia asserts */
+ if (mdev->open_cnt != 0)
+ dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
+ __FILE__ , __LINE__);
+
+ ERR_IF (!list_empty(&mdev->data.work.q)) {
+ struct list_head *lp;
+ list_for_each(lp, &mdev->data.work.q) {
+ dev_err(DEV, "lp = %p\n", lp);
+ }
+ };
+ /* end paranoia asserts */
+
+ del_gendisk(mdev->vdisk);
+
+ /* cleanup stuff that may have been allocated during
+ * device (re-)configuration or state changes */
+
+ if (mdev->this_bdev)
+ bdput(mdev->this_bdev);
+
+ drbd_free_resources(mdev);
+
+ drbd_release_ee_lists(mdev);
+
+ /* should be free'd on disconnect? */
+ kfree(mdev->ee_hash);
+ /*
+ mdev->ee_hash_s = 0;
+ mdev->ee_hash = NULL;
+ */
+
+ lc_destroy(mdev->act_log);
+ lc_destroy(mdev->resync);
+
+ kfree(mdev->p_uuid);
+ /* mdev->p_uuid = NULL; */
+
+ kfree(mdev->int_dig_out);
+ kfree(mdev->int_dig_in);
+ kfree(mdev->int_dig_vv);
+
+ /* cleanup the rest that has been
+ * allocated from drbd_new_device
+ * and actually free the mdev itself */
+ drbd_free_mdev(mdev);
+}
+
+static void drbd_cleanup(void)
+{
+ unsigned int i;
+
+ unregister_reboot_notifier(&drbd_notifier);
+
+ drbd_nl_cleanup();
+
+ if (minor_table) {
+ if (drbd_proc)
+ remove_proc_entry("drbd", NULL);
+ i = minor_count;
+ while (i--)
+ drbd_delete_device(i);
+ drbd_destroy_mempools();
+ }
+
+ kfree(minor_table);
+
+ unregister_blkdev(DRBD_MAJOR, "drbd");
+
+ printk(KERN_INFO "drbd: module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for pdflush
+ * @congested_data: User data
+ * @bdi_bits: Bits pdflush is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+ struct drbd_conf *mdev = congested_data;
+ struct request_queue *q;
+ char reason = '-';
+ int r = 0;
+
+ if (!__inc_ap_bio_cond(mdev)) {
+ /* DRBD has frozen IO */
+ r = bdi_bits;
+ reason = 'd';
+ goto out;
+ }
+
+ if (get_ldev(mdev)) {
+ q = bdev_get_queue(mdev->ldev->backing_bdev);
+ r = bdi_congested(&q->backing_dev_info, bdi_bits);
+ put_ldev(mdev);
+ if (r)
+ reason = 'b';
+ }
+
+ if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
+ r |= (1 << BDI_async_congested);
+ reason = reason == 'b' ? 'a' : 'n';
+ }
+
+out:
+ mdev->congestion_reason = reason;
+ return r;
+}
+
+struct drbd_conf *drbd_new_device(unsigned int minor)
+{
+ struct drbd_conf *mdev;
+ struct gendisk *disk;
+ struct request_queue *q;
+
+ /* GFP_KERNEL, we are outside of all write-out paths */
+ mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
+ if (!mdev)
+ return NULL;
+ if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
+ goto out_no_cpumask;
+
+ mdev->minor = minor;
+
+ drbd_init_set_defaults(mdev);
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q)
+ goto out_no_q;
+ mdev->rq_queue = q;
+ q->queuedata = mdev;
+ blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
+
+ disk = alloc_disk(1);
+ if (!disk)
+ goto out_no_disk;
+ mdev->vdisk = disk;
+
+ set_disk_ro(disk, TRUE);
+
+ disk->queue = q;
+ disk->major = DRBD_MAJOR;
+ disk->first_minor = minor;
+ disk->fops = &drbd_ops;
+ sprintf(disk->disk_name, "drbd%d", minor);
+ disk->private_data = mdev;
+
+ mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+ /* we have no partitions. we contain only ourselves. */
+ mdev->this_bdev->bd_contains = mdev->this_bdev;
+
+ q->backing_dev_info.congested_fn = drbd_congested;
+ q->backing_dev_info.congested_data = mdev;
+
+ blk_queue_make_request(q, drbd_make_request_26);
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+ blk_queue_merge_bvec(q, drbd_merge_bvec);
+ q->queue_lock = &mdev->req_lock; /* needed since we use */
+ /* plugging on a queue, that actually has no requests! */
+ q->unplug_fn = drbd_unplug_fn;
+
+ mdev->md_io_page = alloc_page(GFP_KERNEL);
+ if (!mdev->md_io_page)
+ goto out_no_io_page;
+
+ if (drbd_bm_init(mdev))
+ goto out_no_bitmap;
+ /* no need to lock access, we are still initializing this minor device. */
+ if (!tl_init(mdev))
+ goto out_no_tl;
+
+ mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
+ if (!mdev->app_reads_hash)
+ goto out_no_app_reads;
+
+ mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+ if (!mdev->current_epoch)
+ goto out_no_epoch;
+
+ INIT_LIST_HEAD(&mdev->current_epoch->list);
+ mdev->epochs = 1;
+
+ return mdev;
+
+/* out_whatever_else:
+ kfree(mdev->current_epoch); */
+out_no_epoch:
+ kfree(mdev->app_reads_hash);
+out_no_app_reads:
+ tl_cleanup(mdev);
+out_no_tl:
+ drbd_bm_cleanup(mdev);
+out_no_bitmap:
+ __free_page(mdev->md_io_page);
+out_no_io_page:
+ put_disk(disk);
+out_no_disk:
+ blk_cleanup_queue(q);
+out_no_q:
+ free_cpumask_var(mdev->cpu_mask);
+out_no_cpumask:
+ kfree(mdev);
+ return NULL;
+}
+
+/* counterpart of drbd_new_device.
+ * last part of drbd_delete_device. */
+void drbd_free_mdev(struct drbd_conf *mdev)
+{
+ kfree(mdev->current_epoch);
+ kfree(mdev->app_reads_hash);
+ tl_cleanup(mdev);
+ if (mdev->bitmap) /* should no longer be there. */
+ drbd_bm_cleanup(mdev);
+ __free_page(mdev->md_io_page);
+ put_disk(mdev->vdisk);
+ blk_cleanup_queue(mdev->rq_queue);
+ free_cpumask_var(mdev->cpu_mask);
+ kfree(mdev);
+}
+
+
+int __init drbd_init(void)
+{
+ int err;
+
+ if (sizeof(struct p_handshake) != 80) {
+ printk(KERN_ERR
+ "drbd: never change the size or layout "
+ "of the HandShake packet.\n");
+ return -EINVAL;
+ }
+
+ if (1 > minor_count || minor_count > 255) {
+ printk(KERN_ERR
+ "drbd: invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+ return -EINVAL;
+#else
+ minor_count = 8;
+#endif
+ }
+
+ err = drbd_nl_init();
+ if (err)
+ return err;
+
+ err = register_blkdev(DRBD_MAJOR, "drbd");
+ if (err) {
+ printk(KERN_ERR
+ "drbd: unable to register block device major %d\n",
+ DRBD_MAJOR);
+ return err;
+ }
+
+ register_reboot_notifier(&drbd_notifier);
+
+ /*
+ * allocate all necessary structs
+ */
+ err = -ENOMEM;
+
+ init_waitqueue_head(&drbd_pp_wait);
+
+ drbd_proc = NULL; /* play safe for drbd_cleanup */
+ minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
+ GFP_KERNEL);
+ if (!minor_table)
+ goto Enomem;
+
+ err = drbd_create_mempools();
+ if (err)
+ goto Enomem;
+
+ drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+ if (!drbd_proc) {
+ printk(KERN_ERR "drbd: unable to register proc file\n");
+ goto Enomem;
+ }
+
+ rwlock_init(&global_state_lock);
+
+ printk(KERN_INFO "drbd: initialized. "
+ "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+ API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+ printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
+ printk(KERN_INFO "drbd: registered as block device major %d\n",
+ DRBD_MAJOR);
+ printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
+
+ return 0; /* Success! */
+
+Enomem:
+ drbd_cleanup();
+ if (err == -ENOMEM)
+ /* currently always the case */
+ printk(KERN_ERR "drbd: ran out of memory\n");
+ else
+ printk(KERN_ERR "drbd: initialization failure\n");
+ return err;
+}
+
+void drbd_free_bc(struct drbd_backing_dev *ldev)
+{
+ if (ldev == NULL)
+ return;
+
+ bd_release(ldev->backing_bdev);
+ bd_release(ldev->md_bdev);
+
+ fput(ldev->lo_file);
+ fput(ldev->md_file);
+
+ kfree(ldev);
+}
+
+void drbd_free_sock(struct drbd_conf *mdev)
+{
+ if (mdev->data.socket) {
+ kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
+ sock_release(mdev->data.socket);
+ mdev->data.socket = NULL;
+ }
+ if (mdev->meta.socket) {
+ kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
+ sock_release(mdev->meta.socket);
+ mdev->meta.socket = NULL;
+ }
+}
+
+
+void drbd_free_resources(struct drbd_conf *mdev)
+{
+ crypto_free_hash(mdev->csums_tfm);
+ mdev->csums_tfm = NULL;
+ crypto_free_hash(mdev->verify_tfm);
+ mdev->verify_tfm = NULL;
+ crypto_free_hash(mdev->cram_hmac_tfm);
+ mdev->cram_hmac_tfm = NULL;
+ crypto_free_hash(mdev->integrity_w_tfm);
+ mdev->integrity_w_tfm = NULL;
+ crypto_free_hash(mdev->integrity_r_tfm);
+ mdev->integrity_r_tfm = NULL;
+
+ drbd_free_sock(mdev);
+
+ __no_warn(local,
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;);
+}
+
+/* meta data management */
+
+struct meta_data_on_disk {
+ u64 la_size; /* last agreed size. */
+ u64 uuid[UI_SIZE]; /* UUIDs. */
+ u64 device_uuid;
+ u64 reserved_u64_1;
+ u32 flags; /* MDF */
+ u32 magic;
+ u32 md_size_sect;
+ u32 al_offset; /* offset to this block */
+ u32 al_nr_extents; /* important for restoring the AL */
+ /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+ u32 bm_offset; /* offset to the bitmap, from here */
+ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
+ u32 reserved_u32[4];
+
+} __packed;
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @mdev: DRBD device.
+ */
+void drbd_md_sync(struct drbd_conf *mdev)
+{
+ struct meta_data_on_disk *buffer;
+ sector_t sector;
+ int i;
+
+ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+ return;
+ del_timer(&mdev->md_sync_timer);
+
+ /* We use here D_FAILED and not D_ATTACHING because we try to write
+ * metadata even if we detach due to a disk failure! */
+ if (!get_ldev_if_state(mdev, D_FAILED))
+ return;
+
+ mutex_lock(&mdev->md_io_mutex);
+ buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+ memset(buffer, 0, 512);
+
+ buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
+ buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
+ buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+
+ buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
+ buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
+ buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
+ buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+ buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
+
+ buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
+
+ D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+ sector = mdev->ldev->md.md_offset;
+
+ if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ clear_bit(MD_DIRTY, &mdev->flags);
+ } else {
+ /* this was a try anyways ... */
+ dev_err(DEV, "meta data update failed!\n");
+
+ drbd_chk_io_error(mdev, 1, TRUE);
+ }
+
+ /* Update mdev->ldev->md.la_size_sect,
+ * since we updated it on metadata. */
+ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
+
+ mutex_unlock(&mdev->md_io_mutex);
+ put_ldev(mdev);
+}
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @mdev: DRBD device.
+ * @bdev: Device from which the meta data should be read in.
+ *
+ * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
+ * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ */
+int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+ struct meta_data_on_disk *buffer;
+ int i, rv = NO_ERROR;
+
+ if (!get_ldev_if_state(mdev, D_ATTACHING))
+ return ERR_IO_MD_DISK;
+
+ mutex_lock(&mdev->md_io_mutex);
+ buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+
+ if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+ /* NOTE: cant do normal error processing here as this is
+ called BEFORE disk is attached */
+ dev_err(DEV, "Error while reading metadata.\n");
+ rv = ERR_IO_MD_DISK;
+ goto err;
+ }
+
+ if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
+ dev_err(DEV, "Error while reading metadata, magic not found.\n");
+ rv = ERR_MD_INVALID;
+ goto err;
+ }
+ if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
+ dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
+ be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
+ rv = ERR_MD_INVALID;
+ goto err;
+ }
+ if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+ dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
+ be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+ rv = ERR_MD_INVALID;
+ goto err;
+ }
+ if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+ dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
+ be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+ rv = ERR_MD_INVALID;
+ goto err;
+ }
+
+ if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+ dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+ be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+ rv = ERR_MD_INVALID;
+ goto err;
+ }
+
+ bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+ bdev->md.flags = be32_to_cpu(buffer->flags);
+ mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
+ bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+ if (mdev->sync_conf.al_extents < 7)
+ mdev->sync_conf.al_extents = 127;
+
+ err:
+ mutex_unlock(&mdev->md_io_mutex);
+ put_ldev(mdev);
+
+ return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @mdev: DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync().
+ */
+void drbd_md_mark_dirty(struct drbd_conf *mdev)
+{
+ set_bit(MD_DIRTY, &mdev->flags);
+ mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+}
+
+
+static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
+{
+ int i;
+
+ for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+ mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+}
+
+void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+ if (idx == UI_CURRENT) {
+ if (mdev->state.role == R_PRIMARY)
+ val |= 1;
+ else
+ val &= ~((u64)1);
+
+ drbd_set_ed_uuid(mdev, val);
+ }
+
+ mdev->ldev->md.uuid[idx] = val;
+ drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+ if (mdev->ldev->md.uuid[idx]) {
+ drbd_uuid_move_history(mdev);
+ mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+ }
+ _drbd_uuid_set(mdev, idx, val);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @mdev: DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
+{
+ u64 val;
+
+ dev_info(DEV, "Creating new current UUID\n");
+ D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
+ mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+
+ get_random_bytes(&val, sizeof(u64));
+ _drbd_uuid_set(mdev, UI_CURRENT, val);
+}
+
+void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
+{
+ if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+ return;
+
+ if (val == 0) {
+ drbd_uuid_move_history(mdev);
+ mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
+ mdev->ldev->md.uuid[UI_BITMAP] = 0;
+ } else {
+ if (mdev->ldev->md.uuid[UI_BITMAP])
+ dev_warn(DEV, "bm UUID already set");
+
+ mdev->ldev->md.uuid[UI_BITMAP] = val;
+ mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
+
+ }
+ drbd_md_mark_dirty(mdev);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev: DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_conf *mdev)
+{
+ int rv = -EIO;
+
+ if (get_ldev_if_state(mdev, D_ATTACHING)) {
+ drbd_md_set_flag(mdev, MDF_FULL_SYNC);
+ drbd_md_sync(mdev);
+ drbd_bm_set_all(mdev);
+
+ rv = drbd_bm_write(mdev);
+
+ if (!rv) {
+ drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+ drbd_md_sync(mdev);
+ }
+
+ put_ldev(mdev);
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev: DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
+{
+ int rv = -EIO;
+
+ if (get_ldev_if_state(mdev, D_ATTACHING)) {
+ drbd_bm_clear_all(mdev);
+ rv = drbd_bm_write(mdev);
+ put_ldev(mdev);
+ }
+
+ return rv;
+}
+
+static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+ int rv;
+
+ D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+
+ drbd_bm_lock(mdev, work->why);
+ rv = work->io_fn(mdev);
+ drbd_bm_unlock(mdev);
+
+ clear_bit(BITMAP_IO, &mdev->flags);
+ wake_up(&mdev->misc_wait);
+
+ if (work->done)
+ work->done(mdev, rv);
+
+ clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
+ work->why = NULL;
+
+ return 1;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @mdev: DRBD device.
+ * @io_fn: IO callback to be called when bitmap IO is possible
+ * @done: callback to be called after the bitmap IO was performed
+ * @why: Descriptive text of the reason for doing the IO
+ *
+ * While IO on the bitmap happens we freeze application IO thus we ensure
+ * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ */
+void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+ int (*io_fn)(struct drbd_conf *),
+ void (*done)(struct drbd_conf *, int),
+ char *why)
+{
+ D_ASSERT(current == mdev->worker.task);
+
+ D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+ D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
+ D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
+ if (mdev->bm_io_work.why)
+ dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
+ why, mdev->bm_io_work.why);
+
+ mdev->bm_io_work.io_fn = io_fn;
+ mdev->bm_io_work.done = done;
+ mdev->bm_io_work.why = why;
+
+ set_bit(BITMAP_IO, &mdev->flags);
+ if (atomic_read(&mdev->ap_bio_cnt) == 0) {
+ if (list_empty(&mdev->bm_io_work.w.list)) {
+ set_bit(BITMAP_IO_QUEUED, &mdev->flags);
+ drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ } else
+ dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
+ }
+}
+
+/**
+ * drbd_bitmap_io() - Does an IO operation on the whole bitmap
+ * @mdev: DRBD device.
+ * @io_fn: IO callback to be called when bitmap IO is possible
+ * @why: Descriptive text of the reason for doing the IO
+ *
+ * freezes application IO while that the actual IO operations runs. This
+ * functions MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+{
+ int rv;
+
+ D_ASSERT(current != mdev->worker.task);
+
+ drbd_suspend_io(mdev);
+
+ drbd_bm_lock(mdev, why);
+ rv = io_fn(mdev);
+ drbd_bm_unlock(mdev);
+
+ drbd_resume_io(mdev);
+
+ return rv;
+}
+
+void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+ if ((mdev->ldev->md.flags & flag) != flag) {
+ drbd_md_mark_dirty(mdev);
+ mdev->ldev->md.flags |= flag;
+ }
+}
+
+void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+ if ((mdev->ldev->md.flags & flag) != 0) {
+ drbd_md_mark_dirty(mdev);
+ mdev->ldev->md.flags &= ~flag;
+ }
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+ return (bdev->md.flags & flag) != 0;
+}
+
+static void md_sync_timer_fn(unsigned long data)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+ drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+}
+
+static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+ drbd_md_sync(mdev);
+
+ return 1;
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+ unsigned long state;
+ unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801 /* prime */
+#define FAULT_RANDOM_ADD 479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+ long refresh;
+
+ if (--rsp->count < 0) {
+ get_random_bytes(&refresh, sizeof(refresh));
+ rsp->state += refresh;
+ rsp->count = FAULT_RANDOM_REFRESH;
+ }
+ rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+ return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+ static char *_faults[] = {
+ [DRBD_FAULT_MD_WR] = "Meta-data write",
+ [DRBD_FAULT_MD_RD] = "Meta-data read",
+ [DRBD_FAULT_RS_WR] = "Resync write",
+ [DRBD_FAULT_RS_RD] = "Resync read",
+ [DRBD_FAULT_DT_WR] = "Data write",
+ [DRBD_FAULT_DT_RD] = "Data read",
+ [DRBD_FAULT_DT_RA] = "Data read ahead",
+ [DRBD_FAULT_BM_ALLOC] = "BM allocation",
+ [DRBD_FAULT_AL_EE] = "EE allocation"
+ };
+
+ return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
+{
+ static struct fault_random_state rrs = {0, 0};
+
+ unsigned int ret = (
+ (fault_devs == 0 ||
+ ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
+ (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
+
+ if (ret) {
+ fault_count++;
+
+ if (printk_ratelimit())
+ dev_warn(DEV, "***Simulating %s failure\n",
+ _drbd_fault_str(type));
+ }
+
+ return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+ /* DRBD built from external sources has here a reference to the
+ git hash of the source code. */
+
+ static char buildtag[38] = "\0uilt-in";
+
+ if (buildtag[0] == 0) {
+#ifdef CONFIG_MODULES
+ if (THIS_MODULE != NULL)
+ sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+ else
+#endif
+ buildtag[0] = 'b';
+ }
+
+ return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000000..436a090b532b
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2364 @@
+/*
+ drbd_nl.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/connector.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_tag_magic.h>
+#include <linux/drbd_limits.h>
+
+static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
+static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
+static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
+
+/* see get_sb_bdev and bd_claim */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+/* Generate the tag_list to struct functions */
+#define NL_PACKET(name, number, fields) \
+static int name ## _from_tags(struct drbd_conf *mdev, \
+ unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
+static int name ## _from_tags(struct drbd_conf *mdev, \
+ unsigned short *tags, struct name *arg) \
+{ \
+ int tag; \
+ int dlen; \
+ \
+ while ((tag = get_unaligned(tags++)) != TT_END) { \
+ dlen = get_unaligned(tags++); \
+ switch (tag_number(tag)) { \
+ fields \
+ default: \
+ if (tag & T_MANDATORY) { \
+ dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
+ return 0; \
+ } \
+ } \
+ tags = (unsigned short *)((char *)tags + dlen); \
+ } \
+ return 1; \
+}
+#define NL_INTEGER(pn, pr, member) \
+ case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
+ arg->member = get_unaligned((int *)(tags)); \
+ break;
+#define NL_INT64(pn, pr, member) \
+ case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
+ arg->member = get_unaligned((u64 *)(tags)); \
+ break;
+#define NL_BIT(pn, pr, member) \
+ case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
+ arg->member = *(char *)(tags) ? 1 : 0; \
+ break;
+#define NL_STRING(pn, pr, member, len) \
+ case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
+ if (dlen > len) { \
+ dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
+ #member, dlen, (unsigned int)len); \
+ return 0; \
+ } \
+ arg->member ## _len = dlen; \
+ memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
+ break;
+#include "linux/drbd_nl.h"
+
+/* Generate the struct to tag_list functions */
+#define NL_PACKET(name, number, fields) \
+static unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+ struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
+static unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+ struct name *arg, unsigned short *tags) \
+{ \
+ fields \
+ return tags; \
+}
+
+#define NL_INTEGER(pn, pr, member) \
+ put_unaligned(pn | pr | TT_INTEGER, tags++); \
+ put_unaligned(sizeof(int), tags++); \
+ put_unaligned(arg->member, (int *)tags); \
+ tags = (unsigned short *)((char *)tags+sizeof(int));
+#define NL_INT64(pn, pr, member) \
+ put_unaligned(pn | pr | TT_INT64, tags++); \
+ put_unaligned(sizeof(u64), tags++); \
+ put_unaligned(arg->member, (u64 *)tags); \
+ tags = (unsigned short *)((char *)tags+sizeof(u64));
+#define NL_BIT(pn, pr, member) \
+ put_unaligned(pn | pr | TT_BIT, tags++); \
+ put_unaligned(sizeof(char), tags++); \
+ *(char *)tags = arg->member; \
+ tags = (unsigned short *)((char *)tags+sizeof(char));
+#define NL_STRING(pn, pr, member, len) \
+ put_unaligned(pn | pr | TT_STRING, tags++); \
+ put_unaligned(arg->member ## _len, tags++); \
+ memcpy(tags, arg->member, arg->member ## _len); \
+ tags = (unsigned short *)((char *)tags + arg->member ## _len);
+#include "linux/drbd_nl.h"
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
+void drbd_nl_send_reply(struct cn_msg *, int);
+
+int drbd_khelper(struct drbd_conf *mdev, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL, /* Will be set to address family */
+ NULL, /* Will be set to address */
+ NULL };
+
+ char mb[12], af[20], ad[60], *afs;
+ char *argv[] = {usermode_helper, cmd, mb, NULL };
+ int ret;
+
+ snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
+
+ if (get_net_conf(mdev)) {
+ switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
+ break;
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+ break;
+ default:
+ afs = "ssocks";
+ snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+ }
+ snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
+ envp[3]=af;
+ envp[4]=ad;
+ put_net_conf(mdev);
+ }
+
+ dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+
+ drbd_bcast_ev_helper(mdev, cmd);
+ ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+ if (ret)
+ dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, mb,
+ (ret >> 8) & 0xff, ret);
+ else
+ dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, mb,
+ (ret >> 8) & 0xff, ret);
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+{
+ char *ex_to_string;
+ int r;
+ enum drbd_disk_state nps;
+ enum drbd_fencing_p fp;
+
+ D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+
+ if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+ fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ } else {
+ dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
+ return mdev->state.pdsk;
+ }
+
+ if (fp == FP_STONITH)
+ _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
+
+ r = drbd_khelper(mdev, "fence-peer");
+
+ switch ((r>>8) & 0xff) {
+ case 3: /* peer is inconsistent */
+ ex_to_string = "peer is inconsistent or worse";
+ nps = D_INCONSISTENT;
+ break;
+ case 4: /* peer got outdated, or was already outdated */
+ ex_to_string = "peer was fenced";
+ nps = D_OUTDATED;
+ break;
+ case 5: /* peer was down */
+ if (mdev->state.disk == D_UP_TO_DATE) {
+ /* we will(have) create(d) a new UUID anyways... */
+ ex_to_string = "peer is unreachable, assumed to be dead";
+ nps = D_OUTDATED;
+ } else {
+ ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+ nps = mdev->state.pdsk;
+ }
+ break;
+ case 6: /* Peer is primary, voluntarily outdate myself.
+ * This is useful when an unconnected R_SECONDARY is asked to
+ * become R_PRIMARY, but finds the other peer being active. */
+ ex_to_string = "peer is active";
+ dev_warn(DEV, "Peer is primary, outdating myself.\n");
+ nps = D_UNKNOWN;
+ _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+ break;
+ case 7:
+ if (fp != FP_STONITH)
+ dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+ ex_to_string = "peer was stonithed";
+ nps = D_OUTDATED;
+ break;
+ default:
+ /* The script is broken ... */
+ nps = D_UNKNOWN;
+ dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+ return nps;
+ }
+
+ dev_info(DEV, "fence-peer helper returned %d (%s)\n",
+ (r>>8) & 0xff, ex_to_string);
+ return nps;
+}
+
+
+int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
+{
+ const int max_tries = 4;
+ int r = 0;
+ int try = 0;
+ int forced = 0;
+ union drbd_state mask, val;
+ enum drbd_disk_state nps;
+
+ if (new_role == R_PRIMARY)
+ request_ping(mdev); /* Detect a dead peer ASAP */
+
+ mutex_lock(&mdev->state_mutex);
+
+ mask.i = 0; mask.role = R_MASK;
+ val.i = 0; val.role = new_role;
+
+ while (try++ < max_tries) {
+ r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
+
+ /* in case we first succeeded to outdate,
+ * but now suddenly could establish a connection */
+ if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+ val.pdsk = 0;
+ mask.pdsk = 0;
+ continue;
+ }
+
+ if (r == SS_NO_UP_TO_DATE_DISK && force &&
+ (mdev->state.disk == D_INCONSISTENT ||
+ mdev->state.disk == D_OUTDATED)) {
+ mask.disk = D_MASK;
+ val.disk = D_UP_TO_DATE;
+ forced = 1;
+ continue;
+ }
+
+ if (r == SS_NO_UP_TO_DATE_DISK &&
+ mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+ D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+ nps = drbd_try_outdate_peer(mdev);
+
+ if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+ val.disk = D_UP_TO_DATE;
+ mask.disk = D_MASK;
+ }
+
+ val.pdsk = nps;
+ mask.pdsk = D_MASK;
+
+ continue;
+ }
+
+ if (r == SS_NOTHING_TO_DO)
+ goto fail;
+ if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
+ nps = drbd_try_outdate_peer(mdev);
+
+ if (force && nps > D_OUTDATED) {
+ dev_warn(DEV, "Forced into split brain situation!\n");
+ nps = D_OUTDATED;
+ }
+
+ mask.pdsk = D_MASK;
+ val.pdsk = nps;
+
+ continue;
+ }
+ if (r == SS_TWO_PRIMARIES) {
+ /* Maybe the peer is detected as dead very soon...
+ retry at most once more in this case. */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
+ if (try < max_tries)
+ try = max_tries - 1;
+ continue;
+ }
+ if (r < SS_SUCCESS) {
+ r = _drbd_request_state(mdev, mask, val,
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ if (r < SS_SUCCESS)
+ goto fail;
+ }
+ break;
+ }
+
+ if (r < SS_SUCCESS)
+ goto fail;
+
+ if (forced)
+ dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
+
+ /* Wait until nothing is on the fly :) */
+ wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+
+ if (new_role == R_SECONDARY) {
+ set_disk_ro(mdev->vdisk, TRUE);
+ if (get_ldev(mdev)) {
+ mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+ put_ldev(mdev);
+ }
+ } else {
+ if (get_net_conf(mdev)) {
+ mdev->net_conf->want_lose = 0;
+ put_net_conf(mdev);
+ }
+ set_disk_ro(mdev->vdisk, FALSE);
+ if (get_ldev(mdev)) {
+ if (((mdev->state.conn < C_CONNECTED ||
+ mdev->state.pdsk <= D_FAILED)
+ && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+ drbd_uuid_new_current(mdev);
+
+ mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ put_ldev(mdev);
+ }
+ }
+
+ if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
+ drbd_al_to_on_disk_bm(mdev);
+ put_ldev(mdev);
+ }
+
+ if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
+ /* if this was forced, we should consider sync */
+ if (forced)
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev);
+ }
+
+ drbd_md_sync(mdev);
+
+ kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+ fail:
+ mutex_unlock(&mdev->state_mutex);
+ return r;
+}
+
+
+static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ struct primary primary_args;
+
+ memset(&primary_args, 0, sizeof(struct primary));
+ if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
+ reply->ret_code = ERR_MANDATORY_TAG;
+ return 0;
+ }
+
+ reply->ret_code =
+ drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
+
+ return 0;
+}
+
+static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+
+ return 0;
+}
+
+/* initializes the md.*_offset members, so we are able to find
+ * the on disk meta data */
+static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev)
+{
+ sector_t md_size_sect = 0;
+ switch (bdev->dc.meta_dev_idx) {
+ default:
+ /* v07 style fixed size indexed meta data */
+ bdev->md.md_size_sect = MD_RESERVED_SECT;
+ bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+ bdev->md.al_offset = MD_AL_OFFSET;
+ bdev->md.bm_offset = MD_BM_OFFSET;
+ break;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ /* just occupy the full device; unit: sectors */
+ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+ bdev->md.md_offset = 0;
+ bdev->md.al_offset = MD_AL_OFFSET;
+ bdev->md.bm_offset = MD_BM_OFFSET;
+ break;
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+ /* al size is still fixed */
+ bdev->md.al_offset = -MD_AL_MAX_SIZE;
+ /* we need (slightly less than) ~ this much bitmap sectors: */
+ md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+ md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+ md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+ md_size_sect = ALIGN(md_size_sect, 8);
+
+ /* plus the "drbd meta data super block",
+ * and the activity log; */
+ md_size_sect += MD_BM_OFFSET;
+
+ bdev->md.md_size_sect = md_size_sect;
+ /* bitmap offset is adjusted by 'super' block size */
+ bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
+ break;
+ }
+}
+
+char *ppsize(char *buf, unsigned long long size)
+{
+ /* Needs 9 bytes at max. */
+ static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+ int base = 0;
+ while (size >= 10000) {
+ /* shift + round */
+ size = (size >> 10) + !!(size & (1<<9));
+ base++;
+ }
+ sprintf(buf, "%lu %cB", (long)size, units[base]);
+
+ return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ * remote READ does inc_ap_bio, receiver would need to receive answer
+ * packet from remote to dec_ap_bio again.
+ * receiver receive_sizes(), comes here,
+ * waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ * (not connected, or bad/no disk on peer):
+ * see drbd_fail_request_early, ap_bio_cnt is zero.
+ * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ * peer may not initiate a resize.
+ */
+void drbd_suspend_io(struct drbd_conf *mdev)
+{
+ set_bit(SUSPEND_IO, &mdev->flags);
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_conf *mdev)
+{
+ clear_bit(SUSPEND_IO, &mdev->flags);
+ wake_up(&mdev->misc_wait);
+}
+
+/**
+ * drbd_determine_dev_size() - Sets the right device size obeying all constraints
+ * @mdev: DRBD device.
+ *
+ * Returns 0 on success, negative return values indicate errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
+{
+ sector_t prev_first_sect, prev_size; /* previous meta location */
+ sector_t la_size;
+ sector_t size;
+ char ppb[10];
+
+ int md_moved, la_size_changed;
+ enum determine_dev_size rv = unchanged;
+
+ /* race:
+ * application request passes inc_ap_bio,
+ * but then cannot get an AL-reference.
+ * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+ *
+ * to avoid that:
+ * Suspend IO right here.
+ * still lock the act_log to not trigger ASSERTs there.
+ */
+ drbd_suspend_io(mdev);
+
+ /* no wait necessary anymore, actually we could assert that */
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ prev_first_sect = drbd_md_first_sector(mdev->ldev);
+ prev_size = mdev->ldev->md.md_size_sect;
+ la_size = mdev->ldev->md.la_size_sect;
+
+ /* TODO: should only be some assert here, not (re)init... */
+ drbd_md_set_sector_offsets(mdev, mdev->ldev);
+
+ size = drbd_new_dev_size(mdev, mdev->ldev);
+
+ if (drbd_get_capacity(mdev->this_bdev) != size ||
+ drbd_bm_capacity(mdev) != size) {
+ int err;
+ err = drbd_bm_resize(mdev, size);
+ if (unlikely(err)) {
+ /* currently there is only one error: ENOMEM! */
+ size = drbd_bm_capacity(mdev)>>1;
+ if (size == 0) {
+ dev_err(DEV, "OUT OF MEMORY! "
+ "Could not allocate bitmap!\n");
+ } else {
+ dev_err(DEV, "BM resizing failed. "
+ "Leaving size unchanged at size = %lu KB\n",
+ (unsigned long)size);
+ }
+ rv = dev_size_error;
+ }
+ /* racy, see comments above. */
+ drbd_set_my_capacity(mdev, size);
+ mdev->ldev->md.la_size_sect = size;
+ dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+ (unsigned long long)size>>1);
+ }
+ if (rv == dev_size_error)
+ goto out;
+
+ la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+
+ md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
+ || prev_size != mdev->ldev->md.md_size_sect;
+
+ if (la_size_changed || md_moved) {
+ drbd_al_shrink(mdev); /* All extents inactive. */
+ dev_info(DEV, "Writing the whole bitmap, %s\n",
+ la_size_changed && md_moved ? "size changed and md moved" :
+ la_size_changed ? "size changed" : "md moved");
+ rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
+ drbd_md_mark_dirty(mdev);
+ }
+
+ if (size > la_size)
+ rv = grew;
+ if (size < la_size)
+ rv = shrunk;
+out:
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+ drbd_resume_io(mdev);
+
+ return rv;
+}
+
+sector_t
+drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+ sector_t p_size = mdev->p_size; /* partner's disk size. */
+ sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+ sector_t m_size; /* my size */
+ sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
+ sector_t size = 0;
+
+ m_size = drbd_get_max_capacity(bdev);
+
+ if (p_size && m_size) {
+ size = min_t(sector_t, p_size, m_size);
+ } else {
+ if (la_size) {
+ size = la_size;
+ if (m_size && m_size < size)
+ size = m_size;
+ if (p_size && p_size < size)
+ size = p_size;
+ } else {
+ if (m_size)
+ size = m_size;
+ if (p_size)
+ size = p_size;
+ }
+ }
+
+ if (size == 0)
+ dev_err(DEV, "Both nodes diskless!\n");
+
+ if (u_size) {
+ if (u_size > size)
+ dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
+ (unsigned long)u_size>>1, (unsigned long)size>>1);
+ else
+ size = u_size;
+ }
+
+ return size;
+}
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @mdev: DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after you called
+ * this function.
+ */
+static int drbd_check_al_size(struct drbd_conf *mdev)
+{
+ struct lru_cache *n, *t;
+ struct lc_element *e;
+ unsigned int in_use;
+ int i;
+
+ ERR_IF(mdev->sync_conf.al_extents < 7)
+ mdev->sync_conf.al_extents = 127;
+
+ if (mdev->act_log &&
+ mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+ return 0;
+
+ in_use = 0;
+ t = mdev->act_log;
+ n = lc_create("act_log", drbd_al_ext_cache,
+ mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+
+ if (n == NULL) {
+ dev_err(DEV, "Cannot allocate act_log lru!\n");
+ return -ENOMEM;
+ }
+ spin_lock_irq(&mdev->al_lock);
+ if (t) {
+ for (i = 0; i < t->nr_elements; i++) {
+ e = lc_element_by_index(t, i);
+ if (e->refcnt)
+ dev_err(DEV, "refcnt(%d)==%d\n",
+ e->lc_number, e->refcnt);
+ in_use += e->refcnt;
+ }
+ }
+ if (!in_use)
+ mdev->act_log = n;
+ spin_unlock_irq(&mdev->al_lock);
+ if (in_use) {
+ dev_err(DEV, "Activity log still in use!\n");
+ lc_destroy(n);
+ return -EBUSY;
+ } else {
+ if (t)
+ lc_destroy(t);
+ }
+ drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
+ return 0;
+}
+
+void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
+{
+ struct request_queue * const q = mdev->rq_queue;
+ struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
+ int max_segments = mdev->ldev->dc.max_bio_bvecs;
+
+ if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
+ max_seg_s = PAGE_SIZE;
+
+ max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
+
+ blk_queue_max_sectors(q, max_seg_s >> 9);
+ blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
+ blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
+ blk_queue_max_segment_size(q, max_seg_s);
+ blk_queue_logical_block_size(q, 512);
+ blk_queue_segment_boundary(q, PAGE_SIZE-1);
+ blk_stack_limits(&q->limits, &b->limits, 0);
+
+ if (b->merge_bvec_fn)
+ dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
+ b->merge_bvec_fn);
+ dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
+
+ if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+ dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+ q->backing_dev_info.ra_pages,
+ b->backing_dev_info.ra_pages);
+ q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+ }
+}
+
+/* serialize deconfig (worker exiting, doing cleanup)
+ * and reconfig (drbdsetup disk, drbdsetup net)
+ *
+ * wait for a potentially exiting worker, then restart it,
+ * or start a new one.
+ */
+static void drbd_reconfig_start(struct drbd_conf *mdev)
+{
+ wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
+ wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
+ drbd_thread_start(&mdev->worker);
+}
+
+/* if still unconfigured, stops worker again.
+ * if configured now, clears CONFIG_PENDING.
+ * wakes potential waiters */
+static void drbd_reconfig_done(struct drbd_conf *mdev)
+{
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.disk == D_DISKLESS &&
+ mdev->state.conn == C_STANDALONE &&
+ mdev->state.role == R_SECONDARY) {
+ set_bit(DEVICE_DYING, &mdev->flags);
+ drbd_thread_stop_nowait(&mdev->worker);
+ } else
+ clear_bit(CONFIG_PENDING, &mdev->flags);
+ spin_unlock_irq(&mdev->req_lock);
+ wake_up(&mdev->state_wait);
+}
+
+/* does always return 0;
+ * interesting return code is in reply->ret_code */
+static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ enum drbd_ret_codes retcode;
+ enum determine_dev_size dd;
+ sector_t max_possible_sectors;
+ sector_t min_md_device_sectors;
+ struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+ struct inode *inode, *inode2;
+ struct lru_cache *resync_lru = NULL;
+ union drbd_state ns, os;
+ int rv;
+ int cp_discovered = 0;
+ int logical_block_size;
+
+ drbd_reconfig_start(mdev);
+
+ /* if you want to reconfigure, please tear down first */
+ if (mdev->state.disk > D_DISKLESS) {
+ retcode = ERR_DISK_CONFIGURED;
+ goto fail;
+ }
+
+ /* allocation not in the IO path, cqueue thread context */
+ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+ if (!nbc) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
+ nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
+ nbc->dc.fencing = DRBD_FENCING_DEF;
+ nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
+
+ if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+ retcode = ERR_MANDATORY_TAG;
+ goto fail;
+ }
+
+ if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+ retcode = ERR_MD_IDX_INVALID;
+ goto fail;
+ }
+
+ nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
+ if (IS_ERR(nbc->lo_file)) {
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+ PTR_ERR(nbc->lo_file));
+ nbc->lo_file = NULL;
+ retcode = ERR_OPEN_DISK;
+ goto fail;
+ }
+
+ inode = nbc->lo_file->f_dentry->d_inode;
+
+ if (!S_ISBLK(inode->i_mode)) {
+ retcode = ERR_DISK_NOT_BDEV;
+ goto fail;
+ }
+
+ nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
+ if (IS_ERR(nbc->md_file)) {
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+ PTR_ERR(nbc->md_file));
+ nbc->md_file = NULL;
+ retcode = ERR_OPEN_MD_DISK;
+ goto fail;
+ }
+
+ inode2 = nbc->md_file->f_dentry->d_inode;
+
+ if (!S_ISBLK(inode2->i_mode)) {
+ retcode = ERR_MD_NOT_BDEV;
+ goto fail;
+ }
+
+ nbc->backing_bdev = inode->i_bdev;
+ if (bd_claim(nbc->backing_bdev, mdev)) {
+ printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
+ nbc->backing_bdev, mdev,
+ nbc->backing_bdev->bd_holder,
+ nbc->backing_bdev->bd_contains->bd_holder,
+ nbc->backing_bdev->bd_holders);
+ retcode = ERR_BDCLAIM_DISK;
+ goto fail;
+ }
+
+ resync_lru = lc_create("resync", drbd_bm_ext_cache,
+ 61, sizeof(struct bm_extent),
+ offsetof(struct bm_extent, lce));
+ if (!resync_lru) {
+ retcode = ERR_NOMEM;
+ goto release_bdev_fail;
+ }
+
+ /* meta_dev_idx >= 0: external fixed size,
+ * possibly multiple drbd sharing one meta device.
+ * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
+ * not yet used by some other drbd minor!
+ * (if you use drbd.conf + drbdadm,
+ * that should check it for you already; but if you don't, or someone
+ * fooled it, we need to double check here) */
+ nbc->md_bdev = inode2->i_bdev;
+ if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
+ : (void *) drbd_m_holder)) {
+ retcode = ERR_BDCLAIM_MD_DISK;
+ goto release_bdev_fail;
+ }
+
+ if ((nbc->backing_bdev == nbc->md_bdev) !=
+ (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ retcode = ERR_MD_IDX_INVALID;
+ goto release_bdev2_fail;
+ }
+
+ /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
+ drbd_md_set_sector_offsets(mdev, nbc);
+
+ if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+ dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
+ (unsigned long long) drbd_get_max_capacity(nbc),
+ (unsigned long long) nbc->dc.disk_size);
+ retcode = ERR_DISK_TO_SMALL;
+ goto release_bdev2_fail;
+ }
+
+ if (nbc->dc.meta_dev_idx < 0) {
+ max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+ /* at least one MB, otherwise it does not make sense */
+ min_md_device_sectors = (2<<10);
+ } else {
+ max_possible_sectors = DRBD_MAX_SECTORS;
+ min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+ }
+
+ if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+ retcode = ERR_MD_DISK_TO_SMALL;
+ dev_warn(DEV, "refusing attach: md-device too small, "
+ "at least %llu sectors needed for this meta-disk type\n",
+ (unsigned long long) min_md_device_sectors);
+ goto release_bdev2_fail;
+ }
+
+ /* Make sure the new disk is big enough
+ * (we may currently be R_PRIMARY with no local disk...) */
+ if (drbd_get_max_capacity(nbc) <
+ drbd_get_capacity(mdev->this_bdev)) {
+ retcode = ERR_DISK_TO_SMALL;
+ goto release_bdev2_fail;
+ }
+
+ nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+ if (nbc->known_size > max_possible_sectors) {
+ dev_warn(DEV, "==> truncating very big lower level device "
+ "to currently maximum possible %llu sectors <==\n",
+ (unsigned long long) max_possible_sectors);
+ if (nbc->dc.meta_dev_idx >= 0)
+ dev_warn(DEV, "==>> using internal or flexible "
+ "meta data may help <<==\n");
+ }
+
+ drbd_suspend_io(mdev);
+ /* also wait for the last barrier ack. */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
+ /* and for any other previously queued work */
+ drbd_flush_workqueue(mdev);
+
+ retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
+ drbd_resume_io(mdev);
+ if (retcode < SS_SUCCESS)
+ goto release_bdev2_fail;
+
+ if (!get_ldev_if_state(mdev, D_ATTACHING))
+ goto force_diskless;
+
+ drbd_md_set_sector_offsets(mdev, nbc);
+
+ if (!mdev->bitmap) {
+ if (drbd_bm_init(mdev)) {
+ retcode = ERR_NOMEM;
+ goto force_diskless_dec;
+ }
+ }
+
+ retcode = drbd_md_read(mdev, nbc);
+ if (retcode != NO_ERROR)
+ goto force_diskless_dec;
+
+ if (mdev->state.conn < C_CONNECTED &&
+ mdev->state.role == R_PRIMARY &&
+ (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+ dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
+ (unsigned long long)mdev->ed_uuid);
+ retcode = ERR_DATA_NOT_CURRENT;
+ goto force_diskless_dec;
+ }
+
+ /* Since we are diskless, fix the activity log first... */
+ if (drbd_check_al_size(mdev)) {
+ retcode = ERR_NOMEM;
+ goto force_diskless_dec;
+ }
+
+ /* Prevent shrinking of consistent devices ! */
+ if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+ drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
+ dev_warn(DEV, "refusing to truncate a consistent device\n");
+ retcode = ERR_DISK_TO_SMALL;
+ goto force_diskless_dec;
+ }
+
+ if (!drbd_al_read_log(mdev, nbc)) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+
+ /* allocate a second IO page if logical_block_size != 512 */
+ logical_block_size = bdev_logical_block_size(nbc->md_bdev);
+ if (logical_block_size == 0)
+ logical_block_size = MD_SECTOR_SIZE;
+
+ if (logical_block_size != MD_SECTOR_SIZE) {
+ if (!mdev->md_io_tmpp) {
+ struct page *page = alloc_page(GFP_NOIO);
+ if (!page)
+ goto force_diskless_dec;
+
+ dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
+ logical_block_size, MD_SECTOR_SIZE);
+ dev_warn(DEV, "Workaround engaged (has performance impact).\n");
+
+ mdev->md_io_tmpp = page;
+ }
+ }
+
+ /* Reset the "barriers don't work" bits here, then force meta data to
+ * be written, to ensure we determine if barriers are supported. */
+ if (nbc->dc.no_md_flush)
+ set_bit(MD_NO_BARRIER, &mdev->flags);
+ else
+ clear_bit(MD_NO_BARRIER, &mdev->flags);
+
+ /* Point of no return reached.
+ * Devices and memory are no longer released by error cleanup below.
+ * now mdev takes over responsibility, and the state engine should
+ * clean it up somewhere. */
+ D_ASSERT(mdev->ldev == NULL);
+ mdev->ldev = nbc;
+ mdev->resync = resync_lru;
+ nbc = NULL;
+ resync_lru = NULL;
+
+ mdev->write_ordering = WO_bio_barrier;
+ drbd_bump_write_ordering(mdev, WO_bio_barrier);
+
+ if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
+ set_bit(CRASHED_PRIMARY, &mdev->flags);
+ else
+ clear_bit(CRASHED_PRIMARY, &mdev->flags);
+
+ if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
+ set_bit(CRASHED_PRIMARY, &mdev->flags);
+ cp_discovered = 1;
+ }
+
+ mdev->send_cnt = 0;
+ mdev->recv_cnt = 0;
+ mdev->read_cnt = 0;
+ mdev->writ_cnt = 0;
+
+ drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
+
+ /* If I am currently not R_PRIMARY,
+ * but meta data primary indicator is set,
+ * I just now recover from a hard crash,
+ * and have been R_PRIMARY before that crash.
+ *
+ * Now, if I had no connection before that crash
+ * (have been degraded R_PRIMARY), chances are that
+ * I won't find my peer now either.
+ *
+ * In that case, and _only_ in that case,
+ * we use the degr-wfc-timeout instead of the default,
+ * so we can automatically recover from a crash of a
+ * degraded but active "cluster" after a certain timeout.
+ */
+ clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+ if (mdev->state.role != R_PRIMARY &&
+ drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+ !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
+ set_bit(USE_DEGR_WFC_T, &mdev->flags);
+
+ dd = drbd_determin_dev_size(mdev);
+ if (dd == dev_size_error) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto force_diskless_dec;
+ } else if (dd == grew)
+ set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+
+ if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+ dev_info(DEV, "Assuming that all blocks are out of sync "
+ "(aka FullSync)\n");
+ if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+ } else {
+ if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+ }
+
+ if (cp_discovered) {
+ drbd_al_apply_to_bm(mdev);
+ drbd_al_to_on_disk_bm(mdev);
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ os = mdev->state;
+ ns.i = os.i;
+ /* If MDF_CONSISTENT is not set go into inconsistent state,
+ otherwise investigate MDF_WasUpToDate...
+ If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+ otherwise into D_CONSISTENT state.
+ */
+ if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
+ if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
+ ns.disk = D_CONSISTENT;
+ else
+ ns.disk = D_OUTDATED;
+ } else {
+ ns.disk = D_INCONSISTENT;
+ }
+
+ if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
+ ns.pdsk = D_OUTDATED;
+
+ if ( ns.disk == D_CONSISTENT &&
+ (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+ ns.disk = D_UP_TO_DATE;
+
+ /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+ MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+ this point, because drbd_request_state() modifies these
+ flags. */
+
+ /* In case we are C_CONNECTED postpone any decision on the new disk
+ state after the negotiation phase. */
+ if (mdev->state.conn == C_CONNECTED) {
+ mdev->new_state_tmp.i = ns.i;
+ ns.i = os.i;
+ ns.disk = D_NEGOTIATING;
+ }
+
+ rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+ ns = mdev->state;
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (rv < SS_SUCCESS)
+ goto force_diskless_dec;
+
+ if (mdev->state.role == R_PRIMARY)
+ mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ else
+ mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+ drbd_md_mark_dirty(mdev);
+ drbd_md_sync(mdev);
+
+ kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+ put_ldev(mdev);
+ reply->ret_code = retcode;
+ drbd_reconfig_done(mdev);
+ return 0;
+
+ force_diskless_dec:
+ put_ldev(mdev);
+ force_diskless:
+ drbd_force_state(mdev, NS(disk, D_DISKLESS));
+ drbd_md_sync(mdev);
+ release_bdev2_fail:
+ if (nbc)
+ bd_release(nbc->md_bdev);
+ release_bdev_fail:
+ if (nbc)
+ bd_release(nbc->backing_bdev);
+ fail:
+ if (nbc) {
+ if (nbc->lo_file)
+ fput(nbc->lo_file);
+ if (nbc->md_file)
+ fput(nbc->md_file);
+ kfree(nbc);
+ }
+ lc_destroy(resync_lru);
+
+ reply->ret_code = retcode;
+ drbd_reconfig_done(mdev);
+ return 0;
+}
+
+static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
+ return 0;
+}
+
+static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int i, ns;
+ enum drbd_ret_codes retcode;
+ struct net_conf *new_conf = NULL;
+ struct crypto_hash *tfm = NULL;
+ struct crypto_hash *integrity_w_tfm = NULL;
+ struct crypto_hash *integrity_r_tfm = NULL;
+ struct hlist_head *new_tl_hash = NULL;
+ struct hlist_head *new_ee_hash = NULL;
+ struct drbd_conf *odev;
+ char hmac_name[CRYPTO_MAX_ALG_NAME];
+ void *int_dig_out = NULL;
+ void *int_dig_in = NULL;
+ void *int_dig_vv = NULL;
+ struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+
+ drbd_reconfig_start(mdev);
+
+ if (mdev->state.conn > C_STANDALONE) {
+ retcode = ERR_NET_CONFIGURED;
+ goto fail;
+ }
+
+ /* allocation not in the IO path, cqueue thread context */
+ new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ memset(new_conf, 0, sizeof(struct net_conf));
+ new_conf->timeout = DRBD_TIMEOUT_DEF;
+ new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
+ new_conf->ping_int = DRBD_PING_INT_DEF;
+ new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
+ new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
+ new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
+ new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
+ new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
+ new_conf->ko_count = DRBD_KO_COUNT_DEF;
+ new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
+ new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
+ new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
+ new_conf->want_lose = 0;
+ new_conf->two_primaries = 0;
+ new_conf->wire_protocol = DRBD_PROT_C;
+ new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
+ new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
+
+ if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+ retcode = ERR_MANDATORY_TAG;
+ goto fail;
+ }
+
+ if (new_conf->two_primaries
+ && (new_conf->wire_protocol != DRBD_PROT_C)) {
+ retcode = ERR_NOT_PROTO_C;
+ goto fail;
+ };
+
+ if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
+ retcode = ERR_DISCARD;
+ goto fail;
+ }
+
+ retcode = NO_ERROR;
+
+ new_my_addr = (struct sockaddr *)&new_conf->my_addr;
+ new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
+ for (i = 0; i < minor_count; i++) {
+ odev = minor_to_mdev(i);
+ if (!odev || odev == mdev)
+ continue;
+ if (get_net_conf(odev)) {
+ taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
+ if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
+ !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
+ retcode = ERR_LOCAL_ADDR;
+
+ taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
+ if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
+ !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
+ retcode = ERR_PEER_ADDR;
+
+ put_net_conf(odev);
+ if (retcode != NO_ERROR)
+ goto fail;
+ }
+ }
+
+ if (new_conf->cram_hmac_alg[0] != 0) {
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ new_conf->cram_hmac_alg);
+ tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ tfm = NULL;
+ retcode = ERR_AUTH_ALG;
+ goto fail;
+ }
+
+ if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
+ != CRYPTO_ALG_TYPE_HASH) {
+ retcode = ERR_AUTH_ALG_ND;
+ goto fail;
+ }
+ }
+
+ if (new_conf->integrity_alg[0]) {
+ integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(integrity_w_tfm)) {
+ integrity_w_tfm = NULL;
+ retcode=ERR_INTEGRITY_ALG;
+ goto fail;
+ }
+
+ if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
+ retcode=ERR_INTEGRITY_ALG_ND;
+ goto fail;
+ }
+
+ integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(integrity_r_tfm)) {
+ integrity_r_tfm = NULL;
+ retcode=ERR_INTEGRITY_ALG;
+ goto fail;
+ }
+ }
+
+ ns = new_conf->max_epoch_size/8;
+ if (mdev->tl_hash_s != ns) {
+ new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+ if (!new_tl_hash) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
+ ns = new_conf->max_buffers/8;
+ if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
+ new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+ if (!new_ee_hash) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
+ ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+ if (integrity_w_tfm) {
+ i = crypto_hash_digestsize(integrity_w_tfm);
+ int_dig_out = kmalloc(i, GFP_KERNEL);
+ if (!int_dig_out) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ int_dig_in = kmalloc(i, GFP_KERNEL);
+ if (!int_dig_in) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ int_dig_vv = kmalloc(i, GFP_KERNEL);
+ if (!int_dig_vv) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
+ if (!mdev->bitmap) {
+ if(drbd_bm_init(mdev)) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->net_conf != NULL) {
+ retcode = ERR_NET_CONFIGURED;
+ spin_unlock_irq(&mdev->req_lock);
+ goto fail;
+ }
+ mdev->net_conf = new_conf;
+
+ mdev->send_cnt = 0;
+ mdev->recv_cnt = 0;
+
+ if (new_tl_hash) {
+ kfree(mdev->tl_hash);
+ mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
+ mdev->tl_hash = new_tl_hash;
+ }
+
+ if (new_ee_hash) {
+ kfree(mdev->ee_hash);
+ mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
+ mdev->ee_hash = new_ee_hash;
+ }
+
+ crypto_free_hash(mdev->cram_hmac_tfm);
+ mdev->cram_hmac_tfm = tfm;
+
+ crypto_free_hash(mdev->integrity_w_tfm);
+ mdev->integrity_w_tfm = integrity_w_tfm;
+
+ crypto_free_hash(mdev->integrity_r_tfm);
+ mdev->integrity_r_tfm = integrity_r_tfm;
+
+ kfree(mdev->int_dig_out);
+ kfree(mdev->int_dig_in);
+ kfree(mdev->int_dig_vv);
+ mdev->int_dig_out=int_dig_out;
+ mdev->int_dig_in=int_dig_in;
+ mdev->int_dig_vv=int_dig_vv;
+ spin_unlock_irq(&mdev->req_lock);
+
+ retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+ reply->ret_code = retcode;
+ drbd_reconfig_done(mdev);
+ return 0;
+
+fail:
+ kfree(int_dig_out);
+ kfree(int_dig_in);
+ kfree(int_dig_vv);
+ crypto_free_hash(tfm);
+ crypto_free_hash(integrity_w_tfm);
+ crypto_free_hash(integrity_r_tfm);
+ kfree(new_tl_hash);
+ kfree(new_ee_hash);
+ kfree(new_conf);
+
+ reply->ret_code = retcode;
+ drbd_reconfig_done(mdev);
+ return 0;
+}
+
+static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int retcode;
+
+ retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+
+ if (retcode == SS_NOTHING_TO_DO)
+ goto done;
+ else if (retcode == SS_ALREADY_STANDALONE)
+ goto done;
+ else if (retcode == SS_PRIMARY_NOP) {
+ /* Our statche checking code wants to see the peer outdated. */
+ retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+ pdsk, D_OUTDATED));
+ } else if (retcode == SS_CW_FAILED_BY_PEER) {
+ /* The peer probably wants to see us outdated. */
+ retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+ disk, D_OUTDATED),
+ CS_ORDERED);
+ if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ retcode = SS_SUCCESS;
+ }
+ }
+
+ if (retcode < SS_SUCCESS)
+ goto fail;
+
+ if (wait_event_interruptible(mdev->state_wait,
+ mdev->state.conn != C_DISCONNECTING)) {
+ /* Do not test for mdev->state.conn == C_STANDALONE, since
+ someone else might connect us in the mean time! */
+ retcode = ERR_INTR;
+ goto fail;
+ }
+
+ done:
+ retcode = NO_ERROR;
+ fail:
+ drbd_md_sync(mdev);
+ reply->ret_code = retcode;
+ return 0;
+}
+
+void resync_after_online_grow(struct drbd_conf *mdev)
+{
+ int iass; /* I am sync source */
+
+ dev_info(DEV, "Resync of new storage after online grow\n");
+ if (mdev->state.role != mdev->state.peer)
+ iass = (mdev->state.role == R_PRIMARY);
+ else
+ iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+ if (iass)
+ drbd_start_resync(mdev, C_SYNC_SOURCE);
+ else
+ _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ struct resize rs;
+ int retcode = NO_ERROR;
+ int ldsc = 0; /* local disk size changed */
+ enum determine_dev_size dd;
+
+ memset(&rs, 0, sizeof(struct resize));
+ if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
+ retcode = ERR_MANDATORY_TAG;
+ goto fail;
+ }
+
+ if (mdev->state.conn > C_CONNECTED) {
+ retcode = ERR_RESIZE_RESYNC;
+ goto fail;
+ }
+
+ if (mdev->state.role == R_SECONDARY &&
+ mdev->state.peer == R_SECONDARY) {
+ retcode = ERR_NO_PRIMARY;
+ goto fail;
+ }
+
+ if (!get_ldev(mdev)) {
+ retcode = ERR_NO_DISK;
+ goto fail;
+ }
+
+ if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+ mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+ ldsc = 1;
+ }
+
+ mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+ dd = drbd_determin_dev_size(mdev);
+ drbd_md_sync(mdev);
+ put_ldev(mdev);
+ if (dd == dev_size_error) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto fail;
+ }
+
+ if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
+ if (dd == grew)
+ set_bit(RESIZE_PENDING, &mdev->flags);
+
+ drbd_send_uuids(mdev);
+ drbd_send_sizes(mdev, 1);
+ }
+
+ fail:
+ reply->ret_code = retcode;
+ return 0;
+}
+
+static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int retcode = NO_ERROR;
+ int err;
+ int ovr; /* online verify running */
+ int rsr; /* re-sync running */
+ struct crypto_hash *verify_tfm = NULL;
+ struct crypto_hash *csums_tfm = NULL;
+ struct syncer_conf sc;
+ cpumask_var_t new_cpu_mask;
+
+ if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
+ memset(&sc, 0, sizeof(struct syncer_conf));
+ sc.rate = DRBD_RATE_DEF;
+ sc.after = DRBD_AFTER_DEF;
+ sc.al_extents = DRBD_AL_EXTENTS_DEF;
+ } else
+ memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
+
+ if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
+ retcode = ERR_MANDATORY_TAG;
+ goto fail;
+ }
+
+ /* re-sync running */
+ rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
+ mdev->state.conn == C_SYNC_TARGET ||
+ mdev->state.conn == C_PAUSED_SYNC_S ||
+ mdev->state.conn == C_PAUSED_SYNC_T );
+
+ if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
+ retcode = ERR_CSUMS_RESYNC_RUNNING;
+ goto fail;
+ }
+
+ if (!rsr && sc.csums_alg[0]) {
+ csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(csums_tfm)) {
+ csums_tfm = NULL;
+ retcode = ERR_CSUMS_ALG;
+ goto fail;
+ }
+
+ if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
+ retcode = ERR_CSUMS_ALG_ND;
+ goto fail;
+ }
+ }
+
+ /* online verify running */
+ ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
+
+ if (ovr) {
+ if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
+ retcode = ERR_VERIFY_RUNNING;
+ goto fail;
+ }
+ }
+
+ if (!ovr && sc.verify_alg[0]) {
+ verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(verify_tfm)) {
+ verify_tfm = NULL;
+ retcode = ERR_VERIFY_ALG;
+ goto fail;
+ }
+
+ if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
+ retcode = ERR_VERIFY_ALG_ND;
+ goto fail;
+ }
+ }
+
+ /* silently ignore cpu mask on UP kernel */
+ if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
+ err = __bitmap_parse(sc.cpu_mask, 32, 0,
+ cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err) {
+ dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
+ retcode = ERR_CPU_MASK_PARSE;
+ goto fail;
+ }
+ }
+
+ ERR_IF (sc.rate < 1) sc.rate = 1;
+ ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
+#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
+ if (sc.al_extents > AL_MAX) {
+ dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
+ sc.al_extents = AL_MAX;
+ }
+#undef AL_MAX
+
+ /* most sanity checks done, try to assign the new sync-after
+ * dependency. need to hold the global lock in there,
+ * to avoid a race in the dependency loop check. */
+ retcode = drbd_alter_sa(mdev, sc.after);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ /* ok, assign the rest of it as well.
+ * lock against receive_SyncParam() */
+ spin_lock(&mdev->peer_seq_lock);
+ mdev->sync_conf = sc;
+
+ if (!rsr) {
+ crypto_free_hash(mdev->csums_tfm);
+ mdev->csums_tfm = csums_tfm;
+ csums_tfm = NULL;
+ }
+
+ if (!ovr) {
+ crypto_free_hash(mdev->verify_tfm);
+ mdev->verify_tfm = verify_tfm;
+ verify_tfm = NULL;
+ }
+ spin_unlock(&mdev->peer_seq_lock);
+
+ if (get_ldev(mdev)) {
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+ drbd_al_shrink(mdev);
+ err = drbd_check_al_size(mdev);
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ put_ldev(mdev);
+ drbd_md_sync(mdev);
+
+ if (err) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
+ if (mdev->state.conn >= C_CONNECTED)
+ drbd_send_sync_param(mdev, &sc);
+
+ if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
+ cpumask_copy(mdev->cpu_mask, new_cpu_mask);
+ drbd_calc_cpu_mask(mdev);
+ mdev->receiver.reset_cpu_mask = 1;
+ mdev->asender.reset_cpu_mask = 1;
+ mdev->worker.reset_cpu_mask = 1;
+ }
+
+ kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+fail:
+ free_cpumask_var(new_cpu_mask);
+ crypto_free_hash(csums_tfm);
+ crypto_free_hash(verify_tfm);
+ reply->ret_code = retcode;
+ return 0;
+}
+
+static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int retcode;
+
+ retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
+
+ if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
+ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+
+ while (retcode == SS_NEED_CONNECTION) {
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.conn < C_CONNECTED)
+ retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (retcode != SS_NEED_CONNECTION)
+ break;
+
+ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+ }
+
+ reply->ret_code = retcode;
+ return 0;
+}
+
+static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+
+ reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+
+ return 0;
+}
+
+static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int retcode = NO_ERROR;
+
+ if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+ retcode = ERR_PAUSE_IS_SET;
+
+ reply->ret_code = retcode;
+ return 0;
+}
+
+static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int retcode = NO_ERROR;
+
+ if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
+ retcode = ERR_PAUSE_IS_CLEAR;
+
+ reply->ret_code = retcode;
+ return 0;
+}
+
+static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
+
+ return 0;
+}
+
+static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+ return 0;
+}
+
+static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
+ return 0;
+}
+
+static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ unsigned short *tl;
+
+ tl = reply->tag_list;
+
+ if (get_ldev(mdev)) {
+ tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
+ put_ldev(mdev);
+ }
+
+ if (get_net_conf(mdev)) {
+ tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
+ put_net_conf(mdev);
+ }
+ tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
+
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ unsigned short *tl = reply->tag_list;
+ union drbd_state s = mdev->state;
+ unsigned long rs_left;
+ unsigned int res;
+
+ tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+
+ /* no local ref, no bitmap, no syncer progress. */
+ if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
+ if (get_ldev(mdev)) {
+ drbd_get_syncer_progress(mdev, &rs_left, &res);
+ tl = tl_add_int(tl, T_sync_progress, &res);
+ put_ldev(mdev);
+ }
+ }
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ unsigned short *tl;
+
+ tl = reply->tag_list;
+
+ if (get_ldev(mdev)) {
+ tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
+ tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
+ put_ldev(mdev);
+ }
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+/**
+ * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
+ * @mdev: DRBD device.
+ * @nlp: Netlink/connector packet from drbdsetup
+ * @reply: Reply packet for drbdsetup
+ */
+static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ unsigned short *tl;
+ char rv;
+
+ tl = reply->tag_list;
+
+ rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
+
+ tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ /* default to resume from last known position, if possible */
+ struct start_ov args =
+ { .start_sector = mdev->ov_start_sector };
+
+ if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
+ reply->ret_code = ERR_MANDATORY_TAG;
+ return 0;
+ }
+ /* w_make_ov_request expects position to be aligned */
+ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
+ reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+ return 0;
+}
+
+
+static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+ struct drbd_nl_cfg_reply *reply)
+{
+ int retcode = NO_ERROR;
+ int skip_initial_sync = 0;
+ int err;
+
+ struct new_c_uuid args;
+
+ memset(&args, 0, sizeof(struct new_c_uuid));
+ if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
+ reply->ret_code = ERR_MANDATORY_TAG;
+ return 0;
+ }
+
+ mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+
+ if (!get_ldev(mdev)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ /* this is "skip initial sync", assume to be clean */
+ if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+ mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+ dev_info(DEV, "Preparing to skip initial sync\n");
+ skip_initial_sync = 1;
+ } else if (mdev->state.conn != C_STANDALONE) {
+ retcode = ERR_CONNECTED;
+ goto out_dec;
+ }
+
+ drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+ drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
+
+ if (args.clear_bm) {
+ err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
+ if (err) {
+ dev_err(DEV, "Writing bitmap failed with %d\n",err);
+ retcode = ERR_IO_MD_DISK;
+ }
+ if (skip_initial_sync) {
+ drbd_send_uuids_skip_initial_sync(mdev);
+ _drbd_uuid_set(mdev, UI_BITMAP, 0);
+ spin_lock_irq(&mdev->req_lock);
+ _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ CS_VERBOSE, NULL);
+ spin_unlock_irq(&mdev->req_lock);
+ }
+ }
+
+ drbd_md_sync(mdev);
+out_dec:
+ put_ldev(mdev);
+out:
+ mutex_unlock(&mdev->state_mutex);
+
+ reply->ret_code = retcode;
+ return 0;
+}
+
+static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
+{
+ struct drbd_conf *mdev;
+
+ if (nlp->drbd_minor >= minor_count)
+ return NULL;
+
+ mdev = minor_to_mdev(nlp->drbd_minor);
+
+ if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
+ struct gendisk *disk = NULL;
+ mdev = drbd_new_device(nlp->drbd_minor);
+
+ spin_lock_irq(&drbd_pp_lock);
+ if (minor_table[nlp->drbd_minor] == NULL) {
+ minor_table[nlp->drbd_minor] = mdev;
+ disk = mdev->vdisk;
+ mdev = NULL;
+ } /* else: we lost the race */
+ spin_unlock_irq(&drbd_pp_lock);
+
+ if (disk) /* we won the race above */
+ /* in case we ever add a drbd_delete_device(),
+ * don't forget the del_gendisk! */
+ add_disk(disk);
+ else /* we lost the race above */
+ drbd_free_mdev(mdev);
+
+ mdev = minor_to_mdev(nlp->drbd_minor);
+ }
+
+ return mdev;
+}
+
+struct cn_handler_struct {
+ int (*function)(struct drbd_conf *,
+ struct drbd_nl_cfg_req *,
+ struct drbd_nl_cfg_reply *);
+ int reply_body_size;
+};
+
+static struct cn_handler_struct cnd_table[] = {
+ [ P_primary ] = { &drbd_nl_primary, 0 },
+ [ P_secondary ] = { &drbd_nl_secondary, 0 },
+ [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
+ [ P_detach ] = { &drbd_nl_detach, 0 },
+ [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
+ [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
+ [ P_resize ] = { &drbd_nl_resize, 0 },
+ [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
+ [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
+ [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
+ [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
+ [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
+ [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
+ [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
+ [ P_outdate ] = { &drbd_nl_outdate, 0 },
+ [ P_get_config ] = { &drbd_nl_get_config,
+ sizeof(struct syncer_conf_tag_len_struct) +
+ sizeof(struct disk_conf_tag_len_struct) +
+ sizeof(struct net_conf_tag_len_struct) },
+ [ P_get_state ] = { &drbd_nl_get_state,
+ sizeof(struct get_state_tag_len_struct) +
+ sizeof(struct sync_progress_tag_len_struct) },
+ [ P_get_uuids ] = { &drbd_nl_get_uuids,
+ sizeof(struct get_uuids_tag_len_struct) },
+ [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
+ sizeof(struct get_timeout_flag_tag_len_struct)},
+ [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
+ [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
+};
+
+static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+{
+ struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
+ struct cn_handler_struct *cm;
+ struct cn_msg *cn_reply;
+ struct drbd_nl_cfg_reply *reply;
+ struct drbd_conf *mdev;
+ int retcode, rr;
+ int reply_size = sizeof(struct cn_msg)
+ + sizeof(struct drbd_nl_cfg_reply)
+ + sizeof(short int);
+
+ if (!try_module_get(THIS_MODULE)) {
+ printk(KERN_ERR "drbd: try_module_get() failed!\n");
+ return;
+ }
+
+ if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
+ retcode = ERR_PERM;
+ goto fail;
+ }
+
+ mdev = ensure_mdev(nlp);
+ if (!mdev) {
+ retcode = ERR_MINOR_INVALID;
+ goto fail;
+ }
+
+ if (nlp->packet_type >= P_nl_after_last_packet) {
+ retcode = ERR_PACKET_NR;
+ goto fail;
+ }
+
+ cm = cnd_table + nlp->packet_type;
+
+ /* This may happen if packet number is 0: */
+ if (cm->function == NULL) {
+ retcode = ERR_PACKET_NR;
+ goto fail;
+ }
+
+ reply_size += cm->reply_body_size;
+
+ /* allocation not in the IO path, cqueue thread context */
+ cn_reply = kmalloc(reply_size, GFP_KERNEL);
+ if (!cn_reply) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
+
+ reply->packet_type =
+ cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
+ reply->minor = nlp->drbd_minor;
+ reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
+ /* reply->tag_list; might be modified by cm->function. */
+
+ rr = cm->function(mdev, nlp, reply);
+
+ cn_reply->id = req->id;
+ cn_reply->seq = req->seq;
+ cn_reply->ack = req->ack + 1;
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
+ cn_reply->flags = 0;
+
+ rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
+ if (rr && rr != -ESRCH)
+ printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+
+ kfree(cn_reply);
+ module_put(THIS_MODULE);
+ return;
+ fail:
+ drbd_nl_send_reply(req, retcode);
+ module_put(THIS_MODULE);
+}
+
+static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
+
+static unsigned short *
+__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
+ unsigned short len, int nul_terminated)
+{
+ unsigned short l = tag_descriptions[tag_number(tag)].max_len;
+ len = (len < l) ? len : l;
+ put_unaligned(tag, tl++);
+ put_unaligned(len, tl++);
+ memcpy(tl, data, len);
+ tl = (unsigned short*)((char*)tl + len);
+ if (nul_terminated)
+ *((char*)tl - 1) = 0;
+ return tl;
+}
+
+static unsigned short *
+tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
+{
+ return __tl_add_blob(tl, tag, data, len, 0);
+}
+
+static unsigned short *
+tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
+{
+ return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
+}
+
+static unsigned short *
+tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
+{
+ put_unaligned(tag, tl++);
+ switch(tag_type(tag)) {
+ case TT_INTEGER:
+ put_unaligned(sizeof(int), tl++);
+ put_unaligned(*(int *)val, (int *)tl);
+ tl = (unsigned short*)((char*)tl+sizeof(int));
+ break;
+ case TT_INT64:
+ put_unaligned(sizeof(u64), tl++);
+ put_unaligned(*(u64 *)val, (u64 *)tl);
+ tl = (unsigned short*)((char*)tl+sizeof(u64));
+ break;
+ default:
+ /* someone did something stupid. */
+ ;
+ }
+ return tl;
+}
+
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+{
+ char buffer[sizeof(struct cn_msg)+
+ sizeof(struct drbd_nl_cfg_reply)+
+ sizeof(struct get_state_tag_len_struct)+
+ sizeof(short int)];
+ struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+ struct drbd_nl_cfg_reply *reply =
+ (struct drbd_nl_cfg_reply *)cn_reply->data;
+ unsigned short *tl = reply->tag_list;
+
+ /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+ tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
+
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ cn_reply->id.idx = CN_IDX_DRBD;
+ cn_reply->id.val = CN_VAL_DRBD;
+
+ cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+ cn_reply->ack = 0; /* not used here. */
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+ (int)((char *)tl - (char *)reply->tag_list);
+ cn_reply->flags = 0;
+
+ reply->packet_type = P_get_state;
+ reply->minor = mdev_to_minor(mdev);
+ reply->ret_code = NO_ERROR;
+
+ cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+{
+ char buffer[sizeof(struct cn_msg)+
+ sizeof(struct drbd_nl_cfg_reply)+
+ sizeof(struct call_helper_tag_len_struct)+
+ sizeof(short int)];
+ struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+ struct drbd_nl_cfg_reply *reply =
+ (struct drbd_nl_cfg_reply *)cn_reply->data;
+ unsigned short *tl = reply->tag_list;
+
+ /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+ tl = tl_add_str(tl, T_helper, helper_name);
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ cn_reply->id.idx = CN_IDX_DRBD;
+ cn_reply->id.val = CN_VAL_DRBD;
+
+ cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+ cn_reply->ack = 0; /* not used here. */
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+ (int)((char *)tl - (char *)reply->tag_list);
+ cn_reply->flags = 0;
+
+ reply->packet_type = P_call_helper;
+ reply->minor = mdev_to_minor(mdev);
+ reply->ret_code = NO_ERROR;
+
+ cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ee(struct drbd_conf *mdev,
+ const char *reason, const int dgs,
+ const char* seen_hash, const char* calc_hash,
+ const struct drbd_epoch_entry* e)
+{
+ struct cn_msg *cn_reply;
+ struct drbd_nl_cfg_reply *reply;
+ struct bio_vec *bvec;
+ unsigned short *tl;
+ int i;
+
+ if (!e)
+ return;
+ if (!reason || !reason[0])
+ return;
+
+ /* apparently we have to memcpy twice, first to prepare the data for the
+ * struct cn_msg, then within cn_netlink_send from the cn_msg to the
+ * netlink skb. */
+ /* receiver thread context, which is not in the writeout path (of this node),
+ * but may be in the writeout path of the _other_ node.
+ * GFP_NOIO to avoid potential "distributed deadlock". */
+ cn_reply = kmalloc(
+ sizeof(struct cn_msg)+
+ sizeof(struct drbd_nl_cfg_reply)+
+ sizeof(struct dump_ee_tag_len_struct)+
+ sizeof(short int),
+ GFP_NOIO);
+
+ if (!cn_reply) {
+ dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
+ (unsigned long long)e->sector, e->size);
+ return;
+ }
+
+ reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
+ tl = reply->tag_list;
+
+ tl = tl_add_str(tl, T_dump_ee_reason, reason);
+ tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
+ tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
+ tl = tl_add_int(tl, T_ee_sector, &e->sector);
+ tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
+
+ put_unaligned(T_ee_data, tl++);
+ put_unaligned(e->size, tl++);
+
+ __bio_for_each_segment(bvec, e->private_bio, i, 0) {
+ void *d = kmap(bvec->bv_page);
+ memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
+ kunmap(bvec->bv_page);
+ tl=(unsigned short*)((char*)tl + bvec->bv_len);
+ }
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ cn_reply->id.idx = CN_IDX_DRBD;
+ cn_reply->id.val = CN_VAL_DRBD;
+
+ cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
+ cn_reply->ack = 0; // not used here.
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+ (int)((char*)tl - (char*)reply->tag_list);
+ cn_reply->flags = 0;
+
+ reply->packet_type = P_dump_ee;
+ reply->minor = mdev_to_minor(mdev);
+ reply->ret_code = NO_ERROR;
+
+ cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ kfree(cn_reply);
+}
+
+void drbd_bcast_sync_progress(struct drbd_conf *mdev)
+{
+ char buffer[sizeof(struct cn_msg)+
+ sizeof(struct drbd_nl_cfg_reply)+
+ sizeof(struct sync_progress_tag_len_struct)+
+ sizeof(short int)];
+ struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+ struct drbd_nl_cfg_reply *reply =
+ (struct drbd_nl_cfg_reply *)cn_reply->data;
+ unsigned short *tl = reply->tag_list;
+ unsigned long rs_left;
+ unsigned int res;
+
+ /* no local ref, no bitmap, no syncer progress, no broadcast. */
+ if (!get_ldev(mdev))
+ return;
+ drbd_get_syncer_progress(mdev, &rs_left, &res);
+ put_ldev(mdev);
+
+ tl = tl_add_int(tl, T_sync_progress, &res);
+ put_unaligned(TT_END, tl++); /* Close the tag list */
+
+ cn_reply->id.idx = CN_IDX_DRBD;
+ cn_reply->id.val = CN_VAL_DRBD;
+
+ cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+ cn_reply->ack = 0; /* not used here. */
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+ (int)((char *)tl - (char *)reply->tag_list);
+ cn_reply->flags = 0;
+
+ reply->packet_type = P_sync_progress;
+ reply->minor = mdev_to_minor(mdev);
+ reply->ret_code = NO_ERROR;
+
+ cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+int __init drbd_nl_init(void)
+{
+ static struct cb_id cn_id_drbd;
+ int err, try=10;
+
+ cn_id_drbd.val = CN_VAL_DRBD;
+ do {
+ cn_id_drbd.idx = cn_idx;
+ err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
+ if (!err)
+ break;
+ cn_idx = (cn_idx + CN_IDX_STEP);
+ } while (try--);
+
+ if (err) {
+ printk(KERN_ERR "drbd: cn_drbd failed to register\n");
+ return err;
+ }
+
+ return 0;
+}
+
+void drbd_nl_cleanup(void)
+{
+ static struct cb_id cn_id_drbd;
+
+ cn_id_drbd.idx = cn_idx;
+ cn_id_drbd.val = CN_VAL_DRBD;
+
+ cn_del_callback(&cn_id_drbd);
+}
+
+void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
+{
+ char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
+ struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+ struct drbd_nl_cfg_reply *reply =
+ (struct drbd_nl_cfg_reply *)cn_reply->data;
+ int rr;
+
+ cn_reply->id = req->id;
+
+ cn_reply->seq = req->seq;
+ cn_reply->ack = req->ack + 1;
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
+ cn_reply->flags = 0;
+
+ reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
+ reply->ret_code = ret_code;
+
+ rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ if (rr && rr != -ESRCH)
+ printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+}
+
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000000..bdd0b4943b10
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,265 @@
+/*
+ drbd_proc.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+static int drbd_proc_open(struct inode *inode, struct file *file);
+
+
+struct proc_dir_entry *drbd_proc;
+struct file_operations drbd_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = drbd_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+
+/*lge
+ * progress bars shamelessly adapted from driver/md/md.c
+ * output looks like
+ * [=====>..............] 33.5% (23456/123456)
+ * finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
+{
+ unsigned long db, dt, dbdt, rt, rs_left;
+ unsigned int res;
+ int i, x, y;
+
+ drbd_get_syncer_progress(mdev, &rs_left, &res);
+
+ x = res/50;
+ y = 20-x;
+ seq_printf(seq, "\t[");
+ for (i = 1; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+
+ seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
+ /* if more than 1 GB display in MB */
+ if (mdev->rs_total > 0x100000L)
+ seq_printf(seq, "(%lu/%lu)M\n\t",
+ (unsigned long) Bit2KB(rs_left >> 10),
+ (unsigned long) Bit2KB(mdev->rs_total >> 10));
+ else
+ seq_printf(seq, "(%lu/%lu)K\n\t",
+ (unsigned long) Bit2KB(rs_left),
+ (unsigned long) Bit2KB(mdev->rs_total));
+
+ /* see drivers/md/md.c
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = (jiffies - mdev->rs_mark_time) / HZ;
+
+ if (dt > 20) {
+ /* if we made no update to rs_mark_time for too long,
+ * we are stalled. show that. */
+ seq_printf(seq, "stalled\n");
+ return;
+ }
+
+ if (!dt)
+ dt++;
+ db = mdev->rs_mark_left - rs_left;
+ rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+ seq_printf(seq, "finish: %lu:%02lu:%02lu",
+ rt / 3600, (rt % 3600) / 60, rt % 60);
+
+ /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
+ dbdt = Bit2KB(db/dt);
+ if (dbdt > 1000)
+ seq_printf(seq, " speed: %ld,%03ld",
+ dbdt/1000, dbdt % 1000);
+ else
+ seq_printf(seq, " speed: %ld", dbdt);
+
+ /* mean speed since syncer started
+ * we do account for PausedSync periods */
+ dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+ if (dt <= 0)
+ dt = 1;
+ db = mdev->rs_total - rs_left;
+ dbdt = Bit2KB(db/dt);
+ if (dbdt > 1000)
+ seq_printf(seq, " (%ld,%03ld)",
+ dbdt/1000, dbdt % 1000);
+ else
+ seq_printf(seq, " (%ld)", dbdt);
+
+ seq_printf(seq, " K/sec\n");
+}
+
+static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
+{
+ struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+ seq_printf(seq, "%5d %s %s\n", bme->rs_left,
+ bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
+ bme->flags & BME_LOCKED ? "LOCKED" : "------"
+ );
+}
+
+static int drbd_seq_show(struct seq_file *seq, void *v)
+{
+ int i, hole = 0;
+ const char *sn;
+ struct drbd_conf *mdev;
+
+ static char write_ordering_chars[] = {
+ [WO_none] = 'n',
+ [WO_drain_io] = 'd',
+ [WO_bdev_flush] = 'f',
+ [WO_bio_barrier] = 'b',
+ };
+
+ seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+ API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+ /*
+ cs .. connection state
+ ro .. node role (local/remote)
+ ds .. disk state (local/remote)
+ protocol
+ various flags
+ ns .. network send
+ nr .. network receive
+ dw .. disk write
+ dr .. disk read
+ al .. activity log write count
+ bm .. bitmap update write count
+ pe .. pending (waiting for ack or data reply)
+ ua .. unack'd (still need to send ack or data reply)
+ ap .. application requests accepted, but not yet completed
+ ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+ wo .. write ordering mode currently in use
+ oos .. known out-of-sync kB
+ */
+
+ for (i = 0; i < minor_count; i++) {
+ mdev = minor_to_mdev(i);
+ if (!mdev) {
+ hole = 1;
+ continue;
+ }
+ if (hole) {
+ hole = 0;
+ seq_printf(seq, "\n");
+ }
+
+ sn = drbd_conn_str(mdev->state.conn);
+
+ if (mdev->state.conn == C_STANDALONE &&
+ mdev->state.disk == D_DISKLESS &&
+ mdev->state.role == R_SECONDARY) {
+ seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+ } else {
+ seq_printf(seq,
+ "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
+ " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+ "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+ i, sn,
+ drbd_role_str(mdev->state.role),
+ drbd_role_str(mdev->state.peer),
+ drbd_disk_str(mdev->state.disk),
+ drbd_disk_str(mdev->state.pdsk),
+ (mdev->net_conf == NULL ? ' ' :
+ (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
+ mdev->state.susp ? 's' : 'r',
+ mdev->state.aftr_isp ? 'a' : '-',
+ mdev->state.peer_isp ? 'p' : '-',
+ mdev->state.user_isp ? 'u' : '-',
+ mdev->congestion_reason ?: '-',
+ mdev->send_cnt/2,
+ mdev->recv_cnt/2,
+ mdev->writ_cnt/2,
+ mdev->read_cnt/2,
+ mdev->al_writ_cnt,
+ mdev->bm_writ_cnt,
+ atomic_read(&mdev->local_cnt),
+ atomic_read(&mdev->ap_pending_cnt) +
+ atomic_read(&mdev->rs_pending_cnt),
+ atomic_read(&mdev->unacked_cnt),
+ atomic_read(&mdev->ap_bio_cnt),
+ mdev->epochs,
+ write_ordering_chars[mdev->write_ordering]
+ );
+ seq_printf(seq, " oos:%lu\n",
+ Bit2KB(drbd_bm_total_weight(mdev)));
+ }
+ if (mdev->state.conn == C_SYNC_SOURCE ||
+ mdev->state.conn == C_SYNC_TARGET)
+ drbd_syncer_progress(mdev, seq);
+
+ if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+ seq_printf(seq, "\t%3d%% %lu/%lu\n",
+ (int)((mdev->rs_total-mdev->ov_left) /
+ (mdev->rs_total/100+1)),
+ mdev->rs_total - mdev->ov_left,
+ mdev->rs_total);
+
+ if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
+ lc_seq_printf_stats(seq, mdev->resync);
+ lc_seq_printf_stats(seq, mdev->act_log);
+ put_ldev(mdev);
+ }
+
+ if (proc_details >= 2) {
+ if (mdev->resync) {
+ lc_seq_dump_details(seq, mdev->resync, "rs_left",
+ resync_dump_detail);
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int drbd_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, drbd_seq_show, PDE(inode)->data);
+}
+
+/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000000..c548f24f54a1
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4426 @@
+/*
+ drbd_receiver.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+#include "drbd_vli.h"
+
+struct flush_work {
+ struct drbd_work w;
+ struct drbd_epoch *epoch;
+};
+
+enum finish_epoch {
+ FE_STILL_LIVE,
+ FE_DESTROYED,
+ FE_RECYCLED,
+};
+
+static int drbd_do_handshake(struct drbd_conf *mdev);
+static int drbd_do_auth(struct drbd_conf *mdev);
+
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+
+static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+ struct drbd_epoch *prev;
+ spin_lock(&mdev->epoch_lock);
+ prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
+ if (prev == epoch || prev == mdev->current_epoch)
+ prev = NULL;
+ spin_unlock(&mdev->epoch_lock);
+ return prev;
+}
+
+#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
+
+static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
+{
+ struct page *page = NULL;
+
+ /* Yes, testing drbd_pp_vacant outside the lock is racy.
+ * So what. It saves a spin_lock. */
+ if (drbd_pp_vacant > 0) {
+ spin_lock(&drbd_pp_lock);
+ page = drbd_pp_pool;
+ if (page) {
+ drbd_pp_pool = (struct page *)page_private(page);
+ set_page_private(page, 0); /* just to be polite */
+ drbd_pp_vacant--;
+ }
+ spin_unlock(&drbd_pp_lock);
+ }
+ /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ if (!page)
+ page = alloc_page(GFP_TRY);
+ if (page)
+ atomic_inc(&mdev->pp_in_use);
+ return page;
+}
+
+/* kick lower level device, if we have more than (arbitrary number)
+ * reference counts on it, which typically are locally submitted io
+ * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
+static void maybe_kick_lo(struct drbd_conf *mdev)
+{
+ if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
+ drbd_kick_lo(mdev);
+}
+
+static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+{
+ struct drbd_epoch_entry *e;
+ struct list_head *le, *tle;
+
+ /* The EEs are always appended to the end of the list. Since
+ they are sent in order over the wire, they have to finish
+ in order. As soon as we see the first not finished we can
+ stop to examine the list... */
+
+ list_for_each_safe(le, tle, &mdev->net_ee) {
+ e = list_entry(le, struct drbd_epoch_entry, w.list);
+ if (drbd_bio_has_active_page(e->private_bio))
+ break;
+ list_move(le, to_be_freed);
+ }
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
+{
+ LIST_HEAD(reclaimed);
+ struct drbd_epoch_entry *e, *t;
+
+ maybe_kick_lo(mdev);
+ spin_lock_irq(&mdev->req_lock);
+ reclaim_net_ee(mdev, &reclaimed);
+ spin_unlock_irq(&mdev->req_lock);
+
+ list_for_each_entry_safe(e, t, &reclaimed, w.list)
+ drbd_free_ee(mdev, e);
+}
+
+/**
+ * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
+ * @mdev: DRBD device.
+ * @retry: whether or not to retry allocation forever (or until signalled)
+ *
+ * Tries to allocate a page, first from our own page pool, then from the
+ * kernel, unless this allocation would exceed the max_buffers setting.
+ * If @retry is non-zero, retry until DRBD frees a page somewhere else.
+ */
+static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
+{
+ struct page *page = NULL;
+ DEFINE_WAIT(wait);
+
+ if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+ page = drbd_pp_first_page_or_try_alloc(mdev);
+ if (page)
+ return page;
+ }
+
+ for (;;) {
+ prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+ drbd_kick_lo_and_reclaim_net(mdev);
+
+ if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+ page = drbd_pp_first_page_or_try_alloc(mdev);
+ if (page)
+ break;
+ }
+
+ if (!retry)
+ break;
+
+ if (signal_pending(current)) {
+ dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+ break;
+ }
+
+ schedule();
+ }
+ finish_wait(&drbd_pp_wait, &wait);
+
+ return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
+ * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+{
+ int free_it;
+
+ spin_lock(&drbd_pp_lock);
+ if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
+ free_it = 1;
+ } else {
+ set_page_private(page, (unsigned long)drbd_pp_pool);
+ drbd_pp_pool = page;
+ drbd_pp_vacant++;
+ free_it = 0;
+ }
+ spin_unlock(&drbd_pp_lock);
+
+ atomic_dec(&mdev->pp_in_use);
+
+ if (free_it)
+ __free_page(page);
+
+ wake_up(&drbd_pp_wait);
+}
+
+static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
+{
+ struct page *p_to_be_freed = NULL;
+ struct page *page;
+ struct bio_vec *bvec;
+ int i;
+
+ spin_lock(&drbd_pp_lock);
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
+ set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
+ p_to_be_freed = bvec->bv_page;
+ } else {
+ set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
+ drbd_pp_pool = bvec->bv_page;
+ drbd_pp_vacant++;
+ }
+ }
+ spin_unlock(&drbd_pp_lock);
+ atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
+
+ while (p_to_be_freed) {
+ page = p_to_be_freed;
+ p_to_be_freed = (struct page *)page_private(page);
+ set_page_private(page, 0); /* just to be polite */
+ put_page(page);
+ }
+
+ wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_ee()
+ drbd_alloc_ee()
+ drbd_init_ee()
+ drbd_release_ee()
+ drbd_ee_fix_bhs()
+ drbd_process_done_ee()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+ u64 id,
+ sector_t sector,
+ unsigned int data_size,
+ gfp_t gfp_mask) __must_hold(local)
+{
+ struct request_queue *q;
+ struct drbd_epoch_entry *e;
+ struct page *page;
+ struct bio *bio;
+ unsigned int ds;
+
+ if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
+ return NULL;
+
+ e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ if (!e) {
+ if (!(gfp_mask & __GFP_NOWARN))
+ dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+ return NULL;
+ }
+
+ bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
+ if (!bio) {
+ if (!(gfp_mask & __GFP_NOWARN))
+ dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
+ goto fail1;
+ }
+
+ bio->bi_bdev = mdev->ldev->backing_bdev;
+ bio->bi_sector = sector;
+
+ ds = data_size;
+ while (ds) {
+ page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
+ if (!page) {
+ if (!(gfp_mask & __GFP_NOWARN))
+ dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
+ goto fail2;
+ }
+ if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
+ drbd_pp_free(mdev, page);
+ dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
+ "data_size=%u,ds=%u) failed\n",
+ (unsigned long long)sector, data_size, ds);
+
+ q = bdev_get_queue(bio->bi_bdev);
+ if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+ int l = q->merge_bvec_fn(q, &bvm,
+ &bio->bi_io_vec[bio->bi_vcnt]);
+ dev_err(DEV, "merge_bvec_fn() = %d\n", l);
+ }
+
+ /* dump more of the bio. */
+ dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
+ dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
+ dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
+ dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
+
+ goto fail2;
+ break;
+ }
+ ds -= min_t(int, ds, PAGE_SIZE);
+ }
+
+ D_ASSERT(data_size == bio->bi_size);
+
+ bio->bi_private = e;
+ e->mdev = mdev;
+ e->sector = sector;
+ e->size = bio->bi_size;
+
+ e->private_bio = bio;
+ e->block_id = id;
+ INIT_HLIST_NODE(&e->colision);
+ e->epoch = NULL;
+ e->flags = 0;
+
+ return e;
+
+ fail2:
+ drbd_pp_free_bio_pages(mdev, bio);
+ bio_put(bio);
+ fail1:
+ mempool_free(e, drbd_ee_mempool);
+
+ return NULL;
+}
+
+void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+ struct bio *bio = e->private_bio;
+ drbd_pp_free_bio_pages(mdev, bio);
+ bio_put(bio);
+ D_ASSERT(hlist_unhashed(&e->colision));
+ mempool_free(e, drbd_ee_mempool);
+}
+
+int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+{
+ LIST_HEAD(work_list);
+ struct drbd_epoch_entry *e, *t;
+ int count = 0;
+
+ spin_lock_irq(&mdev->req_lock);
+ list_splice_init(list, &work_list);
+ spin_unlock_irq(&mdev->req_lock);
+
+ list_for_each_entry_safe(e, t, &work_list, w.list) {
+ drbd_free_ee(mdev, e);
+ count++;
+ }
+ return count;
+}
+
+
+/*
+ * This function is called from _asender only_
+ * but see also comments in _req_mod(,barrier_acked)
+ * and receive_Barrier.
+ *
+ * Move entries from net_ee to done_ee, if ready.
+ * Grab done_ee, call all callbacks, free the entries.
+ * The callbacks typically send out ACKs.
+ */
+static int drbd_process_done_ee(struct drbd_conf *mdev)
+{
+ LIST_HEAD(work_list);
+ LIST_HEAD(reclaimed);
+ struct drbd_epoch_entry *e, *t;
+ int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+
+ spin_lock_irq(&mdev->req_lock);
+ reclaim_net_ee(mdev, &reclaimed);
+ list_splice_init(&mdev->done_ee, &work_list);
+ spin_unlock_irq(&mdev->req_lock);
+
+ list_for_each_entry_safe(e, t, &reclaimed, w.list)
+ drbd_free_ee(mdev, e);
+
+ /* possible callbacks here:
+ * e_end_block, and e_end_resync_block, e_send_discard_ack.
+ * all ignore the last argument.
+ */
+ list_for_each_entry_safe(e, t, &work_list, w.list) {
+ /* list_del not necessary, next/prev members not touched */
+ ok = e->w.cb(mdev, &e->w, !ok) && ok;
+ drbd_free_ee(mdev, e);
+ }
+ wake_up(&mdev->ee_wait);
+
+ return ok;
+}
+
+void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+ DEFINE_WAIT(wait);
+
+ /* avoids spin_lock/unlock
+ * and calling prepare_to_wait in the fast path */
+ while (!list_empty(head)) {
+ prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&mdev->req_lock);
+ drbd_kick_lo(mdev);
+ schedule();
+ finish_wait(&mdev->ee_wait, &wait);
+ spin_lock_irq(&mdev->req_lock);
+ }
+}
+
+void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+ spin_lock_irq(&mdev->req_lock);
+ _drbd_wait_ee_list_empty(mdev, head);
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+/* see also kernel_accept; which is only present since 2.6.18.
+ * also we want to log which part of it failed, exactly */
+static int drbd_accept(struct drbd_conf *mdev, const char **what,
+ struct socket *sock, struct socket **newsock)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ *what = "listen";
+ err = sock->ops->listen(sock, 5);
+ if (err < 0)
+ goto out;
+
+ *what = "sock_create_lite";
+ err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+ newsock);
+ if (err < 0)
+ goto out;
+
+ *what = "accept";
+ err = sock->ops->accept(sock, *newsock, 0);
+ if (err < 0) {
+ sock_release(*newsock);
+ *newsock = NULL;
+ goto out;
+ }
+ (*newsock)->ops = sock->ops;
+
+out:
+ return err;
+}
+
+static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
+ void *buf, size_t size, int flags)
+{
+ mm_segment_t oldfs;
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = size,
+ };
+ struct msghdr msg = {
+ .msg_iovlen = 1,
+ .msg_iov = (struct iovec *)&iov,
+ .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+ };
+ int rv;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
+ set_fs(oldfs);
+
+ return rv;
+}
+
+static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+{
+ mm_segment_t oldfs;
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = size,
+ };
+ struct msghdr msg = {
+ .msg_iovlen = 1,
+ .msg_iov = (struct iovec *)&iov,
+ .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
+ };
+ int rv;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+
+ for (;;) {
+ rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
+ if (rv == size)
+ break;
+
+ /* Note:
+ * ECONNRESET other side closed the connection
+ * ERESTARTSYS (on sock) we got a signal
+ */
+
+ if (rv < 0) {
+ if (rv == -ECONNRESET)
+ dev_info(DEV, "sock was reset by peer\n");
+ else if (rv != -ERESTARTSYS)
+ dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+ break;
+ } else if (rv == 0) {
+ dev_info(DEV, "sock was shut down by peer\n");
+ break;
+ } else {
+ /* signal came in, or peer/link went down,
+ * after we read a partial message
+ */
+ /* D_ASSERT(signal_pending(current)); */
+ break;
+ }
+ };
+
+ set_fs(oldfs);
+
+ if (rv != size)
+ drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+
+ return rv;
+}
+
+static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+{
+ const char *what;
+ struct socket *sock;
+ struct sockaddr_in6 src_in6;
+ int err;
+ int disconnect_on_error = 1;
+
+ if (!get_net_conf(mdev))
+ return NULL;
+
+ what = "sock_create_kern";
+ err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ sock = NULL;
+ goto out;
+ }
+
+ sock->sk->sk_rcvtimeo =
+ sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
+
+ /* explicitly bind to the configured IP as source IP
+ * for the outgoing connections.
+ * This is needed for multihomed hosts and to be
+ * able to use lo: interfaces for drbd.
+ * Make sure to use 0 as port number, so linux selects
+ * a free one dynamically.
+ */
+ memcpy(&src_in6, mdev->net_conf->my_addr,
+ min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
+ if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
+ src_in6.sin6_port = 0;
+ else
+ ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+ what = "bind before connect";
+ err = sock->ops->bind(sock,
+ (struct sockaddr *) &src_in6,
+ mdev->net_conf->my_addr_len);
+ if (err < 0)
+ goto out;
+
+ /* connect may fail, peer not yet available.
+ * stay C_WF_CONNECTION, don't go Disconnecting! */
+ disconnect_on_error = 0;
+ what = "connect";
+ err = sock->ops->connect(sock,
+ (struct sockaddr *)mdev->net_conf->peer_addr,
+ mdev->net_conf->peer_addr_len, 0);
+
+out:
+ if (err < 0) {
+ if (sock) {
+ sock_release(sock);
+ sock = NULL;
+ }
+ switch (-err) {
+ /* timeout, busy, signal pending */
+ case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+ case EINTR: case ERESTARTSYS:
+ /* peer not (yet) available, network problem */
+ case ECONNREFUSED: case ENETUNREACH:
+ case EHOSTDOWN: case EHOSTUNREACH:
+ disconnect_on_error = 0;
+ break;
+ default:
+ dev_err(DEV, "%s failed, err = %d\n", what, err);
+ }
+ if (disconnect_on_error)
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ }
+ put_net_conf(mdev);
+ return sock;
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+{
+ int timeo, err;
+ struct socket *s_estab = NULL, *s_listen;
+ const char *what;
+
+ if (!get_net_conf(mdev))
+ return NULL;
+
+ what = "sock_create_kern";
+ err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &s_listen);
+ if (err) {
+ s_listen = NULL;
+ goto out;
+ }
+
+ timeo = mdev->net_conf->try_connect_int * HZ;
+ timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
+
+ s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
+ s_listen->sk->sk_rcvtimeo = timeo;
+ s_listen->sk->sk_sndtimeo = timeo;
+
+ what = "bind before listen";
+ err = s_listen->ops->bind(s_listen,
+ (struct sockaddr *) mdev->net_conf->my_addr,
+ mdev->net_conf->my_addr_len);
+ if (err < 0)
+ goto out;
+
+ err = drbd_accept(mdev, &what, s_listen, &s_estab);
+
+out:
+ if (s_listen)
+ sock_release(s_listen);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ dev_err(DEV, "%s failed, err = %d\n", what, err);
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ }
+ }
+ put_net_conf(mdev);
+
+ return s_estab;
+}
+
+static int drbd_send_fp(struct drbd_conf *mdev,
+ struct socket *sock, enum drbd_packets cmd)
+{
+ struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+
+ return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+}
+
+static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+{
+ struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ int rr;
+
+ rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+
+ if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
+ return be16_to_cpu(h->command);
+
+ return 0xffff;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @mdev: DRBD device.
+ * @sock: pointer to the pointer to the socket.
+ */
+static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+{
+ int rr;
+ char tb[4];
+
+ if (!*sock)
+ return FALSE;
+
+ rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+ if (rr > 0 || rr == -EAGAIN) {
+ return TRUE;
+ } else {
+ sock_release(*sock);
+ *sock = NULL;
+ return FALSE;
+ }
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks different language,
+ * no point in trying again, please go standalone.
+ * -2 We do not have a network config...
+ */
+static int drbd_connect(struct drbd_conf *mdev)
+{
+ struct socket *s, *sock, *msock;
+ int try, h, ok;
+
+ D_ASSERT(!mdev->data.socket);
+
+ if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
+ dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
+
+ if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+ return -2;
+
+ clear_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+ sock = NULL;
+ msock = NULL;
+
+ do {
+ for (try = 0;;) {
+ /* 3 tries, this should take less than a second! */
+ s = drbd_try_connect(mdev);
+ if (s || ++try >= 3)
+ break;
+ /* give the other side time to call bind() & listen() */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ / 10);
+ }
+
+ if (s) {
+ if (!sock) {
+ drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
+ sock = s;
+ s = NULL;
+ } else if (!msock) {
+ drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
+ msock = s;
+ s = NULL;
+ } else {
+ dev_err(DEV, "Logic error in drbd_connect()\n");
+ goto out_release_sockets;
+ }
+ }
+
+ if (sock && msock) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ / 10);
+ ok = drbd_socket_okay(mdev, &sock);
+ ok = drbd_socket_okay(mdev, &msock) && ok;
+ if (ok)
+ break;
+ }
+
+retry:
+ s = drbd_wait_for_connect(mdev);
+ if (s) {
+ try = drbd_recv_fp(mdev, s);
+ drbd_socket_okay(mdev, &sock);
+ drbd_socket_okay(mdev, &msock);
+ switch (try) {
+ case P_HAND_SHAKE_S:
+ if (sock) {
+ dev_warn(DEV, "initial packet S crossed\n");
+ sock_release(sock);
+ }
+ sock = s;
+ break;
+ case P_HAND_SHAKE_M:
+ if (msock) {
+ dev_warn(DEV, "initial packet M crossed\n");
+ sock_release(msock);
+ }
+ msock = s;
+ set_bit(DISCARD_CONCURRENT, &mdev->flags);
+ break;
+ default:
+ dev_warn(DEV, "Error receiving initial packet\n");
+ sock_release(s);
+ if (random32() & 1)
+ goto retry;
+ }
+ }
+
+ if (mdev->state.conn <= C_DISCONNECTING)
+ goto out_release_sockets;
+ if (signal_pending(current)) {
+ flush_signals(current);
+ smp_rmb();
+ if (get_t_state(&mdev->receiver) == Exiting)
+ goto out_release_sockets;
+ }
+
+ if (sock && msock) {
+ ok = drbd_socket_okay(mdev, &sock);
+ ok = drbd_socket_okay(mdev, &msock) && ok;
+ if (ok)
+ break;
+ }
+ } while (1);
+
+ msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+ sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+
+ sock->sk->sk_allocation = GFP_NOIO;
+ msock->sk->sk_allocation = GFP_NOIO;
+
+ sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+ if (mdev->net_conf->sndbuf_size) {
+ sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
+ sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ }
+
+ if (mdev->net_conf->rcvbuf_size) {
+ sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
+ sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+
+ /* NOT YET ...
+ * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+ * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_HAND_SHAKE timeout,
+ * which we set to 4x the configured ping_timeout. */
+ sock->sk->sk_sndtimeo =
+ sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+
+ msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+ msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+ /* we don't want delays.
+ * we use TCP_CORK where apropriate, though */
+ drbd_tcp_nodelay(sock);
+ drbd_tcp_nodelay(msock);
+
+ mdev->data.socket = sock;
+ mdev->meta.socket = msock;
+ mdev->last_received = jiffies;
+
+ D_ASSERT(mdev->asender.task == NULL);
+
+ h = drbd_do_handshake(mdev);
+ if (h <= 0)
+ return h;
+
+ if (mdev->cram_hmac_tfm) {
+ /* drbd_request_state(mdev, NS(conn, WFAuth)); */
+ if (!drbd_do_auth(mdev)) {
+ dev_err(DEV, "Authentication of peer failed\n");
+ return -1;
+ }
+ }
+
+ if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
+ return 0;
+
+ sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+ sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ atomic_set(&mdev->packet_seq, 0);
+ mdev->peer_seq = 0;
+
+ drbd_thread_start(&mdev->asender);
+
+ drbd_send_protocol(mdev);
+ drbd_send_sync_param(mdev, &mdev->sync_conf);
+ drbd_send_sizes(mdev, 0);
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev);
+ clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+ clear_bit(RESIZE_PENDING, &mdev->flags);
+
+ return 1;
+
+out_release_sockets:
+ if (sock)
+ sock_release(sock);
+ if (msock)
+ sock_release(msock);
+ return -1;
+}
+
+static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+{
+ int r;
+
+ r = drbd_recv(mdev, h, sizeof(*h));
+
+ if (unlikely(r != sizeof(*h))) {
+ dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
+ return FALSE;
+ };
+ h->command = be16_to_cpu(h->command);
+ h->length = be16_to_cpu(h->length);
+ if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+ dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
+ (long)be32_to_cpu(h->magic),
+ h->command, h->length);
+ return FALSE;
+ }
+ mdev->last_received = jiffies;
+
+ return TRUE;
+}
+
+static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+ int rv;
+
+ if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
+ rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+ if (rv) {
+ dev_err(DEV, "local disk flush failed with status %d\n", rv);
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ drbd_bump_write_ordering(mdev, WO_drain_io);
+ }
+ put_ldev(mdev);
+ }
+
+ return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+}
+
+static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct flush_work *fw = (struct flush_work *)w;
+ struct drbd_epoch *epoch = fw->epoch;
+
+ kfree(w);
+
+ if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
+ drbd_flush_after_epoch(mdev, epoch);
+
+ drbd_may_finish_epoch(mdev, epoch, EV_PUT |
+ (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
+
+ return 1;
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @mdev: DRBD device.
+ * @epoch: Epoch object.
+ * @ev: Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+ struct drbd_epoch *epoch,
+ enum epoch_event ev)
+{
+ int finish, epoch_size;
+ struct drbd_epoch *next_epoch;
+ int schedule_flush = 0;
+ enum finish_epoch rv = FE_STILL_LIVE;
+
+ spin_lock(&mdev->epoch_lock);
+ do {
+ next_epoch = NULL;
+ finish = 0;
+
+ epoch_size = atomic_read(&epoch->epoch_size);
+
+ switch (ev & ~EV_CLEANUP) {
+ case EV_PUT:
+ atomic_dec(&epoch->active);
+ break;
+ case EV_GOT_BARRIER_NR:
+ set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+
+ /* Special case: If we just switched from WO_bio_barrier to
+ WO_bdev_flush we should not finish the current epoch */
+ if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
+ mdev->write_ordering != WO_bio_barrier &&
+ epoch == mdev->current_epoch)
+ clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
+ break;
+ case EV_BARRIER_DONE:
+ set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
+ break;
+ case EV_BECAME_LAST:
+ /* nothing to do*/
+ break;
+ }
+
+ if (epoch_size != 0 &&
+ atomic_read(&epoch->active) == 0 &&
+ test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
+ epoch->list.prev == &mdev->current_epoch->list &&
+ !test_bit(DE_IS_FINISHING, &epoch->flags)) {
+ /* Nearly all conditions are met to finish that epoch... */
+ if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
+ mdev->write_ordering == WO_none ||
+ (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
+ ev & EV_CLEANUP) {
+ finish = 1;
+ set_bit(DE_IS_FINISHING, &epoch->flags);
+ } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
+ mdev->write_ordering == WO_bio_barrier) {
+ atomic_inc(&epoch->active);
+ schedule_flush = 1;
+ }
+ }
+ if (finish) {
+ if (!(ev & EV_CLEANUP)) {
+ spin_unlock(&mdev->epoch_lock);
+ drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
+ spin_lock(&mdev->epoch_lock);
+ }
+ dec_unacked(mdev);
+
+ if (mdev->current_epoch != epoch) {
+ next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+ list_del(&epoch->list);
+ ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+ mdev->epochs--;
+ kfree(epoch);
+
+ if (rv == FE_STILL_LIVE)
+ rv = FE_DESTROYED;
+ } else {
+ epoch->flags = 0;
+ atomic_set(&epoch->epoch_size, 0);
+ /* atomic_set(&epoch->active, 0); is alrady zero */
+ if (rv == FE_STILL_LIVE)
+ rv = FE_RECYCLED;
+ }
+ }
+
+ if (!next_epoch)
+ break;
+
+ epoch = next_epoch;
+ } while (1);
+
+ spin_unlock(&mdev->epoch_lock);
+
+ if (schedule_flush) {
+ struct flush_work *fw;
+ fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
+ if (fw) {
+ fw->w.cb = w_flush;
+ fw->epoch = epoch;
+ drbd_queue_work(&mdev->data.work, &fw->w);
+ } else {
+ dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
+ set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+ /* That is not a recursion, only one level */
+ drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+ drbd_may_finish_epoch(mdev, epoch, EV_PUT);
+ }
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to an other write ordering method
+ * @mdev: DRBD device.
+ * @wo: Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+{
+ enum write_ordering_e pwo;
+ static char *write_ordering_str[] = {
+ [WO_none] = "none",
+ [WO_drain_io] = "drain",
+ [WO_bdev_flush] = "flush",
+ [WO_bio_barrier] = "barrier",
+ };
+
+ pwo = mdev->write_ordering;
+ wo = min(pwo, wo);
+ if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
+ wo = WO_bdev_flush;
+ if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
+ wo = WO_drain_io;
+ if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
+ wo = WO_none;
+ mdev->write_ordering = wo;
+ if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+ dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+}
+
+/**
+ * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
+ * @mdev: DRBD device.
+ * @w: work object.
+ * @cancel: The connection will be closed anyways (unused in this callback)
+ */
+int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
+{
+ struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+ struct bio *bio = e->private_bio;
+
+ /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
+ (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
+ so that we can finish that epoch in drbd_may_finish_epoch().
+ That is necessary if we already have a long chain of Epochs, before
+ we realize that BIO_RW_BARRIER is actually not supported */
+
+ /* As long as the -ENOTSUPP on the barrier is reported immediately
+ that will never trigger. If it is reported late, we will just
+ print that warning and continue correctly for all future requests
+ with WO_bdev_flush */
+ if (previous_epoch(mdev, e->epoch))
+ dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
+
+ /* prepare bio for re-submit,
+ * re-init volatile members */
+ /* we still have a local reference,
+ * get_ldev was done in receive_Data. */
+ bio->bi_bdev = mdev->ldev->backing_bdev;
+ bio->bi_sector = e->sector;
+ bio->bi_size = e->size;
+ bio->bi_idx = 0;
+
+ bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ bio->bi_flags |= 1 << BIO_UPTODATE;
+
+ /* don't know whether this is necessary: */
+ bio->bi_phys_segments = 0;
+ bio->bi_next = NULL;
+
+ /* these should be unchanged: */
+ /* bio->bi_end_io = drbd_endio_write_sec; */
+ /* bio->bi_vcnt = whatever; */
+
+ e->w.cb = e_end_block;
+
+ /* This is no longer a barrier request. */
+ bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
+
+ drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
+
+ return 1;
+}
+
+static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+{
+ int rv, issue_flush;
+ struct p_barrier *p = (struct p_barrier *)h;
+ struct drbd_epoch *epoch;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+
+ rv = drbd_recv(mdev, h->payload, h->length);
+ ERR_IF(rv != h->length) return FALSE;
+
+ inc_unacked(mdev);
+
+ if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
+ drbd_kick_lo(mdev);
+
+ mdev->current_epoch->barrier_nr = p->barrier;
+ rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+
+ /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+ * the activity log, which means it would not be resynced in case the
+ * R_PRIMARY crashes now.
+ * Therefore we must send the barrier_ack after the barrier request was
+ * completed. */
+ switch (mdev->write_ordering) {
+ case WO_bio_barrier:
+ case WO_none:
+ if (rv == FE_RECYCLED)
+ return TRUE;
+ break;
+
+ case WO_bdev_flush:
+ case WO_drain_io:
+ D_ASSERT(rv == FE_STILL_LIVE);
+ set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
+ drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+ rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+ if (rv == FE_RECYCLED)
+ return TRUE;
+
+ /* The asender will send all the ACKs and barrier ACKs out, since
+ all EEs moved from the active_ee to the done_ee. We need to
+ provide a new epoch object for the EEs that come in soon */
+ break;
+ }
+
+ /* receiver context, in the writeout path of the other node.
+ * avoid potential distributed deadlock */
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (!epoch) {
+ dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+ issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+ drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+ if (issue_flush) {
+ rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+ if (rv == FE_RECYCLED)
+ return TRUE;
+ }
+
+ drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+
+ return TRUE;
+ }
+
+ epoch->flags = 0;
+ atomic_set(&epoch->epoch_size, 0);
+ atomic_set(&epoch->active, 0);
+
+ spin_lock(&mdev->epoch_lock);
+ if (atomic_read(&mdev->current_epoch->epoch_size)) {
+ list_add(&epoch->list, &mdev->current_epoch->list);
+ mdev->current_epoch = epoch;
+ mdev->epochs++;
+ } else {
+ /* The current_epoch got recycled while we allocated this one... */
+ kfree(epoch);
+ }
+ spin_unlock(&mdev->epoch_lock);
+
+ return TRUE;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data */
+static struct drbd_epoch_entry *
+read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+{
+ struct drbd_epoch_entry *e;
+ struct bio_vec *bvec;
+ struct page *page;
+ struct bio *bio;
+ int dgs, ds, i, rr;
+ void *dig_in = mdev->int_dig_in;
+ void *dig_vv = mdev->int_dig_vv;
+
+ dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+ crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+ if (dgs) {
+ rr = drbd_recv(mdev, dig_in, dgs);
+ if (rr != dgs) {
+ dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
+ rr, dgs);
+ return NULL;
+ }
+ }
+
+ data_size -= dgs;
+
+ ERR_IF(data_size & 0x1ff) return NULL;
+ ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
+
+ /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
+ if (!e)
+ return NULL;
+ bio = e->private_bio;
+ ds = data_size;
+ bio_for_each_segment(bvec, bio, i) {
+ page = bvec->bv_page;
+ rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
+ kunmap(page);
+ if (rr != min_t(int, ds, PAGE_SIZE)) {
+ drbd_free_ee(mdev, e);
+ dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+ rr, min_t(int, ds, PAGE_SIZE));
+ return NULL;
+ }
+ ds -= rr;
+ }
+
+ if (dgs) {
+ drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ if (memcmp(dig_in, dig_vv, dgs)) {
+ dev_err(DEV, "Digest integrity check FAILED.\n");
+ drbd_bcast_ee(mdev, "digest failed",
+ dgs, dig_in, dig_vv, e);
+ drbd_free_ee(mdev, e);
+ return NULL;
+ }
+ }
+ mdev->recv_cnt += data_size>>9;
+ return e;
+}
+
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
+{
+ struct page *page;
+ int rr, rv = 1;
+ void *data;
+
+ page = drbd_pp_alloc(mdev, 1);
+
+ data = kmap(page);
+ while (data_size) {
+ rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
+ if (rr != min_t(int, data_size, PAGE_SIZE)) {
+ rv = 0;
+ dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+ rr, min_t(int, data_size, PAGE_SIZE));
+ break;
+ }
+ data_size -= rr;
+ }
+ kunmap(page);
+ drbd_pp_free(mdev, page);
+ return rv;
+}
+
+static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
+ sector_t sector, int data_size)
+{
+ struct bio_vec *bvec;
+ struct bio *bio;
+ int dgs, rr, i, expect;
+ void *dig_in = mdev->int_dig_in;
+ void *dig_vv = mdev->int_dig_vv;
+
+ dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+ crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+ if (dgs) {
+ rr = drbd_recv(mdev, dig_in, dgs);
+ if (rr != dgs) {
+ dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
+ rr, dgs);
+ return 0;
+ }
+ }
+
+ data_size -= dgs;
+
+ /* optimistically update recv_cnt. if receiving fails below,
+ * we disconnect anyways, and counters will be reset. */
+ mdev->recv_cnt += data_size>>9;
+
+ bio = req->master_bio;
+ D_ASSERT(sector == bio->bi_sector);
+
+ bio_for_each_segment(bvec, bio, i) {
+ expect = min_t(int, data_size, bvec->bv_len);
+ rr = drbd_recv(mdev,
+ kmap(bvec->bv_page)+bvec->bv_offset,
+ expect);
+ kunmap(bvec->bv_page);
+ if (rr != expect) {
+ dev_warn(DEV, "short read receiving data reply: "
+ "read %d expected %d\n",
+ rr, expect);
+ return 0;
+ }
+ data_size -= rr;
+ }
+
+ if (dgs) {
+ drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ if (memcmp(dig_in, dig_vv, dgs)) {
+ dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
+ return 0;
+ }
+ }
+
+ D_ASSERT(data_size == 0);
+ return 1;
+}
+
+/* e_end_resync_block() is called via
+ * drbd_process_done_ee() by asender only */
+static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+ sector_t sector = e->sector;
+ int ok;
+
+ D_ASSERT(hlist_unhashed(&e->colision));
+
+ if (likely(drbd_bio_uptodate(e->private_bio))) {
+ drbd_set_in_sync(mdev, sector, e->size);
+ ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+ } else {
+ /* Record failure to sync */
+ drbd_rs_failed_io(mdev, sector, e->size);
+
+ ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ }
+ dec_unacked(mdev);
+
+ return ok;
+}
+
+static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
+{
+ struct drbd_epoch_entry *e;
+
+ e = read_in_block(mdev, ID_SYNCER, sector, data_size);
+ if (!e) {
+ put_ldev(mdev);
+ return FALSE;
+ }
+
+ dec_rs_pending(mdev);
+
+ e->private_bio->bi_end_io = drbd_endio_write_sec;
+ e->private_bio->bi_rw = WRITE;
+ e->w.cb = e_end_resync_block;
+
+ inc_unacked(mdev);
+ /* corresponding dec_unacked() in e_end_resync_block()
+ * respective _drbd_clear_done_ee */
+
+ spin_lock_irq(&mdev->req_lock);
+ list_add(&e->w.list, &mdev->sync_ee);
+ spin_unlock_irq(&mdev->req_lock);
+
+ drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
+ /* accounting done in endio */
+
+ maybe_kick_lo(mdev);
+ return TRUE;
+}
+
+static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct drbd_request *req;
+ sector_t sector;
+ unsigned int header_size, data_size;
+ int ok;
+ struct p_data *p = (struct p_data *)h;
+
+ header_size = sizeof(*p) - sizeof(*h);
+ data_size = h->length - header_size;
+
+ ERR_IF(data_size == 0) return FALSE;
+
+ if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ return FALSE;
+
+ sector = be64_to_cpu(p->sector);
+
+ spin_lock_irq(&mdev->req_lock);
+ req = _ar_id_to_req(mdev, p->block_id, sector);
+ spin_unlock_irq(&mdev->req_lock);
+ if (unlikely(!req)) {
+ dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
+ return FALSE;
+ }
+
+ /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
+ * special casing it there for the various failure cases.
+ * still no race with drbd_fail_pending_reads */
+ ok = recv_dless_read(mdev, req, sector, data_size);
+
+ if (ok)
+ req_mod(req, data_received);
+ /* else: nothing. handled from drbd_disconnect...
+ * I don't think we may complete this just yet
+ * in case we are "on-disconnect: freeze" */
+
+ return ok;
+}
+
+static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+{
+ sector_t sector;
+ unsigned int header_size, data_size;
+ int ok;
+ struct p_data *p = (struct p_data *)h;
+
+ header_size = sizeof(*p) - sizeof(*h);
+ data_size = h->length - header_size;
+
+ ERR_IF(data_size == 0) return FALSE;
+
+ if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ return FALSE;
+
+ sector = be64_to_cpu(p->sector);
+ D_ASSERT(p->block_id == ID_SYNCER);
+
+ if (get_ldev(mdev)) {
+ /* data is submitted to disk within recv_resync_read.
+ * corresponding put_ldev done below on error,
+ * or in drbd_endio_write_sec. */
+ ok = recv_resync_read(mdev, sector, data_size);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "Can not write resync data to local disk.\n");
+
+ ok = drbd_drain_block(mdev, data_size);
+
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ }
+
+ return ok;
+}
+
+/* e_end_block() is called via drbd_process_done_ee().
+ * this means this function only runs in the asender thread
+ */
+static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+ sector_t sector = e->sector;
+ struct drbd_epoch *epoch;
+ int ok = 1, pcmd;
+
+ if (e->flags & EE_IS_BARRIER) {
+ epoch = previous_epoch(mdev, e->epoch);
+ if (epoch)
+ drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
+ }
+
+ if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
+ if (likely(drbd_bio_uptodate(e->private_bio))) {
+ pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
+ mdev->state.conn <= C_PAUSED_SYNC_T &&
+ e->flags & EE_MAY_SET_IN_SYNC) ?
+ P_RS_WRITE_ACK : P_WRITE_ACK;
+ ok &= drbd_send_ack(mdev, pcmd, e);
+ if (pcmd == P_RS_WRITE_ACK)
+ drbd_set_in_sync(mdev, sector, e->size);
+ } else {
+ ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ /* we expect it to be marked out of sync anyways...
+ * maybe assert this? */
+ }
+ dec_unacked(mdev);
+ }
+ /* we delete from the conflict detection hash _after_ we sent out the
+ * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
+ if (mdev->net_conf->two_primaries) {
+ spin_lock_irq(&mdev->req_lock);
+ D_ASSERT(!hlist_unhashed(&e->colision));
+ hlist_del_init(&e->colision);
+ spin_unlock_irq(&mdev->req_lock);
+ } else {
+ D_ASSERT(hlist_unhashed(&e->colision));
+ }
+
+ drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+ return ok;
+}
+
+static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+ int ok = 1;
+
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+
+ spin_lock_irq(&mdev->req_lock);
+ D_ASSERT(!hlist_unhashed(&e->colision));
+ hlist_del_init(&e->colision);
+ spin_unlock_irq(&mdev->req_lock);
+
+ dec_unacked(mdev);
+
+ return ok;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than mdev->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update mdev->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+{
+ DEFINE_WAIT(wait);
+ unsigned int p_seq;
+ long timeout;
+ int ret = 0;
+ spin_lock(&mdev->peer_seq_lock);
+ for (;;) {
+ prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
+ if (seq_le(packet_seq, mdev->peer_seq+1))
+ break;
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ p_seq = mdev->peer_seq;
+ spin_unlock(&mdev->peer_seq_lock);
+ timeout = schedule_timeout(30*HZ);
+ spin_lock(&mdev->peer_seq_lock);
+ if (timeout == 0 && p_seq == mdev->peer_seq) {
+ ret = -ETIMEDOUT;
+ dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+ break;
+ }
+ }
+ finish_wait(&mdev->seq_wait, &wait);
+ if (mdev->peer_seq+1 == packet_seq)
+ mdev->peer_seq++;
+ spin_unlock(&mdev->peer_seq_lock);
+ return ret;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+{
+ sector_t sector;
+ struct drbd_epoch_entry *e;
+ struct p_data *p = (struct p_data *)h;
+ int header_size, data_size;
+ int rw = WRITE;
+ u32 dp_flags;
+
+ header_size = sizeof(*p) - sizeof(*h);
+ data_size = h->length - header_size;
+
+ ERR_IF(data_size == 0) return FALSE;
+
+ if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ return FALSE;
+
+ if (!get_ldev(mdev)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "Can not write mirrored data block "
+ "to local disk.\n");
+ spin_lock(&mdev->peer_seq_lock);
+ if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
+ mdev->peer_seq++;
+ spin_unlock(&mdev->peer_seq_lock);
+
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ atomic_inc(&mdev->current_epoch->epoch_size);
+ return drbd_drain_block(mdev, data_size);
+ }
+
+ /* get_ldev(mdev) successful.
+ * Corresponding put_ldev done either below (on various errors),
+ * or in drbd_endio_write_sec, if we successfully submit the data at
+ * the end of this function. */
+
+ sector = be64_to_cpu(p->sector);
+ e = read_in_block(mdev, p->block_id, sector, data_size);
+ if (!e) {
+ put_ldev(mdev);
+ return FALSE;
+ }
+
+ e->private_bio->bi_end_io = drbd_endio_write_sec;
+ e->w.cb = e_end_block;
+
+ spin_lock(&mdev->epoch_lock);
+ e->epoch = mdev->current_epoch;
+ atomic_inc(&e->epoch->epoch_size);
+ atomic_inc(&e->epoch->active);
+
+ if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
+ struct drbd_epoch *epoch;
+ /* Issue a barrier if we start a new epoch, and the previous epoch
+ was not a epoch containing a single request which already was
+ a Barrier. */
+ epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
+ if (epoch == e->epoch) {
+ set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+ rw |= (1<<BIO_RW_BARRIER);
+ e->flags |= EE_IS_BARRIER;
+ } else {
+ if (atomic_read(&epoch->epoch_size) > 1 ||
+ !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
+ set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+ set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+ rw |= (1<<BIO_RW_BARRIER);
+ e->flags |= EE_IS_BARRIER;
+ }
+ }
+ }
+ spin_unlock(&mdev->epoch_lock);
+
+ dp_flags = be32_to_cpu(p->dp_flags);
+ if (dp_flags & DP_HARDBARRIER) {
+ dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
+ /* rw |= (1<<BIO_RW_BARRIER); */
+ }
+ if (dp_flags & DP_RW_SYNC)
+ rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
+ if (dp_flags & DP_MAY_SET_IN_SYNC)
+ e->flags |= EE_MAY_SET_IN_SYNC;
+
+ /* I'm the receiver, I do hold a net_cnt reference. */
+ if (!mdev->net_conf->two_primaries) {
+ spin_lock_irq(&mdev->req_lock);
+ } else {
+ /* don't get the req_lock yet,
+ * we may sleep in drbd_wait_peer_seq */
+ const int size = e->size;
+ const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+ DEFINE_WAIT(wait);
+ struct drbd_request *i;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+ int first;
+
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ BUG_ON(mdev->ee_hash == NULL);
+ BUG_ON(mdev->tl_hash == NULL);
+
+ /* conflict detection and handling:
+ * 1. wait on the sequence number,
+ * in case this data packet overtook ACK packets.
+ * 2. check our hash tables for conflicting requests.
+ * we only need to walk the tl_hash, since an ee can not
+ * have a conflict with an other ee: on the submitting
+ * node, the corresponding req had already been conflicting,
+ * and a conflicting req is never sent.
+ *
+ * Note: for two_primaries, we are protocol C,
+ * so there cannot be any request that is DONE
+ * but still on the transfer log.
+ *
+ * unconditionally add to the ee_hash.
+ *
+ * if no conflicting request is found:
+ * submit.
+ *
+ * if any conflicting request is found
+ * that has not yet been acked,
+ * AND I have the "discard concurrent writes" flag:
+ * queue (via done_ee) the P_DISCARD_ACK; OUT.
+ *
+ * if any conflicting request is found:
+ * block the receiver, waiting on misc_wait
+ * until no more conflicting requests are there,
+ * or we get interrupted (disconnect).
+ *
+ * we do not just write after local io completion of those
+ * requests, but only after req is done completely, i.e.
+ * we wait for the P_DISCARD_ACK to arrive!
+ *
+ * then proceed normally, i.e. submit.
+ */
+ if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+ goto out_interrupted;
+
+ spin_lock_irq(&mdev->req_lock);
+
+ hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+ slot = tl_hash_slot(mdev, sector);
+ first = 1;
+ for (;;) {
+ int have_unacked = 0;
+ int have_conflict = 0;
+ prepare_to_wait(&mdev->misc_wait, &wait,
+ TASK_INTERRUPTIBLE);
+ hlist_for_each_entry(i, n, slot, colision) {
+ if (OVERLAPS) {
+ /* only ALERT on first iteration,
+ * we may be woken up early... */
+ if (first)
+ dev_alert(DEV, "%s[%u] Concurrent local write detected!"
+ " new: %llus +%u; pending: %llus +%u\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ (unsigned long long)i->sector, i->size);
+ if (i->rq_state & RQ_NET_PENDING)
+ ++have_unacked;
+ ++have_conflict;
+ }
+ }
+#undef OVERLAPS
+ if (!have_conflict)
+ break;
+
+ /* Discard Ack only for the _first_ iteration */
+ if (first && discard && have_unacked) {
+ dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
+ (unsigned long long)sector);
+ inc_unacked(mdev);
+ e->w.cb = e_send_discard_ack;
+ list_add_tail(&e->w.list, &mdev->done_ee);
+
+ spin_unlock_irq(&mdev->req_lock);
+
+ /* we could probably send that P_DISCARD_ACK ourselves,
+ * but I don't like the receiver using the msock */
+
+ put_ldev(mdev);
+ wake_asender(mdev);
+ finish_wait(&mdev->misc_wait, &wait);
+ return TRUE;
+ }
+
+ if (signal_pending(current)) {
+ hlist_del_init(&e->colision);
+
+ spin_unlock_irq(&mdev->req_lock);
+
+ finish_wait(&mdev->misc_wait, &wait);
+ goto out_interrupted;
+ }
+
+ spin_unlock_irq(&mdev->req_lock);
+ if (first) {
+ first = 0;
+ dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
+ "sec=%llus\n", (unsigned long long)sector);
+ } else if (discard) {
+ /* we had none on the first iteration.
+ * there must be none now. */
+ D_ASSERT(have_unacked == 0);
+ }
+ schedule();
+ spin_lock_irq(&mdev->req_lock);
+ }
+ finish_wait(&mdev->misc_wait, &wait);
+ }
+
+ list_add(&e->w.list, &mdev->active_ee);
+ spin_unlock_irq(&mdev->req_lock);
+
+ switch (mdev->net_conf->wire_protocol) {
+ case DRBD_PROT_C:
+ inc_unacked(mdev);
+ /* corresponding dec_unacked() in e_end_block()
+ * respective _drbd_clear_done_ee */
+ break;
+ case DRBD_PROT_B:
+ /* I really don't like it that the receiver thread
+ * sends on the msock, but anyways */
+ drbd_send_ack(mdev, P_RECV_ACK, e);
+ break;
+ case DRBD_PROT_A:
+ /* nothing to do */
+ break;
+ }
+
+ if (mdev->state.pdsk == D_DISKLESS) {
+ /* In case we have the only disk of the cluster, */
+ drbd_set_out_of_sync(mdev, e->sector, e->size);
+ e->flags |= EE_CALL_AL_COMPLETE_IO;
+ drbd_al_begin_io(mdev, e->sector);
+ }
+
+ e->private_bio->bi_rw = rw;
+ drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
+ /* accounting done in endio */
+
+ maybe_kick_lo(mdev);
+ return TRUE;
+
+out_interrupted:
+ /* yes, the epoch_size now is imbalanced.
+ * but we drop the connection anyways, so we don't have a chance to
+ * receive a barrier... atomic_inc(&mdev->epoch_size); */
+ put_ldev(mdev);
+ drbd_free_ee(mdev, e);
+ return FALSE;
+}
+
+static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+{
+ sector_t sector;
+ const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+ struct drbd_epoch_entry *e;
+ struct digest_info *di = NULL;
+ int size, digest_size;
+ unsigned int fault_type;
+ struct p_block_req *p =
+ (struct p_block_req *)h;
+ const int brps = sizeof(*p)-sizeof(*h);
+
+ if (drbd_recv(mdev, h->payload, brps) != brps)
+ return FALSE;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ (unsigned long long)sector, size);
+ return FALSE;
+ }
+ if (sector + (size>>9) > capacity) {
+ dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ (unsigned long long)sector, size);
+ return FALSE;
+ }
+
+ if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "Can not satisfy peer's read request, "
+ "no local data.\n");
+ drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
+ P_NEG_RS_DREPLY , p);
+ return TRUE;
+ }
+
+ /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
+ if (!e) {
+ put_ldev(mdev);
+ return FALSE;
+ }
+
+ e->private_bio->bi_rw = READ;
+ e->private_bio->bi_end_io = drbd_endio_read_sec;
+
+ switch (h->command) {
+ case P_DATA_REQUEST:
+ e->w.cb = w_e_end_data_req;
+ fault_type = DRBD_FAULT_DT_RD;
+ break;
+ case P_RS_DATA_REQUEST:
+ e->w.cb = w_e_end_rsdata_req;
+ fault_type = DRBD_FAULT_RS_RD;
+ /* Eventually this should become asynchronously. Currently it
+ * blocks the whole receiver just to delay the reading of a
+ * resync data block.
+ * the drbd_work_queue mechanism is made for this...
+ */
+ if (!drbd_rs_begin_io(mdev, sector)) {
+ /* we have been interrupted,
+ * probably connection lost! */
+ D_ASSERT(signal_pending(current));
+ goto out_free_e;
+ }
+ break;
+
+ case P_OV_REPLY:
+ case P_CSUM_RS_REQUEST:
+ fault_type = DRBD_FAULT_RS_RD;
+ digest_size = h->length - brps ;
+ di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+ if (!di)
+ goto out_free_e;
+
+ di->digest_size = digest_size;
+ di->digest = (((char *)di)+sizeof(struct digest_info));
+
+ if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+ goto out_free_e;
+
+ e->block_id = (u64)(unsigned long)di;
+ if (h->command == P_CSUM_RS_REQUEST) {
+ D_ASSERT(mdev->agreed_pro_version >= 89);
+ e->w.cb = w_e_end_csum_rs_req;
+ } else if (h->command == P_OV_REPLY) {
+ e->w.cb = w_e_end_ov_reply;
+ dec_rs_pending(mdev);
+ break;
+ }
+
+ if (!drbd_rs_begin_io(mdev, sector)) {
+ /* we have been interrupted, probably connection lost! */
+ D_ASSERT(signal_pending(current));
+ goto out_free_e;
+ }
+ break;
+
+ case P_OV_REQUEST:
+ if (mdev->state.conn >= C_CONNECTED &&
+ mdev->state.conn != C_VERIFY_T)
+ dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
+ drbd_conn_str(mdev->state.conn));
+ if (mdev->ov_start_sector == ~(sector_t)0 &&
+ mdev->agreed_pro_version >= 90) {
+ mdev->ov_start_sector = sector;
+ mdev->ov_position = sector;
+ mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
+ dev_info(DEV, "Online Verify start sector: %llu\n",
+ (unsigned long long)sector);
+ }
+ e->w.cb = w_e_end_ov_req;
+ fault_type = DRBD_FAULT_RS_RD;
+ /* Eventually this should become asynchronous. Currently it
+ * blocks the whole receiver just to delay the reading of a
+ * resync data block.
+ * the drbd_work_queue mechanism is made for this...
+ */
+ if (!drbd_rs_begin_io(mdev, sector)) {
+ /* we have been interrupted,
+ * probably connection lost! */
+ D_ASSERT(signal_pending(current));
+ goto out_free_e;
+ }
+ break;
+
+
+ default:
+ dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+ cmdname(h->command));
+ fault_type = DRBD_FAULT_MAX;
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ list_add(&e->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->req_lock);
+
+ inc_unacked(mdev);
+
+ drbd_generic_make_request(mdev, fault_type, e->private_bio);
+ maybe_kick_lo(mdev);
+
+ return TRUE;
+
+out_free_e:
+ kfree(di);
+ put_ldev(mdev);
+ drbd_free_ee(mdev, e);
+ return FALSE;
+}
+
+static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
+{
+ int self, peer, rv = -100;
+ unsigned long ch_self, ch_peer;
+
+ self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+ peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+ ch_peer = mdev->p_uuid[UI_SIZE];
+ ch_self = mdev->comm_bm_set;
+
+ switch (mdev->net_conf->after_sb_0p) {
+ case ASB_CONSENSUS:
+ case ASB_DISCARD_SECONDARY:
+ case ASB_CALL_HELPER:
+ dev_err(DEV, "Configuration error.\n");
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_DISCARD_YOUNGER_PRI:
+ if (self == 0 && peer == 1) {
+ rv = -1;
+ break;
+ }
+ if (self == 1 && peer == 0) {
+ rv = 1;
+ break;
+ }
+ /* Else fall through to one of the other strategies... */
+ case ASB_DISCARD_OLDER_PRI:
+ if (self == 0 && peer == 1) {
+ rv = 1;
+ break;
+ }
+ if (self == 1 && peer == 0) {
+ rv = -1;
+ break;
+ }
+ /* Else fall through to one of the other strategies... */
+ dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
+ "Using discard-least-changes instead\n");
+ case ASB_DISCARD_ZERO_CHG:
+ if (ch_peer == 0 && ch_self == 0) {
+ rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+ ? -1 : 1;
+ break;
+ } else {
+ if (ch_peer == 0) { rv = 1; break; }
+ if (ch_self == 0) { rv = -1; break; }
+ }
+ if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+ break;
+ case ASB_DISCARD_LEAST_CHG:
+ if (ch_self < ch_peer)
+ rv = -1;
+ else if (ch_self > ch_peer)
+ rv = 1;
+ else /* ( ch_self == ch_peer ) */
+ /* Well, then use something else. */
+ rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+ ? -1 : 1;
+ break;
+ case ASB_DISCARD_LOCAL:
+ rv = -1;
+ break;
+ case ASB_DISCARD_REMOTE:
+ rv = 1;
+ }
+
+ return rv;
+}
+
+static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
+{
+ int self, peer, hg, rv = -100;
+
+ self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+ peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+ switch (mdev->net_conf->after_sb_1p) {
+ case ASB_DISCARD_YOUNGER_PRI:
+ case ASB_DISCARD_OLDER_PRI:
+ case ASB_DISCARD_LEAST_CHG:
+ case ASB_DISCARD_LOCAL:
+ case ASB_DISCARD_REMOTE:
+ dev_err(DEV, "Configuration error.\n");
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_CONSENSUS:
+ hg = drbd_asb_recover_0p(mdev);
+ if (hg == -1 && mdev->state.role == R_SECONDARY)
+ rv = hg;
+ if (hg == 1 && mdev->state.role == R_PRIMARY)
+ rv = hg;
+ break;
+ case ASB_VIOLENTLY:
+ rv = drbd_asb_recover_0p(mdev);
+ break;
+ case ASB_DISCARD_SECONDARY:
+ return mdev->state.role == R_PRIMARY ? 1 : -1;
+ case ASB_CALL_HELPER:
+ hg = drbd_asb_recover_0p(mdev);
+ if (hg == -1 && mdev->state.role == R_PRIMARY) {
+ self = drbd_set_role(mdev, R_SECONDARY, 0);
+ /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+ * we might be here in C_WF_REPORT_PARAMS which is transient.
+ * we do not need to wait for the after state change work either. */
+ self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (self != SS_SUCCESS) {
+ drbd_khelper(mdev, "pri-lost-after-sb");
+ } else {
+ dev_warn(DEV, "Successfully gave up primary role.\n");
+ rv = hg;
+ }
+ } else
+ rv = hg;
+ }
+
+ return rv;
+}
+
+static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
+{
+ int self, peer, hg, rv = -100;
+
+ self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+ peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+ switch (mdev->net_conf->after_sb_2p) {
+ case ASB_DISCARD_YOUNGER_PRI:
+ case ASB_DISCARD_OLDER_PRI:
+ case ASB_DISCARD_LEAST_CHG:
+ case ASB_DISCARD_LOCAL:
+ case ASB_DISCARD_REMOTE:
+ case ASB_CONSENSUS:
+ case ASB_DISCARD_SECONDARY:
+ dev_err(DEV, "Configuration error.\n");
+ break;
+ case ASB_VIOLENTLY:
+ rv = drbd_asb_recover_0p(mdev);
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_CALL_HELPER:
+ hg = drbd_asb_recover_0p(mdev);
+ if (hg == -1) {
+ /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+ * we might be here in C_WF_REPORT_PARAMS which is transient.
+ * we do not need to wait for the after state change work either. */
+ self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (self != SS_SUCCESS) {
+ drbd_khelper(mdev, "pri-lost-after-sb");
+ } else {
+ dev_warn(DEV, "Successfully gave up primary role.\n");
+ rv = hg;
+ }
+ } else
+ rv = hg;
+ }
+
+ return rv;
+}
+
+static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
+ u64 bits, u64 flags)
+{
+ if (!uuid) {
+ dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
+ return;
+ }
+ dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+ text,
+ (unsigned long long)uuid[UI_CURRENT],
+ (unsigned long long)uuid[UI_BITMAP],
+ (unsigned long long)uuid[UI_HISTORY_START],
+ (unsigned long long)uuid[UI_HISTORY_END],
+ (unsigned long long)bits,
+ (unsigned long long)flags);
+}
+
+/*
+ 100 after split brain try auto recover
+ 2 C_SYNC_SOURCE set BitMap
+ 1 C_SYNC_SOURCE use BitMap
+ 0 no Sync
+ -1 C_SYNC_TARGET use BitMap
+ -2 C_SYNC_TARGET set BitMap
+ -100 after split brain, disconnect
+-1000 unrelated data
+ */
+static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
+{
+ u64 self, peer;
+ int i, j;
+
+ self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+
+ *rule_nr = 10;
+ if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+ return 0;
+
+ *rule_nr = 20;
+ if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+ peer != UUID_JUST_CREATED)
+ return -2;
+
+ *rule_nr = 30;
+ if (self != UUID_JUST_CREATED &&
+ (peer == UUID_JUST_CREATED || peer == (u64)0))
+ return 2;
+
+ if (self == peer) {
+ int rct, dc; /* roles at crash time */
+
+ if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+ if (mdev->agreed_pro_version < 91)
+ return -1001;
+
+ if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+ (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+ dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
+ drbd_uuid_set_bm(mdev, 0UL);
+
+ drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+ mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+ *rule_nr = 34;
+ } else {
+ dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
+ *rule_nr = 36;
+ }
+
+ return 1;
+ }
+
+ if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
+
+ if (mdev->agreed_pro_version < 91)
+ return -1001;
+
+ if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+ (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+ dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+ mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
+ mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
+ mdev->p_uuid[UI_BITMAP] = 0UL;
+
+ drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+ *rule_nr = 35;
+ } else {
+ dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
+ *rule_nr = 37;
+ }
+
+ return -1;
+ }
+
+ /* Common power [off|failure] */
+ rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
+ (mdev->p_uuid[UI_FLAGS] & 2);
+ /* lowest bit is set when we were primary,
+ * next bit (weight 2) is set when peer was primary */
+ *rule_nr = 40;
+
+ switch (rct) {
+ case 0: /* !self_pri && !peer_pri */ return 0;
+ case 1: /* self_pri && !peer_pri */ return 1;
+ case 2: /* !self_pri && peer_pri */ return -1;
+ case 3: /* self_pri && peer_pri */
+ dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+ return dc ? -1 : 1;
+ }
+ }
+
+ *rule_nr = 50;
+ peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+ if (self == peer)
+ return -1;
+
+ *rule_nr = 51;
+ peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+ peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
+ if (self == peer) {
+ /* The last P_SYNC_UUID did not get though. Undo the last start of
+ resync as sync source modifications of the peer's UUIDs. */
+
+ if (mdev->agreed_pro_version < 91)
+ return -1001;
+
+ mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
+ mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
+ return -1;
+ }
+ }
+
+ *rule_nr = 60;
+ self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ peer = mdev->p_uuid[i] & ~((u64)1);
+ if (self == peer)
+ return -2;
+ }
+
+ *rule_nr = 70;
+ self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+ if (self == peer)
+ return 1;
+
+ *rule_nr = 71;
+ self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
+ peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ /* The last P_SYNC_UUID did not get though. Undo the last start of
+ resync as sync source modifications of our UUIDs. */
+
+ if (mdev->agreed_pro_version < 91)
+ return -1001;
+
+ _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
+ _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+ dev_info(DEV, "Undid last start of resync:\n");
+
+ drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+ mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+
+ return 1;
+ }
+ }
+
+
+ *rule_nr = 80;
+ peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ self = mdev->ldev->md.uuid[i] & ~((u64)1);
+ if (self == peer)
+ return 2;
+ }
+
+ *rule_nr = 90;
+ self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+ if (self == peer && self != ((u64)0))
+ return 100;
+
+ *rule_nr = 100;
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ self = mdev->ldev->md.uuid[i] & ~((u64)1);
+ for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+ peer = mdev->p_uuid[j] & ~((u64)1);
+ if (self == peer)
+ return -100;
+ }
+ }
+
+ return -1000;
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+ CONN_MASK (-1) on failure.
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
+ enum drbd_disk_state peer_disk) __must_hold(local)
+{
+ int hg, rule_nr;
+ enum drbd_conns rv = C_MASK;
+ enum drbd_disk_state mydisk;
+
+ mydisk = mdev->state.disk;
+ if (mydisk == D_NEGOTIATING)
+ mydisk = mdev->new_state_tmp.disk;
+
+ dev_info(DEV, "drbd_sync_handshake:\n");
+ drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
+ drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
+ mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+
+ hg = drbd_uuid_compare(mdev, &rule_nr);
+
+ dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+ if (hg == -1000) {
+ dev_alert(DEV, "Unrelated data, aborting!\n");
+ return C_MASK;
+ }
+ if (hg == -1001) {
+ dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
+ return C_MASK;
+ }
+
+ if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+ (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
+ int f = (hg == -100) || abs(hg) == 2;
+ hg = mydisk > D_INCONSISTENT ? 1 : -1;
+ if (f)
+ hg = hg*2;
+ dev_info(DEV, "Becoming sync %s due to disk states.\n",
+ hg > 0 ? "source" : "target");
+ }
+
+ if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+ int pcount = (mdev->state.role == R_PRIMARY)
+ + (peer_role == R_PRIMARY);
+ int forced = (hg == -100);
+
+ switch (pcount) {
+ case 0:
+ hg = drbd_asb_recover_0p(mdev);
+ break;
+ case 1:
+ hg = drbd_asb_recover_1p(mdev);
+ break;
+ case 2:
+ hg = drbd_asb_recover_2p(mdev);
+ break;
+ }
+ if (abs(hg) < 100) {
+ dev_warn(DEV, "Split-Brain detected, %d primaries, "
+ "automatically solved. Sync from %s node\n",
+ pcount, (hg < 0) ? "peer" : "this");
+ if (forced) {
+ dev_warn(DEV, "Doing a full sync, since"
+ " UUIDs where ambiguous.\n");
+ hg = hg*2;
+ }
+ }
+ }
+
+ if (hg == -100) {
+ if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+ hg = -1;
+ if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+ hg = 1;
+
+ if (abs(hg) < 100)
+ dev_warn(DEV, "Split-Brain detected, manually solved. "
+ "Sync from %s node\n",
+ (hg < 0) ? "peer" : "this");
+ }
+
+ if (hg == -100) {
+ dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
+ drbd_khelper(mdev, "split-brain");
+ return C_MASK;
+ }
+
+ if (hg > 0 && mydisk <= D_INCONSISTENT) {
+ dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
+ return C_MASK;
+ }
+
+ if (hg < 0 && /* by intention we do not use mydisk here. */
+ mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
+ switch (mdev->net_conf->rr_conflict) {
+ case ASB_CALL_HELPER:
+ drbd_khelper(mdev, "pri-lost");
+ /* fall through */
+ case ASB_DISCONNECT:
+ dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
+ return C_MASK;
+ case ASB_VIOLENTLY:
+ dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
+ "assumption\n");
+ }
+ }
+
+ if (abs(hg) >= 2) {
+ dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+ if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
+ return C_MASK;
+ }
+
+ if (hg > 0) { /* become sync source. */
+ rv = C_WF_BITMAP_S;
+ } else if (hg < 0) { /* become sync target */
+ rv = C_WF_BITMAP_T;
+ } else {
+ rv = C_CONNECTED;
+ if (drbd_bm_total_weight(mdev)) {
+ dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
+ drbd_bm_total_weight(mdev));
+ }
+ }
+
+ return rv;
+}
+
+/* returns 1 if invalid */
+static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+{
+ /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+ if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
+ (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
+ return 0;
+
+ /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+ if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
+ self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
+ return 1;
+
+ /* everything else is valid if they are equal on both sides. */
+ if (peer == self)
+ return 0;
+
+ /* everything es is invalid. */
+ return 1;
+}
+
+static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_protocol *p = (struct p_protocol *)h;
+ int header_size, data_size;
+ int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+ int p_want_lose, p_two_primaries;
+ char p_integrity_alg[SHARED_SECRET_MAX] = "";
+
+ header_size = sizeof(*p) - sizeof(*h);
+ data_size = h->length - header_size;
+
+ if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ return FALSE;
+
+ p_proto = be32_to_cpu(p->protocol);
+ p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
+ p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
+ p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
+ p_want_lose = be32_to_cpu(p->want_lose);
+ p_two_primaries = be32_to_cpu(p->two_primaries);
+
+ if (p_proto != mdev->net_conf->wire_protocol) {
+ dev_err(DEV, "incompatible communication protocols\n");
+ goto disconnect;
+ }
+
+ if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
+ dev_err(DEV, "incompatible after-sb-0pri settings\n");
+ goto disconnect;
+ }
+
+ if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
+ dev_err(DEV, "incompatible after-sb-1pri settings\n");
+ goto disconnect;
+ }
+
+ if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
+ dev_err(DEV, "incompatible after-sb-2pri settings\n");
+ goto disconnect;
+ }
+
+ if (p_want_lose && mdev->net_conf->want_lose) {
+ dev_err(DEV, "both sides have the 'want_lose' flag set\n");
+ goto disconnect;
+ }
+
+ if (p_two_primaries != mdev->net_conf->two_primaries) {
+ dev_err(DEV, "incompatible setting of the two-primaries options\n");
+ goto disconnect;
+ }
+
+ if (mdev->agreed_pro_version >= 87) {
+ unsigned char *my_alg = mdev->net_conf->integrity_alg;
+
+ if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
+ return FALSE;
+
+ p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
+ if (strcmp(p_integrity_alg, my_alg)) {
+ dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+ goto disconnect;
+ }
+ dev_info(DEV, "data-integrity-alg: %s\n",
+ my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
+ }
+
+ return TRUE;
+
+disconnect:
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ * ERR_PTR(error) if something goes wrong
+ * or the crypto hash ptr, if it worked out ok. */
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
+ const char *alg, const char *name)
+{
+ struct crypto_hash *tfm;
+
+ if (!alg[0])
+ return NULL;
+
+ tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+ alg, name, PTR_ERR(tfm));
+ return tfm;
+ }
+ if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
+ crypto_free_hash(tfm);
+ dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
+ return ERR_PTR(-EINVAL);
+ }
+ return tfm;
+}
+
+static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+{
+ int ok = TRUE;
+ struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+ unsigned int header_size, data_size, exp_max_sz;
+ struct crypto_hash *verify_tfm = NULL;
+ struct crypto_hash *csums_tfm = NULL;
+ const int apv = mdev->agreed_pro_version;
+
+ exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
+ : apv == 88 ? sizeof(struct p_rs_param)
+ + SHARED_SECRET_MAX
+ : /* 89 */ sizeof(struct p_rs_param_89);
+
+ if (h->length > exp_max_sz) {
+ dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+ h->length, exp_max_sz);
+ return FALSE;
+ }
+
+ if (apv <= 88) {
+ header_size = sizeof(struct p_rs_param) - sizeof(*h);
+ data_size = h->length - header_size;
+ } else /* apv >= 89 */ {
+ header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
+ data_size = h->length - header_size;
+ D_ASSERT(data_size == 0);
+ }
+
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+ if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ return FALSE;
+
+ mdev->sync_conf.rate = be32_to_cpu(p->rate);
+
+ if (apv >= 88) {
+ if (apv == 88) {
+ if (data_size > SHARED_SECRET_MAX) {
+ dev_err(DEV, "verify-alg too long, "
+ "peer wants %u, accepting only %u byte\n",
+ data_size, SHARED_SECRET_MAX);
+ return FALSE;
+ }
+
+ if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
+ return FALSE;
+
+ /* we expect NUL terminated string */
+ /* but just in case someone tries to be evil */
+ D_ASSERT(p->verify_alg[data_size-1] == 0);
+ p->verify_alg[data_size-1] = 0;
+
+ } else /* apv >= 89 */ {
+ /* we still expect NUL terminated strings */
+ /* but just in case someone tries to be evil */
+ D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+ D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+ p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+ p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+ }
+
+ if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+ if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+ dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+ mdev->sync_conf.verify_alg, p->verify_alg);
+ goto disconnect;
+ }
+ verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
+ p->verify_alg, "verify-alg");
+ if (IS_ERR(verify_tfm)) {
+ verify_tfm = NULL;
+ goto disconnect;
+ }
+ }
+
+ if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+ if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+ dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+ mdev->sync_conf.csums_alg, p->csums_alg);
+ goto disconnect;
+ }
+ csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
+ p->csums_alg, "csums-alg");
+ if (IS_ERR(csums_tfm)) {
+ csums_tfm = NULL;
+ goto disconnect;
+ }
+ }
+
+
+ spin_lock(&mdev->peer_seq_lock);
+ /* lock against drbd_nl_syncer_conf() */
+ if (verify_tfm) {
+ strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
+ mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
+ crypto_free_hash(mdev->verify_tfm);
+ mdev->verify_tfm = verify_tfm;
+ dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+ }
+ if (csums_tfm) {
+ strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
+ mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
+ crypto_free_hash(mdev->csums_tfm);
+ mdev->csums_tfm = csums_tfm;
+ dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+ }
+ spin_unlock(&mdev->peer_seq_lock);
+ }
+
+ return ok;
+disconnect:
+ /* just for completeness: actually not needed,
+ * as this is not reached if csums_tfm was ok. */
+ crypto_free_hash(csums_tfm);
+ /* but free the verify_tfm again, if csums_tfm did not work out */
+ crypto_free_hash(verify_tfm);
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+}
+
+static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
+{
+ /* sorry, we currently have no working implementation
+ * of distributed TCQ */
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_conf *mdev,
+ const char *s, sector_t a, sector_t b)
+{
+ sector_t d;
+ if (a == 0 || b == 0)
+ return;
+ d = (a > b) ? (a - b) : (b - a);
+ if (d > (a>>3) || d > (b>>3))
+ dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
+ (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_sizes *p = (struct p_sizes *)h;
+ enum determine_dev_size dd = unchanged;
+ unsigned int max_seg_s;
+ sector_t p_size, p_usize, my_usize;
+ int ldsc = 0; /* local disk size changed */
+ enum drbd_conns nconn;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ p_size = be64_to_cpu(p->d_size);
+ p_usize = be64_to_cpu(p->u_size);
+
+ if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
+ dev_err(DEV, "some backing storage is needed\n");
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+ }
+
+ /* just store the peer's disk size for now.
+ * we still need to figure out whether we accept that. */
+ mdev->p_size = p_size;
+
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+ if (get_ldev(mdev)) {
+ warn_if_differ_considerably(mdev, "lower level device sizes",
+ p_size, drbd_get_max_capacity(mdev->ldev));
+ warn_if_differ_considerably(mdev, "user requested size",
+ p_usize, mdev->ldev->dc.disk_size);
+
+ /* if this is the first connect, or an otherwise expected
+ * param exchange, choose the minimum */
+ if (mdev->state.conn == C_WF_REPORT_PARAMS)
+ p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
+ p_usize);
+
+ my_usize = mdev->ldev->dc.disk_size;
+
+ if (mdev->ldev->dc.disk_size != p_usize) {
+ mdev->ldev->dc.disk_size = p_usize;
+ dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+ (unsigned long)mdev->ldev->dc.disk_size);
+ }
+
+ /* Never shrink a device with usable data during connect.
+ But allow online shrinking if we are connected. */
+ if (drbd_new_dev_size(mdev, mdev->ldev) <
+ drbd_get_capacity(mdev->this_bdev) &&
+ mdev->state.disk >= D_OUTDATED &&
+ mdev->state.conn < C_CONNECTED) {
+ dev_err(DEV, "The peer's disk size is too small!\n");
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ mdev->ldev->dc.disk_size = my_usize;
+ put_ldev(mdev);
+ return FALSE;
+ }
+ put_ldev(mdev);
+ }
+#undef min_not_zero
+
+ if (get_ldev(mdev)) {
+ dd = drbd_determin_dev_size(mdev);
+ put_ldev(mdev);
+ if (dd == dev_size_error)
+ return FALSE;
+ drbd_md_sync(mdev);
+ } else {
+ /* I am diskless, need to accept the peer's size. */
+ drbd_set_my_capacity(mdev, p_size);
+ }
+
+ if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
+ nconn = drbd_sync_handshake(mdev,
+ mdev->state.peer, mdev->state.pdsk);
+ put_ldev(mdev);
+
+ if (nconn == C_MASK) {
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+ }
+
+ if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+ }
+ }
+
+ if (get_ldev(mdev)) {
+ if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+ mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+ ldsc = 1;
+ }
+
+ max_seg_s = be32_to_cpu(p->max_segment_size);
+ if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
+ drbd_setup_queue_param(mdev, max_seg_s);
+
+ drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
+ put_ldev(mdev);
+ }
+
+ if (mdev->state.conn > C_WF_REPORT_PARAMS) {
+ if (be64_to_cpu(p->c_size) !=
+ drbd_get_capacity(mdev->this_bdev) || ldsc) {
+ /* we have different sizes, probably peer
+ * needs to know my new size... */
+ drbd_send_sizes(mdev, 0);
+ }
+ if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
+ (dd == grew && mdev->state.conn == C_CONNECTED)) {
+ if (mdev->state.pdsk >= D_INCONSISTENT &&
+ mdev->state.disk >= D_INCONSISTENT)
+ resync_after_online_grow(mdev);
+ else
+ set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+ }
+ }
+
+ return TRUE;
+}
+
+static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_uuids *p = (struct p_uuids *)h;
+ u64 *p_uuid;
+ int i;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+
+ for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+ p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+ kfree(mdev->p_uuid);
+ mdev->p_uuid = p_uuid;
+
+ if (mdev->state.conn < C_CONNECTED &&
+ mdev->state.disk < D_INCONSISTENT &&
+ mdev->state.role == R_PRIMARY &&
+ (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+ dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
+ (unsigned long long)mdev->ed_uuid);
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+ }
+
+ if (get_ldev(mdev)) {
+ int skip_initial_sync =
+ mdev->state.conn == C_CONNECTED &&
+ mdev->agreed_pro_version >= 90 &&
+ mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+ (p_uuid[UI_FLAGS] & 8);
+ if (skip_initial_sync) {
+ dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
+ drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
+ "clear_n_write from receive_uuids");
+ _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
+ _drbd_uuid_set(mdev, UI_BITMAP, 0);
+ _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ CS_VERBOSE, NULL);
+ drbd_md_sync(mdev);
+ }
+ put_ldev(mdev);
+ }
+
+ /* Before we test for the disk state, we should wait until an eventually
+ ongoing cluster wide state change is finished. That is important if
+ we are primary and are detaching from our disk. We need to see the
+ new disk state... */
+ wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+ if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
+ drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+
+ return TRUE;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps: The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+ union drbd_state ms;
+
+ static enum drbd_conns c_tab[] = {
+ [C_CONNECTED] = C_CONNECTED,
+
+ [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+ [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+ [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+ [C_VERIFY_S] = C_VERIFY_T,
+ [C_MASK] = C_MASK,
+ };
+
+ ms.i = ps.i;
+
+ ms.conn = c_tab[ps.conn];
+ ms.peer = ps.role;
+ ms.role = ps.peer;
+ ms.pdsk = ps.disk;
+ ms.disk = ps.pdsk;
+ ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+ return ms;
+}
+
+static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_req_state *p = (struct p_req_state *)h;
+ union drbd_state mask, val;
+ int rv;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
+ test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
+ drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
+ return TRUE;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
+
+ drbd_send_sr_reply(mdev, rv);
+ drbd_md_sync(mdev);
+
+ return TRUE;
+}
+
+static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_state *p = (struct p_state *)h;
+ enum drbd_conns nconn, oconn;
+ union drbd_state ns, peer_state;
+ enum drbd_disk_state real_peer_disk;
+ int rv;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
+ return FALSE;
+
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ peer_state.i = be32_to_cpu(p->state);
+
+ real_peer_disk = peer_state.disk;
+ if (peer_state.disk == D_NEGOTIATING) {
+ real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+ dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ retry:
+ oconn = nconn = mdev->state.conn;
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (nconn == C_WF_REPORT_PARAMS)
+ nconn = C_CONNECTED;
+
+ if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+ get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ int cr; /* consider resync */
+
+ /* if we established a new connection */
+ cr = (oconn < C_CONNECTED);
+ /* if we had an established connection
+ * and one of the nodes newly attaches a disk */
+ cr |= (oconn == C_CONNECTED &&
+ (peer_state.disk == D_NEGOTIATING ||
+ mdev->state.disk == D_NEGOTIATING));
+ /* if we have both been inconsistent, and the peer has been
+ * forced to be UpToDate with --overwrite-data */
+ cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
+ /* if we had been plain connected, and the admin requested to
+ * start a sync by "invalidate" or "invalidate-remote" */
+ cr |= (oconn == C_CONNECTED &&
+ (peer_state.conn >= C_STARTING_SYNC_S &&
+ peer_state.conn <= C_WF_BITMAP_T));
+
+ if (cr)
+ nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+
+ put_ldev(mdev);
+ if (nconn == C_MASK) {
+ if (mdev->state.disk == D_NEGOTIATING) {
+ drbd_force_state(mdev, NS(disk, D_DISKLESS));
+ nconn = C_CONNECTED;
+ } else if (peer_state.disk == D_NEGOTIATING) {
+ dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
+ peer_state.disk = D_DISKLESS;
+ } else {
+ D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+ }
+ }
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.conn != oconn)
+ goto retry;
+ clear_bit(CONSIDER_RESYNC, &mdev->flags);
+ ns.i = mdev->state.i;
+ ns.conn = nconn;
+ ns.peer = peer_state.role;
+ ns.pdsk = real_peer_disk;
+ ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+ if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+ ns.disk = mdev->new_state_tmp.disk;
+
+ rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+ ns = mdev->state;
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (rv < SS_SUCCESS) {
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ return FALSE;
+ }
+
+ if (oconn > C_WF_REPORT_PARAMS) {
+ if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+ peer_state.disk != D_NEGOTIATING ) {
+ /* we want resync, peer has not yet decided to sync... */
+ /* Nowadays only used when forcing a node into primary role and
+ setting its disk to UpToDate with that */
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev);
+ }
+ }
+
+ mdev->net_conf->want_lose = 0;
+
+ drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+
+ return TRUE;
+}
+
+static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+
+ wait_event(mdev->misc_wait,
+ mdev->state.conn == C_WF_SYNC_UUID ||
+ mdev->state.conn < C_CONNECTED ||
+ mdev->state.disk < D_NEGOTIATING);
+
+ /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ /* Here the _drbd_uuid_ functions are right, current should
+ _not_ be rotated into the history */
+ if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
+ _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
+
+ drbd_start_resync(mdev, C_SYNC_TARGET);
+
+ put_ldev(mdev);
+ } else
+ dev_err(DEV, "Ignoring SyncUUID packet!\n");
+
+ return TRUE;
+}
+
+enum receive_bitmap_ret { OK, DONE, FAILED };
+
+static enum receive_bitmap_ret
+receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
+ unsigned long *buffer, struct bm_xfer_ctx *c)
+{
+ unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+ unsigned want = num_words * sizeof(long);
+
+ if (want != h->length) {
+ dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+ return FAILED;
+ }
+ if (want == 0)
+ return DONE;
+ if (drbd_recv(mdev, buffer, want) != want)
+ return FAILED;
+
+ drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+
+ c->word_offset += num_words;
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+ if (c->bit_offset > c->bm_bits)
+ c->bit_offset = c->bm_bits;
+
+ return OK;
+}
+
+static enum receive_bitmap_ret
+recv_bm_rle_bits(struct drbd_conf *mdev,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c)
+{
+ struct bitstream bs;
+ u64 look_ahead;
+ u64 rl;
+ u64 tmp;
+ unsigned long s = c->bit_offset;
+ unsigned long e;
+ int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+ int toggle = DCBP_get_start(p);
+ int have;
+ int bits;
+
+ bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+
+ bits = bitstream_get_bits(&bs, &look_ahead, 64);
+ if (bits < 0)
+ return FAILED;
+
+ for (have = bits; have > 0; s += rl, toggle = !toggle) {
+ bits = vli_decode_bits(&rl, look_ahead);
+ if (bits <= 0)
+ return FAILED;
+
+ if (toggle) {
+ e = s + rl -1;
+ if (e >= c->bm_bits) {
+ dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+ return FAILED;
+ }
+ _drbd_bm_set_bits(mdev, s, e);
+ }
+
+ if (have < bits) {
+ dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+ have, bits, look_ahead,
+ (unsigned int)(bs.cur.b - p->code),
+ (unsigned int)bs.buf_len);
+ return FAILED;
+ }
+ look_ahead >>= bits;
+ have -= bits;
+
+ bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+ if (bits < 0)
+ return FAILED;
+ look_ahead |= tmp << have;
+ have += bits;
+ }
+
+ c->bit_offset = s;
+ bm_xfer_ctx_bit_to_word_offset(c);
+
+ return (s == c->bm_bits) ? DONE : OK;
+}
+
+static enum receive_bitmap_ret
+decode_bitmap_c(struct drbd_conf *mdev,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c)
+{
+ if (DCBP_get_code(p) == RLE_VLI_Bits)
+ return recv_bm_rle_bits(mdev, p, c);
+
+ /* other variants had been implemented for evaluation,
+ * but have been dropped as this one turned out to be "best"
+ * during all our tests. */
+
+ dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ return FAILED;
+}
+
+void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+ const char *direction, struct bm_xfer_ctx *c)
+{
+ /* what would it take to transfer it "plaintext" */
+ unsigned plain = sizeof(struct p_header) *
+ ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+ + c->bm_words * sizeof(long);
+ unsigned total = c->bytes[0] + c->bytes[1];
+ unsigned r;
+
+ /* total can not be zero. but just in case: */
+ if (total == 0)
+ return;
+
+ /* don't report if not compressed */
+ if (total >= plain)
+ return;
+
+ /* total < plain. check for overflow, still */
+ r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+ : (1000 * total / plain);
+
+ if (r > 1000)
+ r = 1000;
+
+ r = 1000 - r;
+ dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+ "total %u; compression: %u.%u%%\n",
+ direction,
+ c->bytes[1], c->packets[1],
+ c->bytes[0], c->packets[0],
+ total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+ it does not matter if the process it in 32 bit chunks or 64 bit
+ chunks as long as it is little endian. (Understand it as byte stream,
+ beginning with the lowest byte...) If we would use big endian
+ we would need to process it from the highest address to the lowest,
+ in order to be agnostic to the 32 vs 64 bits issue.
+
+ returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct bm_xfer_ctx c;
+ void *buffer;
+ enum receive_bitmap_ret ret;
+ int ok = FALSE;
+
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+
+ drbd_bm_lock(mdev, "receive bitmap");
+
+ /* maybe we should use some per thread scratch page,
+ * and allocate that during initial device creation? */
+ buffer = (unsigned long *) __get_free_page(GFP_NOIO);
+ if (!buffer) {
+ dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+ goto out;
+ }
+
+ c = (struct bm_xfer_ctx) {
+ .bm_bits = drbd_bm_bits(mdev),
+ .bm_words = drbd_bm_words(mdev),
+ };
+
+ do {
+ if (h->command == P_BITMAP) {
+ ret = receive_bitmap_plain(mdev, h, buffer, &c);
+ } else if (h->command == P_COMPRESSED_BITMAP) {
+ /* MAYBE: sanity check that we speak proto >= 90,
+ * and the feature is enabled! */
+ struct p_compressed_bm *p;
+
+ if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+ dev_err(DEV, "ReportCBitmap packet too large\n");
+ goto out;
+ }
+ /* use the page buff */
+ p = buffer;
+ memcpy(p, h, sizeof(*h));
+ if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+ goto out;
+ if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
+ dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+ return FAILED;
+ }
+ ret = decode_bitmap_c(mdev, p, &c);
+ } else {
+ dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+ goto out;
+ }
+
+ c.packets[h->command == P_BITMAP]++;
+ c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+
+ if (ret != OK)
+ break;
+
+ if (!drbd_recv_header(mdev, h))
+ goto out;
+ } while (ret == OK);
+ if (ret == FAILED)
+ goto out;
+
+ INFO_bm_xfer_stats(mdev, "receive", &c);
+
+ if (mdev->state.conn == C_WF_BITMAP_T) {
+ ok = !drbd_send_bitmap(mdev);
+ if (!ok)
+ goto out;
+ /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+ ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ D_ASSERT(ok == SS_SUCCESS);
+ } else if (mdev->state.conn != C_WF_BITMAP_S) {
+ /* admin may have requested C_DISCONNECTING,
+ * other threads may have noticed network errors */
+ dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
+ drbd_conn_str(mdev->state.conn));
+ }
+
+ ok = TRUE;
+ out:
+ drbd_bm_unlock(mdev);
+ if (ok && mdev->state.conn == C_WF_BITMAP_S)
+ drbd_start_resync(mdev, C_SYNC_SOURCE);
+ free_page((unsigned long) buffer);
+ return ok;
+}
+
+static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
+{
+ /* TODO zero copy sink :) */
+ static char sink[128];
+ int size, want, r;
+
+ dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+ h->command, h->length);
+
+ size = h->length;
+ while (size > 0) {
+ want = min_t(int, size, sizeof(sink));
+ r = drbd_recv(mdev, sink, want);
+ ERR_IF(r <= 0) break;
+ size -= r;
+ }
+ return size == 0;
+}
+
+static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+{
+ if (mdev->state.disk >= D_INCONSISTENT)
+ drbd_kick_lo(mdev);
+
+ /* Make sure we've acked all the TCP data associated
+ * with the data requests being unplugged */
+ drbd_tcp_quickack(mdev->data.socket);
+
+ return TRUE;
+}
+
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
+
+static drbd_cmd_handler_f drbd_default_handler[] = {
+ [P_DATA] = receive_Data,
+ [P_DATA_REPLY] = receive_DataReply,
+ [P_RS_DATA_REPLY] = receive_RSDataReply,
+ [P_BARRIER] = receive_Barrier,
+ [P_BITMAP] = receive_bitmap,
+ [P_COMPRESSED_BITMAP] = receive_bitmap,
+ [P_UNPLUG_REMOTE] = receive_UnplugRemote,
+ [P_DATA_REQUEST] = receive_DataRequest,
+ [P_RS_DATA_REQUEST] = receive_DataRequest,
+ [P_SYNC_PARAM] = receive_SyncParam,
+ [P_SYNC_PARAM89] = receive_SyncParam,
+ [P_PROTOCOL] = receive_protocol,
+ [P_UUIDS] = receive_uuids,
+ [P_SIZES] = receive_sizes,
+ [P_STATE] = receive_state,
+ [P_STATE_CHG_REQ] = receive_req_state,
+ [P_SYNC_UUID] = receive_sync_uuid,
+ [P_OV_REQUEST] = receive_DataRequest,
+ [P_OV_REPLY] = receive_DataRequest,
+ [P_CSUM_RS_REQUEST] = receive_DataRequest,
+ /* anything missing from this table is in
+ * the asender_tbl, see get_asender_cmd */
+ [P_MAX_CMD] = NULL,
+};
+
+static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
+static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+
+static void drbdd(struct drbd_conf *mdev)
+{
+ drbd_cmd_handler_f handler;
+ struct p_header *header = &mdev->data.rbuf.header;
+
+ while (get_t_state(&mdev->receiver) == Running) {
+ drbd_thread_current_set_cpu(mdev);
+ if (!drbd_recv_header(mdev, header)) {
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ break;
+ }
+
+ if (header->command < P_MAX_CMD)
+ handler = drbd_cmd_handler[header->command];
+ else if (P_MAY_IGNORE < header->command
+ && header->command < P_MAX_OPT_CMD)
+ handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
+ else if (header->command > P_MAX_OPT_CMD)
+ handler = receive_skip;
+ else
+ handler = NULL;
+
+ if (unlikely(!handler)) {
+ dev_err(DEV, "unknown packet type %d, l: %d!\n",
+ header->command, header->length);
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ break;
+ }
+ if (unlikely(!handler(mdev, header))) {
+ dev_err(DEV, "error receiving %s, l: %d!\n",
+ cmdname(header->command), header->length);
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ break;
+ }
+ }
+}
+
+static void drbd_fail_pending_reads(struct drbd_conf *mdev)
+{
+ struct hlist_head *slot;
+ struct hlist_node *pos;
+ struct hlist_node *tmp;
+ struct drbd_request *req;
+ int i;
+
+ /*
+ * Application READ requests
+ */
+ spin_lock_irq(&mdev->req_lock);
+ for (i = 0; i < APP_R_HSIZE; i++) {
+ slot = mdev->app_reads_hash+i;
+ hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
+ /* it may (but should not any longer!)
+ * be on the work queue; if that assert triggers,
+ * we need to also grab the
+ * spin_lock_irq(&mdev->data.work.q_lock);
+ * and list_del_init here. */
+ D_ASSERT(list_empty(&req->w.list));
+ /* It would be nice to complete outside of spinlock.
+ * But this is easier for now. */
+ _req_mod(req, connection_lost_while_pending);
+ }
+ }
+ for (i = 0; i < APP_R_HSIZE; i++)
+ if (!hlist_empty(mdev->app_reads_hash+i))
+ dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
+ "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
+
+ memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+void drbd_flush_workqueue(struct drbd_conf *mdev)
+{
+ struct drbd_wq_barrier barr;
+
+ barr.w.cb = w_prev_work_done;
+ init_completion(&barr.done);
+ drbd_queue_work(&mdev->data.work, &barr.w);
+ wait_for_completion(&barr.done);
+}
+
+static void drbd_disconnect(struct drbd_conf *mdev)
+{
+ enum drbd_fencing_p fp;
+ union drbd_state os, ns;
+ int rv = SS_UNKNOWN_ERROR;
+ unsigned int i;
+
+ if (mdev->state.conn == C_STANDALONE)
+ return;
+ if (mdev->state.conn >= C_WF_CONNECTION)
+ dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
+ drbd_conn_str(mdev->state.conn));
+
+ /* asender does not clean up anything. it must not interfere, either */
+ drbd_thread_stop(&mdev->asender);
+
+ mutex_lock(&mdev->data.mutex);
+ drbd_free_sock(mdev);
+ mutex_unlock(&mdev->data.mutex);
+
+ spin_lock_irq(&mdev->req_lock);
+ _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+ _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
+ _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
+ spin_unlock_irq(&mdev->req_lock);
+
+ /* We do not have data structures that would allow us to
+ * get the rs_pending_cnt down to 0 again.
+ * * On C_SYNC_TARGET we do not have any data structures describing
+ * the pending RSDataRequest's we have sent.
+ * * On C_SYNC_SOURCE there is no data structure that tracks
+ * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+ * And no, it is not the sum of the reference counts in the
+ * resync_LRU. The resync_LRU tracks the whole operation including
+ * the disk-IO, while the rs_pending_cnt only tracks the blocks
+ * on the fly. */
+ drbd_rs_cancel_all(mdev);
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
+ wake_up(&mdev->misc_wait);
+
+ /* make sure syncer is stopped and w_resume_next_sg queued */
+ del_timer_sync(&mdev->resync_timer);
+ set_bit(STOP_SYNC_TIMER, &mdev->flags);
+ resync_timer_fn((unsigned long)mdev);
+
+ /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+ * w_make_resync_request etc. which may still be on the worker queue
+ * to be "canceled" */
+ drbd_flush_workqueue(mdev);
+
+ /* This also does reclaim_net_ee(). If we do this too early, we might
+ * miss some resync ee and pages.*/
+ drbd_process_done_ee(mdev);
+
+ kfree(mdev->p_uuid);
+ mdev->p_uuid = NULL;
+
+ if (!mdev->state.susp)
+ tl_clear(mdev);
+
+ drbd_fail_pending_reads(mdev);
+
+ dev_info(DEV, "Connection closed\n");
+
+ drbd_md_sync(mdev);
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(mdev)) {
+ fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ }
+
+ if (mdev->state.role == R_PRIMARY) {
+ if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
+ enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
+ drbd_request_state(mdev, NS(pdsk, nps));
+ }
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ os = mdev->state;
+ if (os.conn >= C_UNCONNECTED) {
+ /* Do not restart in case we are C_DISCONNECTING */
+ ns = os;
+ ns.conn = C_UNCONNECTED;
+ rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+ }
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (os.conn == C_DISCONNECTING) {
+ struct hlist_head *h;
+ wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+
+ /* we must not free the tl_hash
+ * while application io is still on the fly */
+ wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
+
+ spin_lock_irq(&mdev->req_lock);
+ /* paranoia code */
+ for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+ (int)(h - mdev->ee_hash), h->first);
+ kfree(mdev->ee_hash);
+ mdev->ee_hash = NULL;
+ mdev->ee_hash_s = 0;
+
+ /* paranoia code */
+ for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+ (int)(h - mdev->tl_hash), h->first);
+ kfree(mdev->tl_hash);
+ mdev->tl_hash = NULL;
+ mdev->tl_hash_s = 0;
+ spin_unlock_irq(&mdev->req_lock);
+
+ crypto_free_hash(mdev->cram_hmac_tfm);
+ mdev->cram_hmac_tfm = NULL;
+
+ kfree(mdev->net_conf);
+ mdev->net_conf = NULL;
+ drbd_request_state(mdev, NS(conn, C_STANDALONE));
+ }
+
+ /* tcp_close and release of sendpage pages can be deferred. I don't
+ * want to use SO_LINGER, because apparently it can be deferred for
+ * more than 20 seconds (longest time I checked).
+ *
+ * Actually we don't care for exactly when the network stack does its
+ * put_page(), but release our reference on these pages right here.
+ */
+ i = drbd_release_ee(mdev, &mdev->net_ee);
+ if (i)
+ dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&mdev->pp_in_use);
+ if (i)
+ dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
+
+ D_ASSERT(list_empty(&mdev->read_ee));
+ D_ASSERT(list_empty(&mdev->active_ee));
+ D_ASSERT(list_empty(&mdev->sync_ee));
+ D_ASSERT(list_empty(&mdev->done_ee));
+
+ /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+ atomic_set(&mdev->current_epoch->epoch_size, 0);
+ D_ASSERT(list_empty(&mdev->current_epoch->list));
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_handshake(struct drbd_conf *mdev)
+{
+ /* ASSERT current == mdev->receiver ... */
+ struct p_handshake *p = &mdev->data.sbuf.handshake;
+ int ok;
+
+ if (mutex_lock_interruptible(&mdev->data.mutex)) {
+ dev_err(DEV, "interrupted during initial handshake\n");
+ return 0; /* interrupted. not ok. */
+ }
+
+ if (mdev->data.socket == NULL) {
+ mutex_unlock(&mdev->data.mutex);
+ return 0;
+ }
+
+ memset(p, 0, sizeof(*p));
+ p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+ p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+ ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
+ (struct p_header *)p, sizeof(*p), 0 );
+ mutex_unlock(&mdev->data.mutex);
+ return ok;
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks different language,
+ * no point in trying again, please go standalone.
+ */
+static int drbd_do_handshake(struct drbd_conf *mdev)
+{
+ /* ASSERT current == mdev->receiver ... */
+ struct p_handshake *p = &mdev->data.rbuf.handshake;
+ const int expect = sizeof(struct p_handshake)
+ -sizeof(struct p_header);
+ int rv;
+
+ rv = drbd_send_handshake(mdev);
+ if (!rv)
+ return 0;
+
+ rv = drbd_recv_header(mdev, &p->head);
+ if (!rv)
+ return 0;
+
+ if (p->head.command != P_HAND_SHAKE) {
+ dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
+ cmdname(p->head.command), p->head.command);
+ return -1;
+ }
+
+ if (p->head.length != expect) {
+ dev_err(DEV, "expected HandShake length: %u, received: %u\n",
+ expect, p->head.length);
+ return -1;
+ }
+
+ rv = drbd_recv(mdev, &p->head.payload, expect);
+
+ if (rv != expect) {
+ dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
+ return 0;
+ }
+
+ p->protocol_min = be32_to_cpu(p->protocol_min);
+ p->protocol_max = be32_to_cpu(p->protocol_max);
+ if (p->protocol_max == 0)
+ p->protocol_max = p->protocol_min;
+
+ if (PRO_VERSION_MAX < p->protocol_min ||
+ PRO_VERSION_MIN > p->protocol_max)
+ goto incompat;
+
+ mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+
+ dev_info(DEV, "Handshake successful: "
+ "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+
+ return 1;
+
+ incompat:
+ dev_err(DEV, "incompatible DRBD dialects: "
+ "I support %d-%d, peer supports %d-%d\n",
+ PRO_VERSION_MIN, PRO_VERSION_MAX,
+ p->protocol_min, p->protocol_max);
+ return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_conf *mdev)
+{
+ dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+ dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+ return 0;
+}
+#else
+#define CHALLENGE_LEN 64
+static int drbd_do_auth(struct drbd_conf *mdev)
+{
+ char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
+ struct scatterlist sg;
+ char *response = NULL;
+ char *right_response = NULL;
+ char *peers_ch = NULL;
+ struct p_header p;
+ unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+ unsigned int resp_size;
+ struct hash_desc desc;
+ int rv;
+
+ desc.tfm = mdev->cram_hmac_tfm;
+ desc.flags = 0;
+
+ rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
+ (u8 *)mdev->net_conf->shared_secret, key_len);
+ if (rv) {
+ dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+ rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+ if (!rv)
+ goto fail;
+
+ rv = drbd_recv_header(mdev, &p);
+ if (!rv)
+ goto fail;
+
+ if (p.command != P_AUTH_CHALLENGE) {
+ dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+ cmdname(p.command), p.command);
+ rv = 0;
+ goto fail;
+ }
+
+ if (p.length > CHALLENGE_LEN*2) {
+ dev_err(DEV, "expected AuthChallenge payload too big.\n");
+ rv = 0;
+ goto fail;
+ }
+
+ peers_ch = kmalloc(p.length, GFP_NOIO);
+ if (peers_ch == NULL) {
+ dev_err(DEV, "kmalloc of peers_ch failed\n");
+ rv = 0;
+ goto fail;
+ }
+
+ rv = drbd_recv(mdev, peers_ch, p.length);
+
+ if (rv != p.length) {
+ dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+ response = kmalloc(resp_size, GFP_NOIO);
+ if (response == NULL) {
+ dev_err(DEV, "kmalloc of response failed\n");
+ rv = 0;
+ goto fail;
+ }
+
+ sg_init_table(&sg, 1);
+ sg_set_buf(&sg, peers_ch, p.length);
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+ if (rv) {
+ dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
+ if (!rv)
+ goto fail;
+
+ rv = drbd_recv_header(mdev, &p);
+ if (!rv)
+ goto fail;
+
+ if (p.command != P_AUTH_RESPONSE) {
+ dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
+ cmdname(p.command), p.command);
+ rv = 0;
+ goto fail;
+ }
+
+ if (p.length != resp_size) {
+ dev_err(DEV, "expected AuthResponse payload of wrong size\n");
+ rv = 0;
+ goto fail;
+ }
+
+ rv = drbd_recv(mdev, response , resp_size);
+
+ if (rv != resp_size) {
+ dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ right_response = kmalloc(resp_size, GFP_NOIO);
+ if (response == NULL) {
+ dev_err(DEV, "kmalloc of right_response failed\n");
+ rv = 0;
+ goto fail;
+ }
+
+ sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+ if (rv) {
+ dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ rv = !memcmp(response, right_response, resp_size);
+
+ if (rv)
+ dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
+ resp_size, mdev->net_conf->cram_hmac_alg);
+
+ fail:
+ kfree(peers_ch);
+ kfree(response);
+ kfree(right_response);
+
+ return rv;
+}
+#endif
+
+int drbdd_init(struct drbd_thread *thi)
+{
+ struct drbd_conf *mdev = thi->mdev;
+ unsigned int minor = mdev_to_minor(mdev);
+ int h;
+
+ sprintf(current->comm, "drbd%d_receiver", minor);
+
+ dev_info(DEV, "receiver (re)started\n");
+
+ do {
+ h = drbd_connect(mdev);
+ if (h == 0) {
+ drbd_disconnect(mdev);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ }
+ if (h == -1) {
+ dev_warn(DEV, "Discarding network configuration.\n");
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ }
+ } while (h == 0);
+
+ if (h > 0) {
+ if (get_net_conf(mdev)) {
+ drbdd(mdev);
+ put_net_conf(mdev);
+ }
+ }
+
+ drbd_disconnect(mdev);
+
+ dev_info(DEV, "receiver terminated\n");
+ return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_req_state_reply *p = (struct p_req_state_reply *)h;
+
+ int retcode = be32_to_cpu(p->retcode);
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
+ } else {
+ set_bit(CL_ST_CHG_FAIL, &mdev->flags);
+ dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&mdev->state_wait);
+
+ return TRUE;
+}
+
+static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+{
+ return drbd_send_ping_ack(mdev);
+
+}
+
+static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+{
+ /* restore idle timeout */
+ mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+ return TRUE;
+}
+
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_block_ack *p = (struct p_block_ack *)h;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+
+ D_ASSERT(mdev->agreed_pro_version >= 89);
+
+ update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+ drbd_rs_complete_io(mdev, sector);
+ drbd_set_in_sync(mdev, sector, blksize);
+ /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+ mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ dec_rs_pending(mdev);
+
+ return TRUE;
+}
+
+/* when we receive the ACK for a write request,
+ * verify that we actually know about it */
+static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
+ u64 id, sector_t sector)
+{
+ struct hlist_head *slot = tl_hash_slot(mdev, sector);
+ struct hlist_node *n;
+ struct drbd_request *req;
+
+ hlist_for_each_entry(req, n, slot, colision) {
+ if ((unsigned long)req == (unsigned long)id) {
+ if (req->sector != sector) {
+ dev_err(DEV, "_ack_id_to_req: found req %p but it has "
+ "wrong sector (%llus versus %llus)\n", req,
+ (unsigned long long)req->sector,
+ (unsigned long long)sector);
+ break;
+ }
+ return req;
+ }
+ }
+ dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
+ (void *)(unsigned long)id, (unsigned long long)sector);
+ return NULL;
+}
+
+typedef struct drbd_request *(req_validator_fn)
+ (struct drbd_conf *mdev, u64 id, sector_t sector);
+
+static int validate_req_change_req_state(struct drbd_conf *mdev,
+ u64 id, sector_t sector, req_validator_fn validator,
+ const char *func, enum drbd_req_event what)
+{
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ spin_lock_irq(&mdev->req_lock);
+ req = validator(mdev, id, sector);
+ if (unlikely(!req)) {
+ spin_unlock_irq(&mdev->req_lock);
+ dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
+ return FALSE;
+ }
+ __req_mod(req, what, &m);
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+ return TRUE;
+}
+
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_block_ack *p = (struct p_block_ack *)h;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+ enum drbd_req_event what;
+
+ update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+ if (is_syncer_block_id(p->block_id)) {
+ drbd_set_in_sync(mdev, sector, blksize);
+ dec_rs_pending(mdev);
+ return TRUE;
+ }
+ switch (be16_to_cpu(h->command)) {
+ case P_RS_WRITE_ACK:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ what = write_acked_by_peer_and_sis;
+ break;
+ case P_WRITE_ACK:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ what = write_acked_by_peer;
+ break;
+ case P_RECV_ACK:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
+ what = recv_acked_by_peer;
+ break;
+ case P_DISCARD_ACK:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ what = conflict_discarded_by_peer;
+ break;
+ default:
+ D_ASSERT(0);
+ return FALSE;
+ }
+
+ return validate_req_change_req_state(mdev, p->block_id, sector,
+ _ack_id_to_req, __func__ , what);
+}
+
+static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_block_ack *p = (struct p_block_ack *)h;
+ sector_t sector = be64_to_cpu(p->sector);
+
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
+
+ update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+ if (is_syncer_block_id(p->block_id)) {
+ int size = be32_to_cpu(p->blksize);
+ dec_rs_pending(mdev);
+ drbd_rs_failed_io(mdev, sector, size);
+ return TRUE;
+ }
+ return validate_req_change_req_state(mdev, p->block_id, sector,
+ _ack_id_to_req, __func__ , neg_acked);
+}
+
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_block_ack *p = (struct p_block_ack *)h;
+ sector_t sector = be64_to_cpu(p->sector);
+
+ update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+ dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+ (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+ return validate_req_change_req_state(mdev, p->block_id, sector,
+ _ar_id_to_req, __func__ , neg_acked);
+}
+
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+{
+ sector_t sector;
+ int size;
+ struct p_block_ack *p = (struct p_block_ack *)h;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+ D_ASSERT(p->block_id == ID_SYNCER);
+
+ update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+ dec_rs_pending(mdev);
+
+ if (get_ldev_if_state(mdev, D_FAILED)) {
+ drbd_rs_complete_io(mdev, sector);
+ drbd_rs_failed_io(mdev, sector, size);
+ put_ldev(mdev);
+ }
+
+ return TRUE;
+}
+
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_barrier_ack *p = (struct p_barrier_ack *)h;
+
+ tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+
+ return TRUE;
+}
+
+static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_work *w;
+ sector_t sector;
+ int size;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+ if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+ drbd_ov_oos_found(mdev, sector, size);
+ else
+ ov_oos_print(mdev);
+
+ drbd_rs_complete_io(mdev, sector);
+ dec_rs_pending(mdev);
+
+ if (--mdev->ov_left == 0) {
+ w = kmalloc(sizeof(*w), GFP_NOIO);
+ if (w) {
+ w->cb = w_ov_finished;
+ drbd_queue_work_front(&mdev->data.work, w);
+ } else {
+ dev_err(DEV, "kmalloc(w) failed.");
+ ov_oos_print(mdev);
+ drbd_resync_finished(mdev);
+ }
+ }
+ return TRUE;
+}
+
+struct asender_cmd {
+ size_t pkt_size;
+ int (*process)(struct drbd_conf *mdev, struct p_header *h);
+};
+
+static struct asender_cmd *get_asender_cmd(int cmd)
+{
+ static struct asender_cmd asender_tbl[] = {
+ /* anything missing from this table is in
+ * the drbd_cmd_handler (drbd_default_handler) table,
+ * see the beginning of drbdd() */
+ [P_PING] = { sizeof(struct p_header), got_Ping },
+ [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
+ [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
+ [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
+ [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
+ [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
+ [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
+ [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+ [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
+ [P_MAX_CMD] = { 0, NULL },
+ };
+ if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
+ return NULL;
+ return &asender_tbl[cmd];
+}
+
+int drbd_asender(struct drbd_thread *thi)
+{
+ struct drbd_conf *mdev = thi->mdev;
+ struct p_header *h = &mdev->meta.rbuf.header;
+ struct asender_cmd *cmd = NULL;
+
+ int rv, len;
+ void *buf = h;
+ int received = 0;
+ int expect = sizeof(struct p_header);
+ int empty;
+
+ sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
+
+ current->policy = SCHED_RR; /* Make this a realtime task! */
+ current->rt_priority = 2; /* more important than all other tasks */
+
+ while (get_t_state(thi) == Running) {
+ drbd_thread_current_set_cpu(mdev);
+ if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
+ ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
+ mdev->meta.socket->sk->sk_rcvtimeo =
+ mdev->net_conf->ping_timeo*HZ/10;
+ }
+
+ /* conditionally cork;
+ * it may hurt latency if we cork without much to send */
+ if (!mdev->net_conf->no_cork &&
+ 3 < atomic_read(&mdev->unacked_cnt))
+ drbd_tcp_cork(mdev->meta.socket);
+ while (1) {
+ clear_bit(SIGNAL_ASENDER, &mdev->flags);
+ flush_signals(current);
+ if (!drbd_process_done_ee(mdev)) {
+ dev_err(DEV, "process_done_ee() = NOT_OK\n");
+ goto reconnect;
+ }
+ /* to avoid race with newly queued ACKs */
+ set_bit(SIGNAL_ASENDER, &mdev->flags);
+ spin_lock_irq(&mdev->req_lock);
+ empty = list_empty(&mdev->done_ee);
+ spin_unlock_irq(&mdev->req_lock);
+ /* new ack may have been queued right here,
+ * but then there is also a signal pending,
+ * and we start over... */
+ if (empty)
+ break;
+ }
+ /* but unconditionally uncork unless disabled */
+ if (!mdev->net_conf->no_cork)
+ drbd_tcp_uncork(mdev->meta.socket);
+
+ /* short circuit, recv_msg would return EINTR anyways. */
+ if (signal_pending(current))
+ continue;
+
+ rv = drbd_recv_short(mdev, mdev->meta.socket,
+ buf, expect-received, 0);
+ clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+ flush_signals(current);
+
+ /* Note:
+ * -EINTR (on meta) we got a signal
+ * -EAGAIN (on meta) rcvtimeo expired
+ * -ECONNRESET other side closed the connection
+ * -ERESTARTSYS (on data) we got a signal
+ * rv < 0 other than above: unexpected error!
+ * rv == expected: full header or command
+ * rv < expected: "woken" by signal during receive
+ * rv == 0 : "connection shut down by peer"
+ */
+ if (likely(rv > 0)) {
+ received += rv;
+ buf += rv;
+ } else if (rv == 0) {
+ dev_err(DEV, "meta connection shut down by peer.\n");
+ goto reconnect;
+ } else if (rv == -EAGAIN) {
+ if (mdev->meta.socket->sk->sk_rcvtimeo ==
+ mdev->net_conf->ping_timeo*HZ/10) {
+ dev_err(DEV, "PingAck did not arrive in time.\n");
+ goto reconnect;
+ }
+ set_bit(SEND_PING, &mdev->flags);
+ continue;
+ } else if (rv == -EINTR) {
+ continue;
+ } else {
+ dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+ goto reconnect;
+ }
+
+ if (received == expect && cmd == NULL) {
+ if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+ dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
+ (long)be32_to_cpu(h->magic),
+ h->command, h->length);
+ goto reconnect;
+ }
+ cmd = get_asender_cmd(be16_to_cpu(h->command));
+ len = be16_to_cpu(h->length);
+ if (unlikely(cmd == NULL)) {
+ dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
+ (long)be32_to_cpu(h->magic),
+ h->command, h->length);
+ goto disconnect;
+ }
+ expect = cmd->pkt_size;
+ ERR_IF(len != expect-sizeof(struct p_header))
+ goto reconnect;
+ }
+ if (received == expect) {
+ D_ASSERT(cmd != NULL);
+ if (!cmd->process(mdev, h))
+ goto reconnect;
+
+ buf = h;
+ received = 0;
+ expect = sizeof(struct p_header);
+ cmd = NULL;
+ }
+ }
+
+ if (0) {
+reconnect:
+ drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+ }
+ if (0) {
+disconnect:
+ drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ }
+ clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+ D_ASSERT(mdev->state.conn < C_CONNECTED);
+ dev_info(DEV, "asender terminated\n");
+
+ return 0;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000000..de81ab7b4627
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1125 @@
+/*
+ drbd_req.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+
+/* Update disk stats at start of I/O request */
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+{
+ const int rw = bio_data_dir(bio);
+ int cpu;
+ cpu = part_stat_lock();
+ part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
+ part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+ part_inc_in_flight(&mdev->vdisk->part0, rw);
+ part_stat_unlock();
+}
+
+/* Update disk stats when completing request upwards */
+static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
+{
+ int rw = bio_data_dir(req->master_bio);
+ unsigned long duration = jiffies - req->start_time;
+ int cpu;
+ cpu = part_stat_lock();
+ part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
+ part_round_stats(cpu, &mdev->vdisk->part0);
+ part_dec_in_flight(&mdev->vdisk->part0, rw);
+ part_stat_unlock();
+}
+
+static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+{
+ const unsigned long s = req->rq_state;
+ /* if it was a write, we may have to set the corresponding
+ * bit(s) out-of-sync first. If it had a local part, we need to
+ * release the reference to the activity log. */
+ if (rw == WRITE) {
+ /* remove it from the transfer log.
+ * well, only if it had been there in the first
+ * place... if it had not (local only or conflicting
+ * and never sent), it should still be "empty" as
+ * initialized in drbd_req_new(), so we can list_del() it
+ * here unconditionally */
+ list_del(&req->tl_requests);
+ /* Set out-of-sync unless both OK flags are set
+ * (local only or remote failed).
+ * Other places where we set out-of-sync:
+ * READ with local io-error */
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+ drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+ if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+ drbd_set_in_sync(mdev, req->sector, req->size);
+
+ /* one might be tempted to move the drbd_al_complete_io
+ * to the local io completion callback drbd_endio_pri.
+ * but, if this was a mirror write, we may only
+ * drbd_al_complete_io after this is RQ_NET_DONE,
+ * otherwise the extent could be dropped from the al
+ * before it has actually been written on the peer.
+ * if we crash before our peer knows about the request,
+ * but after the extent has been dropped from the al,
+ * we would forget to resync the corresponding extent.
+ */
+ if (s & RQ_LOCAL_MASK) {
+ if (get_ldev_if_state(mdev, D_FAILED)) {
+ drbd_al_complete_io(mdev, req->sector);
+ put_ldev(mdev);
+ } else if (__ratelimit(&drbd_ratelimit_state)) {
+ dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
+ "but my Disk seems to have failed :(\n",
+ (unsigned long long) req->sector);
+ }
+ }
+ }
+
+ /* if it was a local io error, we want to notify our
+ * peer about that, and see if we need to
+ * detach the disk and stuff.
+ * to avoid allocating some special work
+ * struct, reuse the request. */
+
+ /* THINK
+ * why do we do this not when we detect the error,
+ * but delay it until it is "done", i.e. possibly
+ * until the next barrier ack? */
+
+ if (rw == WRITE &&
+ ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
+ if (!(req->w.list.next == LIST_POISON1 ||
+ list_empty(&req->w.list))) {
+ /* DEBUG ASSERT only; if this triggers, we
+ * probably corrupt the worker list here */
+ dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
+ dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
+ }
+ req->w.cb = w_io_error;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ /* drbd_req_free() is done in w_io_error */
+ } else {
+ drbd_req_free(req);
+ }
+}
+
+static void queue_barrier(struct drbd_conf *mdev)
+{
+ struct drbd_tl_epoch *b;
+
+ /* We are within the req_lock. Once we queued the barrier for sending,
+ * we set the CREATE_BARRIER bit. It is cleared as soon as a new
+ * barrier/epoch object is added. This is the only place this bit is
+ * set. It indicates that the barrier for this epoch is already queued,
+ * and no new epoch has been created yet. */
+ if (test_bit(CREATE_BARRIER, &mdev->flags))
+ return;
+
+ b = mdev->newest_tle;
+ b->w.cb = w_send_barrier;
+ /* inc_ap_pending done here, so we won't
+ * get imbalanced on connection loss.
+ * dec_ap_pending will be done in got_BarrierAck
+ * or (on connection loss) in tl_clear. */
+ inc_ap_pending(mdev);
+ drbd_queue_work(&mdev->data.work, &b->w);
+ set_bit(CREATE_BARRIER, &mdev->flags);
+}
+
+static void _about_to_complete_local_write(struct drbd_conf *mdev,
+ struct drbd_request *req)
+{
+ const unsigned long s = req->rq_state;
+ struct drbd_request *i;
+ struct drbd_epoch_entry *e;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+
+ /* before we can signal completion to the upper layers,
+ * we may need to close the current epoch */
+ if (mdev->state.conn >= C_CONNECTED &&
+ req->epoch == mdev->newest_tle->br_number)
+ queue_barrier(mdev);
+
+ /* we need to do the conflict detection stuff,
+ * if we have the ee_hash (two_primaries) and
+ * this has been on the network */
+ if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
+ const sector_t sector = req->sector;
+ const int size = req->size;
+
+ /* ASSERT:
+ * there must be no conflicting requests, since
+ * they must have been failed on the spot */
+#define OVERLAPS overlaps(sector, size, i->sector, i->size)
+ slot = tl_hash_slot(mdev, sector);
+ hlist_for_each_entry(i, n, slot, colision) {
+ if (OVERLAPS) {
+ dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
+ "other: %p %llus +%u\n",
+ req, (unsigned long long)sector, size,
+ i, (unsigned long long)i->sector, i->size);
+ }
+ }
+
+ /* maybe "wake" those conflicting epoch entries
+ * that wait for this request to finish.
+ *
+ * currently, there can be only _one_ such ee
+ * (well, or some more, which would be pending
+ * P_DISCARD_ACK not yet sent by the asender...),
+ * since we block the receiver thread upon the
+ * first conflict detection, which will wait on
+ * misc_wait. maybe we want to assert that?
+ *
+ * anyways, if we found one,
+ * we just have to do a wake_up. */
+#undef OVERLAPS
+#define OVERLAPS overlaps(sector, size, e->sector, e->size)
+ slot = ee_hash_slot(mdev, req->sector);
+ hlist_for_each_entry(e, n, slot, colision) {
+ if (OVERLAPS) {
+ wake_up(&mdev->misc_wait);
+ break;
+ }
+ }
+ }
+#undef OVERLAPS
+}
+
+void complete_master_bio(struct drbd_conf *mdev,
+ struct bio_and_error *m)
+{
+ bio_endio(m->bio, m->error);
+ dec_ap_bio(mdev);
+}
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+{
+ const unsigned long s = req->rq_state;
+ struct drbd_conf *mdev = req->mdev;
+ /* only WRITES may end up here without a master bio (on barrier ack) */
+ int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+
+ /* we must not complete the master bio, while it is
+ * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+ * not yet acknowledged by the peer
+ * not yet completed by the local io subsystem
+ * these flags may get cleared in any order by
+ * the worker,
+ * the receiver,
+ * the bio_endio completion callbacks.
+ */
+ if (s & RQ_NET_QUEUED)
+ return;
+ if (s & RQ_NET_PENDING)
+ return;
+ if (s & RQ_LOCAL_PENDING)
+ return;
+
+ if (req->master_bio) {
+ /* this is data_received (remote read)
+ * or protocol C P_WRITE_ACK
+ * or protocol B P_RECV_ACK
+ * or protocol A "handed_over_to_network" (SendAck)
+ * or canceled or failed,
+ * or killed from the transfer log due to connection loss.
+ */
+
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the operations succeeded.
+ * or, to put the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ *
+ * local completion error, if any, has been stored as ERR_PTR
+ * in private_bio within drbd_endio_pri.
+ */
+ int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ int error = PTR_ERR(req->private_bio);
+
+ /* remove the request from the conflict detection
+ * respective block_id verification hash */
+ if (!hlist_unhashed(&req->colision))
+ hlist_del(&req->colision);
+ else
+ D_ASSERT((s & RQ_NET_MASK) == 0);
+
+ /* for writes we need to do some extra housekeeping */
+ if (rw == WRITE)
+ _about_to_complete_local_write(mdev, req);
+
+ /* Update disk stats */
+ _drbd_end_io_acct(mdev, req);
+
+ m->error = ok ? 0 : (error ?: -EIO);
+ m->bio = req->master_bio;
+ req->master_bio = NULL;
+ }
+
+ if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
+ /* this is disconnected (local only) operation,
+ * or protocol C P_WRITE_ACK,
+ * or protocol A or B P_BARRIER_ACK,
+ * or killed from the transfer log due to connection loss. */
+ _req_is_done(mdev, req, rw);
+ }
+ /* else: network part and not DONE yet. that is
+ * protocol A or B, barrier ack still pending... */
+}
+
+/*
+ * checks whether there was an overlapping request
+ * or ee already registered.
+ *
+ * if so, return 1, in which case this request is completed on the spot,
+ * without ever being submitted or send.
+ *
+ * return 0 if it is ok to submit this request.
+ *
+ * NOTE:
+ * paranoia: assume something above us is broken, and issues different write
+ * requests for the same block simultaneously...
+ *
+ * To ensure these won't be reordered differently on both nodes, resulting in
+ * diverging data sets, we discard the later one(s). Not that this is supposed
+ * to happen, but this is the rationale why we also have to check for
+ * conflicting requests with local origin, and why we have to do so regardless
+ * of whether we allowed multiple primaries.
+ *
+ * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
+ * second hlist_for_each_entry becomes a noop. This is even simpler than to
+ * grab a reference on the net_conf, and check for the two_primaries flag...
+ */
+static int _req_conflicts(struct drbd_request *req)
+{
+ struct drbd_conf *mdev = req->mdev;
+ const sector_t sector = req->sector;
+ const int size = req->size;
+ struct drbd_request *i;
+ struct drbd_epoch_entry *e;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+
+ D_ASSERT(hlist_unhashed(&req->colision));
+
+ if (!get_net_conf(mdev))
+ return 0;
+
+ /* BUG_ON */
+ ERR_IF (mdev->tl_hash_s == 0)
+ goto out_no_conflict;
+ BUG_ON(mdev->tl_hash == NULL);
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+ slot = tl_hash_slot(mdev, sector);
+ hlist_for_each_entry(i, n, slot, colision) {
+ if (OVERLAPS) {
+ dev_alert(DEV, "%s[%u] Concurrent local write detected! "
+ "[DISCARD L] new: %llus +%u; "
+ "pending: %llus +%u\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ (unsigned long long)i->sector, i->size);
+ goto out_conflict;
+ }
+ }
+
+ if (mdev->ee_hash_s) {
+ /* now, check for overlapping requests with remote origin */
+ BUG_ON(mdev->ee_hash == NULL);
+#undef OVERLAPS
+#define OVERLAPS overlaps(e->sector, e->size, sector, size)
+ slot = ee_hash_slot(mdev, sector);
+ hlist_for_each_entry(e, n, slot, colision) {
+ if (OVERLAPS) {
+ dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
+ " [DISCARD L] new: %llus +%u; "
+ "pending: %llus +%u\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ (unsigned long long)e->sector, e->size);
+ goto out_conflict;
+ }
+ }
+ }
+#undef OVERLAPS
+
+out_no_conflict:
+ /* this is like it should be, and what we expected.
+ * our users do behave after all... */
+ put_net_conf(mdev);
+ return 0;
+
+out_conflict:
+ put_net_conf(mdev);
+ return 1;
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ * enforces that it is all in this one place, where it is easier to audit,
+ * it makes it obvious that whatever "event" "happens" to a request should
+ * happen "atomically" within the req_lock,
+ * and it enforces that we have to think in a very structured manner
+ * about the "events" that may happen to a request during its life time ...
+ */
+void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct bio_and_error *m)
+{
+ struct drbd_conf *mdev = req->mdev;
+ m->bio = NULL;
+
+ switch (what) {
+ default:
+ dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+ break;
+
+ /* does not happen...
+ * initialization done in drbd_req_new
+ case created:
+ break;
+ */
+
+ case to_be_send: /* via network */
+ /* reached via drbd_make_request_common
+ * and from w_read_retry_remote */
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ req->rq_state |= RQ_NET_PENDING;
+ inc_ap_pending(mdev);
+ break;
+
+ case to_be_submitted: /* locally */
+ /* reached via drbd_make_request_common */
+ D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
+ req->rq_state |= RQ_LOCAL_PENDING;
+ break;
+
+ case completed_ok:
+ if (bio_data_dir(req->master_bio) == WRITE)
+ mdev->writ_cnt += req->size>>9;
+ else
+ mdev->read_cnt += req->size>>9;
+
+ req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ _req_may_be_done(req, m);
+ put_ldev(mdev);
+ break;
+
+ case write_completed_with_error:
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
+ (unsigned long long)req->sector, req->size);
+ /* and now: check how to handle local io error. */
+ __drbd_chk_io_error(mdev, FALSE);
+ _req_may_be_done(req, m);
+ put_ldev(mdev);
+ break;
+
+ case read_ahead_completed_with_error:
+ /* it is legal to fail READA */
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+ _req_may_be_done(req, m);
+ put_ldev(mdev);
+ break;
+
+ case read_completed_with_error:
+ drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
+ (unsigned long long)req->sector, req->size);
+ /* _req_mod(req,to_be_send); oops, recursion... */
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ req->rq_state |= RQ_NET_PENDING;
+ inc_ap_pending(mdev);
+
+ __drbd_chk_io_error(mdev, FALSE);
+ put_ldev(mdev);
+ /* NOTE: if we have no connection,
+ * or know the peer has no good data either,
+ * then we don't actually need to "queue_for_net_read",
+ * but we do so anyways, since the drbd_io_error()
+ * and the potential state change to "Diskless"
+ * needs to be done from process context */
+
+ /* fall through: _req_mod(req,queue_for_net_read); */
+
+ case queue_for_net_read:
+ /* READ or READA, and
+ * no local disk,
+ * or target area marked as invalid,
+ * or just got an io-error. */
+ /* from drbd_make_request_common
+ * or from bio_endio during read io-error recovery */
+
+ /* so we can verify the handle in the answer packet
+ * corresponding hlist_del is in _req_may_be_done() */
+ hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
+
+ set_bit(UNPLUG_REMOTE, &mdev->flags);
+
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_QUEUED;
+ req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
+ ? w_read_retry_remote
+ : w_send_read_req;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ case queue_for_net_write:
+ /* assert something? */
+ /* from drbd_make_request_common only */
+
+ hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
+ /* corresponding hlist_del is in _req_may_be_done() */
+
+ /* NOTE
+ * In case the req ended up on the transfer log before being
+ * queued on the worker, it could lead to this request being
+ * missed during cleanup after connection loss.
+ * So we have to do both operations here,
+ * within the same lock that protects the transfer log.
+ *
+ * _req_add_to_epoch(req); this has to be after the
+ * _maybe_start_new_epoch(req); which happened in
+ * drbd_make_request_common, because we now may set the bit
+ * again ourselves to close the current epoch.
+ *
+ * Add req to the (now) current epoch (barrier). */
+
+ /* otherwise we may lose an unplug, which may cause some remote
+ * io-scheduler timeout to expire, increasing maximum latency,
+ * hurting performance. */
+ set_bit(UNPLUG_REMOTE, &mdev->flags);
+
+ /* see drbd_make_request_common,
+ * just after it grabs the req_lock */
+ D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
+
+ req->epoch = mdev->newest_tle->br_number;
+ list_add_tail(&req->tl_requests,
+ &mdev->newest_tle->requests);
+
+ /* increment size of current epoch */
+ mdev->newest_tle->n_req++;
+
+ /* queue work item to send data */
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_QUEUED;
+ req->w.cb = w_send_dblock;
+ drbd_queue_work(&mdev->data.work, &req->w);
+
+ /* close the epoch, in case it outgrew the limit */
+ if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
+ queue_barrier(mdev);
+
+ break;
+
+ case send_canceled:
+ /* treat it the same */
+ case send_failed:
+ /* real cleanup will be done from tl_clear. just update flags
+ * so it is no longer marked as on the worker queue */
+ req->rq_state &= ~RQ_NET_QUEUED;
+ /* if we did it right, tl_clear should be scheduled only after
+ * this, so this should not be necessary! */
+ _req_may_be_done(req, m);
+ break;
+
+ case handed_over_to_network:
+ /* assert something? */
+ if (bio_data_dir(req->master_bio) == WRITE &&
+ mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+ /* this is what is dangerous about protocol A:
+ * pretend it was successfully written on the peer. */
+ if (req->rq_state & RQ_NET_PENDING) {
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ req->rq_state |= RQ_NET_OK;
+ } /* else: neg-ack was faster... */
+ /* it is still not yet RQ_NET_DONE until the
+ * corresponding epoch barrier got acked as well,
+ * so we know what to dirty on connection loss */
+ }
+ req->rq_state &= ~RQ_NET_QUEUED;
+ req->rq_state |= RQ_NET_SENT;
+ /* because _drbd_send_zc_bio could sleep, and may want to
+ * dereference the bio even after the "write_acked_by_peer" and
+ * "completed_ok" events came in, once we return from
+ * _drbd_send_zc_bio (drbd_send_dblock), we have to check
+ * whether it is done already, and end it. */
+ _req_may_be_done(req, m);
+ break;
+
+ case connection_lost_while_pending:
+ /* transfer log cleanup after connection loss */
+ /* assert something? */
+ if (req->rq_state & RQ_NET_PENDING)
+ dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_DONE;
+ /* if it is still queued, we may not complete it here.
+ * it will be canceled soon. */
+ if (!(req->rq_state & RQ_NET_QUEUED))
+ _req_may_be_done(req, m);
+ break;
+
+ case write_acked_by_peer_and_sis:
+ req->rq_state |= RQ_NET_SIS;
+ case conflict_discarded_by_peer:
+ /* for discarded conflicting writes of multiple primaries,
+ * there is no need to keep anything in the tl, potential
+ * node crashes are covered by the activity log. */
+ if (what == conflict_discarded_by_peer)
+ dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
+ " DRBD is not a random data generator!\n",
+ (unsigned long long)req->sector, req->size);
+ req->rq_state |= RQ_NET_DONE;
+ /* fall through */
+ case write_acked_by_peer:
+ /* protocol C; successfully written on peer.
+ * Nothing to do here.
+ * We want to keep the tl in place for all protocols, to cater
+ * for volatile write-back caches on lower level devices.
+ *
+ * A barrier request is expected to have forced all prior
+ * requests onto stable storage, so completion of a barrier
+ * request could set NET_DONE right here, and not wait for the
+ * P_BARRIER_ACK, but that is an unnecessary optimization. */
+
+ /* this makes it effectively the same as for: */
+ case recv_acked_by_peer:
+ /* protocol B; pretends to be successfully written on peer.
+ * see also notes above in handed_over_to_network about
+ * protocol != C */
+ req->rq_state |= RQ_NET_OK;
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ _req_may_be_done(req, m);
+ break;
+
+ case neg_acked:
+ /* assert something? */
+ if (req->rq_state & RQ_NET_PENDING)
+ dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done(req, m);
+ /* else: done by handed_over_to_network */
+ break;
+
+ case barrier_acked:
+ if (req->rq_state & RQ_NET_PENDING) {
+ /* barrier came in before all requests have been acked.
+ * this is bad, because if the connection is lost now,
+ * we won't be able to clean them up... */
+ dev_err(DEV, "FIXME (barrier_acked but pending)\n");
+ list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+ }
+ D_ASSERT(req->rq_state & RQ_NET_SENT);
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done(req, m);
+ break;
+
+ case data_received:
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+ _req_may_be_done(req, m);
+ break;
+ };
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ * BUT we are still/already IN SYNC for this area.
+ * since size may be bigger than BM_BLOCK_SIZE,
+ * we may need to check several bits.
+ */
+static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+{
+ unsigned long sbnr, ebnr;
+ sector_t esector, nr_sectors;
+
+ if (mdev->state.disk == D_UP_TO_DATE)
+ return 1;
+ if (mdev->state.disk >= D_OUTDATED)
+ return 0;
+ if (mdev->state.disk < D_INCONSISTENT)
+ return 0;
+ /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size >> 9) - 1;
+
+ D_ASSERT(sector < nr_sectors);
+ D_ASSERT(esector < nr_sectors);
+
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+
+ return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+}
+
+static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
+{
+ const int rw = bio_rw(bio);
+ const int size = bio->bi_size;
+ const sector_t sector = bio->bi_sector;
+ struct drbd_tl_epoch *b = NULL;
+ struct drbd_request *req;
+ int local, remote;
+ int err = -EIO;
+
+ /* allocate outside of all locks; */
+ req = drbd_req_new(mdev, bio);
+ if (!req) {
+ dec_ap_bio(mdev);
+ /* only pass the error to the upper layers.
+ * if user cannot handle io errors, that's not our business. */
+ dev_err(DEV, "could not kmalloc() req\n");
+ bio_endio(bio, -ENOMEM);
+ return 0;
+ }
+
+ local = get_ldev(mdev);
+ if (!local) {
+ bio_put(req->private_bio); /* or we get a bio leak */
+ req->private_bio = NULL;
+ }
+ if (rw == WRITE) {
+ remote = 1;
+ } else {
+ /* READ || READA */
+ if (local) {
+ if (!drbd_may_do_local_read(mdev, sector, size)) {
+ /* we could kick the syncer to
+ * sync this extent asap, wait for
+ * it, then continue locally.
+ * Or just issue the request remotely.
+ */
+ local = 0;
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(mdev);
+ }
+ }
+ remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
+ }
+
+ /* If we have a disk, but a READA request is mapped to remote,
+ * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
+ * Just fail that READA request right here.
+ *
+ * THINK: maybe fail all READA when not local?
+ * or make this configurable...
+ * if network is slow, READA won't do any good.
+ */
+ if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
+ err = -EWOULDBLOCK;
+ goto fail_and_free_req;
+ }
+
+ /* For WRITES going to the local disk, grab a reference on the target
+ * extent. This waits for any resync activity in the corresponding
+ * resync extent to finish, and, if necessary, pulls in the target
+ * extent into the activity log, which involves further disk io because
+ * of transactional on-disk meta data updates. */
+ if (rw == WRITE && local)
+ drbd_al_begin_io(mdev, sector);
+
+ remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
+ (mdev->state.pdsk == D_INCONSISTENT &&
+ mdev->state.conn >= C_CONNECTED));
+
+ if (!(local || remote)) {
+ dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+ goto fail_free_complete;
+ }
+
+ /* For WRITE request, we have to make sure that we have an
+ * unused_spare_tle, in case we need to start a new epoch.
+ * I try to be smart and avoid to pre-allocate always "just in case",
+ * but there is a race between testing the bit and pointer outside the
+ * spinlock, and grabbing the spinlock.
+ * if we lost that race, we retry. */
+ if (rw == WRITE && remote &&
+ mdev->unused_spare_tle == NULL &&
+ test_bit(CREATE_BARRIER, &mdev->flags)) {
+allocate_barrier:
+ b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
+ if (!b) {
+ dev_err(DEV, "Failed to alloc barrier.\n");
+ err = -ENOMEM;
+ goto fail_free_complete;
+ }
+ }
+
+ /* GOOD, everything prepared, grab the spin_lock */
+ spin_lock_irq(&mdev->req_lock);
+
+ if (remote) {
+ remote = (mdev->state.pdsk == D_UP_TO_DATE ||
+ (mdev->state.pdsk == D_INCONSISTENT &&
+ mdev->state.conn >= C_CONNECTED));
+ if (!remote)
+ dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
+ if (!(local || remote)) {
+ dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+ spin_unlock_irq(&mdev->req_lock);
+ goto fail_free_complete;
+ }
+ }
+
+ if (b && mdev->unused_spare_tle == NULL) {
+ mdev->unused_spare_tle = b;
+ b = NULL;
+ }
+ if (rw == WRITE && remote &&
+ mdev->unused_spare_tle == NULL &&
+ test_bit(CREATE_BARRIER, &mdev->flags)) {
+ /* someone closed the current epoch
+ * while we were grabbing the spinlock */
+ spin_unlock_irq(&mdev->req_lock);
+ goto allocate_barrier;
+ }
+
+
+ /* Update disk stats */
+ _drbd_start_io_acct(mdev, req, bio);
+
+ /* _maybe_start_new_epoch(mdev);
+ * If we need to generate a write barrier packet, we have to add the
+ * new epoch (barrier) object, and queue the barrier packet for sending,
+ * and queue the req's data after it _within the same lock_, otherwise
+ * we have race conditions were the reorder domains could be mixed up.
+ *
+ * Even read requests may start a new epoch and queue the corresponding
+ * barrier packet. To get the write ordering right, we only have to
+ * make sure that, if this is a write request and it triggered a
+ * barrier packet, this request is queued within the same spinlock. */
+ if (remote && mdev->unused_spare_tle &&
+ test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+ _tl_add_barrier(mdev, mdev->unused_spare_tle);
+ mdev->unused_spare_tle = NULL;
+ } else {
+ D_ASSERT(!(remote && rw == WRITE &&
+ test_bit(CREATE_BARRIER, &mdev->flags)));
+ }
+
+ /* NOTE
+ * Actually, 'local' may be wrong here already, since we may have failed
+ * to write to the meta data, and may become wrong anytime because of
+ * local io-error for some other request, which would lead to us
+ * "detaching" the local disk.
+ *
+ * 'remote' may become wrong any time because the network could fail.
+ *
+ * This is a harmless race condition, though, since it is handled
+ * correctly at the appropriate places; so it just defers the failure
+ * of the respective operation.
+ */
+
+ /* mark them early for readability.
+ * this just sets some state flags. */
+ if (remote)
+ _req_mod(req, to_be_send);
+ if (local)
+ _req_mod(req, to_be_submitted);
+
+ /* check this request on the collision detection hash tables.
+ * if we have a conflict, just complete it here.
+ * THINK do we want to check reads, too? (I don't think so...) */
+ if (rw == WRITE && _req_conflicts(req)) {
+ /* this is a conflicting request.
+ * even though it may have been only _partially_
+ * overlapping with one of the currently pending requests,
+ * without even submitting or sending it, we will
+ * pretend that it was successfully served right now.
+ */
+ if (local) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ drbd_al_complete_io(mdev, req->sector);
+ put_ldev(mdev);
+ local = 0;
+ }
+ if (remote)
+ dec_ap_pending(mdev);
+ _drbd_end_io_acct(mdev, req);
+ /* THINK: do we want to fail it (-EIO), or pretend success? */
+ bio_endio(req->master_bio, 0);
+ req->master_bio = NULL;
+ dec_ap_bio(mdev);
+ drbd_req_free(req);
+ remote = 0;
+ }
+
+ /* NOTE remote first: to get the concurrent write detection right,
+ * we must register the request before start of local IO. */
+ if (remote) {
+ /* either WRITE and C_CONNECTED,
+ * or READ, and no local disk,
+ * or READ, but not in sync.
+ */
+ _req_mod(req, (rw == WRITE)
+ ? queue_for_net_write
+ : queue_for_net_read);
+ }
+ spin_unlock_irq(&mdev->req_lock);
+ kfree(b); /* if someone else has beaten us to it... */
+
+ if (local) {
+ req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+
+ if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+ : rw == READ ? DRBD_FAULT_DT_RD
+ : DRBD_FAULT_DT_RA))
+ bio_endio(req->private_bio, -EIO);
+ else
+ generic_make_request(req->private_bio);
+ }
+
+ /* we need to plug ALWAYS since we possibly need to kick lo_dev.
+ * we plug after submit, so we won't miss an unplug event */
+ drbd_plug_device(mdev);
+
+ return 0;
+
+fail_free_complete:
+ if (rw == WRITE && local)
+ drbd_al_complete_io(mdev, sector);
+fail_and_free_req:
+ if (local) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(mdev);
+ }
+ bio_endio(bio, err);
+ drbd_req_free(req);
+ dec_ap_bio(mdev);
+ kfree(b);
+
+ return 0;
+}
+
+/* helper function for drbd_make_request
+ * if we can determine just by the mdev (state) that this request will fail,
+ * return 1
+ * otherwise return 0
+ */
+static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
+{
+ /* Unconfigured */
+ if (mdev->state.conn == C_DISCONNECTING &&
+ mdev->state.disk == D_DISKLESS)
+ return 1;
+
+ if (mdev->state.role != R_PRIMARY &&
+ (!allow_oos || is_write)) {
+ if (__ratelimit(&drbd_ratelimit_state)) {
+ dev_err(DEV, "Process %s[%u] tried to %s; "
+ "since we are not in Primary state, "
+ "we cannot allow this\n",
+ current->comm, current->pid,
+ is_write ? "WRITE" : "READ");
+ }
+ return 1;
+ }
+
+ /*
+ * Paranoia: we might have been primary, but sync target, or
+ * even diskless, then lost the connection.
+ * This should have been handled (panic? suspend?) somewhere
+ * else. But maybe it was not, so check again here.
+ * Caution: as long as we do not have a read/write lock on mdev,
+ * to serialize state changes, this is racy, since we may lose
+ * the connection *after* we test for the cstate.
+ */
+ if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+int drbd_make_request_26(struct request_queue *q, struct bio *bio)
+{
+ unsigned int s_enr, e_enr;
+ struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+
+ if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
+ bio_endio(bio, -EPERM);
+ return 0;
+ }
+
+ /* Reject barrier requests if we know the underlying device does
+ * not support them.
+ * XXX: Need to get this info from peer as well some how so we
+ * XXX: reject if EITHER side/data/metadata area does not support them.
+ *
+ * because of those XXX, this is not yet enabled,
+ * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
+ */
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
+ /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
+ bio_endio(bio, -EOPNOTSUPP);
+ return 0;
+ }
+
+ /*
+ * what we "blindly" assume:
+ */
+ D_ASSERT(bio->bi_size > 0);
+ D_ASSERT((bio->bi_size & 0x1ff) == 0);
+ D_ASSERT(bio->bi_idx == 0);
+
+ /* to make some things easier, force alignment of requests within the
+ * granularity of our hash tables */
+ s_enr = bio->bi_sector >> HT_SHIFT;
+ e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+
+ if (likely(s_enr == e_enr)) {
+ inc_ap_bio(mdev, 1);
+ return drbd_make_request_common(mdev, bio);
+ }
+
+ /* can this bio be split generically?
+ * Maybe add our own split-arbitrary-bios function. */
+ if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
+ /* rather error out here than BUG in bio_split */
+ dev_err(DEV, "bio would need to, but cannot, be split: "
+ "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
+ bio->bi_vcnt, bio->bi_idx, bio->bi_size,
+ (unsigned long long)bio->bi_sector);
+ bio_endio(bio, -EINVAL);
+ } else {
+ /* This bio crosses some boundary, so we have to split it. */
+ struct bio_pair *bp;
+ /* works for the "do not cross hash slot boundaries" case
+ * e.g. sector 262269, size 4096
+ * s_enr = 262269 >> 6 = 4097
+ * e_enr = (262269+8-1) >> 6 = 4098
+ * HT_SHIFT = 6
+ * sps = 64, mask = 63
+ * first_sectors = 64 - (262269 & 63) = 3
+ */
+ const sector_t sect = bio->bi_sector;
+ const int sps = 1 << HT_SHIFT; /* sectors per slot */
+ const int mask = sps - 1;
+ const sector_t first_sectors = sps - (sect & mask);
+ bp = bio_split(bio,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+ bio_split_pool,
+#endif
+ first_sectors);
+
+ /* we need to get a "reference count" (ap_bio_cnt)
+ * to avoid races with the disconnect/reconnect/suspend code.
+ * In case we need to split the bio here, we need to get two references
+ * atomically, otherwise we might deadlock when trying to submit the
+ * second one! */
+ inc_ap_bio(mdev, 2);
+
+ D_ASSERT(e_enr == s_enr + 1);
+
+ drbd_make_request_common(mdev, &bp->bio1);
+ drbd_make_request_common(mdev, &bp->bio2);
+ bio_pair_release(bp);
+ }
+ return 0;
+}
+
+/* This is called by bio_add_page(). With this function we reduce
+ * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
+ * units (was AL_EXTENTs).
+ *
+ * we do the calculation within the lower 32bit of the byte offsets,
+ * since we don't care for actual offset, but only check whether it
+ * would cross "activity log extent" boundaries.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset. so the resulting bio may still
+ * cross extent boundaries. those are dealt with (bio_split) in
+ * drbd_make_request_26.
+ */
+int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+ unsigned int bio_offset =
+ (unsigned int)bvm->bi_sector << 9; /* 32 bit */
+ unsigned int bio_size = bvm->bi_size;
+ int limit, backing_limit;
+
+ limit = DRBD_MAX_SEGMENT_SIZE
+ - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
+ if (limit < 0)
+ limit = 0;
+ if (bio_size == 0) {
+ if (limit <= bvec->bv_len)
+ limit = bvec->bv_len;
+ } else if (limit && get_ldev(mdev)) {
+ struct request_queue * const b =
+ mdev->ldev->backing_bdev->bd_disk->queue;
+ if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
+ backing_limit = b->merge_bvec_fn(b, bvm, bvec);
+ limit = min(limit, backing_limit);
+ }
+ put_ldev(mdev);
+ }
+ return limit;
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000000..f22c1bc8ec7e
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,326 @@
+/*
+ drbd_req.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+ Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+ DRBD is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ DRBD is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+ and in Softirqs/Tasklets/BH context by the SCSI drivers,
+ and by the receiver and worker in kernel-thread context.
+ Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ * It will be created.
+ * It will be marked with the intention to be
+ * submitted to local disk and/or
+ * send via the network.
+ *
+ * It has to be placed on the transfer log and other housekeeping lists,
+ * In case we have a network connection.
+ *
+ * It may be identified as a concurrent (write) request
+ * and be handled accordingly.
+ *
+ * It may me handed over to the local disk subsystem.
+ * It may be completed by the local disk subsystem,
+ * either sucessfully or with io-error.
+ * In case it is a READ request, and it failed locally,
+ * it may be retried remotely.
+ *
+ * It may be queued for sending.
+ * It may be handed over to the network stack,
+ * which may fail.
+ * It may be acknowledged by the "peer" according to the wire_protocol in use.
+ * this may be a negative ack.
+ * It may receive a faked ack when the network connection is lost and the
+ * transfer log is cleaned up.
+ * Sending may be canceled due to network connection loss.
+ * When it finally has outlived its time,
+ * corresponding dirty bits in the resync-bitmap may be cleared or set,
+ * it will be destroyed,
+ * and completion will be signalled to the originator,
+ * with or without "success".
+ */
+
+enum drbd_req_event {
+ created,
+ to_be_send,
+ to_be_submitted,
+
+ /* XXX yes, now I am inconsistent...
+ * these two are not "events" but "actions"
+ * oh, well... */
+ queue_for_net_write,
+ queue_for_net_read,
+
+ send_canceled,
+ send_failed,
+ handed_over_to_network,
+ connection_lost_while_pending,
+ recv_acked_by_peer,
+ write_acked_by_peer,
+ write_acked_by_peer_and_sis, /* and set_in_sync */
+ conflict_discarded_by_peer,
+ neg_acked,
+ barrier_acked, /* in protocol A and B */
+ data_received, /* (remote read) */
+
+ read_completed_with_error,
+ read_ahead_completed_with_error,
+ write_completed_with_error,
+ completed_ok,
+ nothing, /* for tracing only */
+};
+
+/* encoding of request states for now. we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+ /* 210
+ * 000: no local possible
+ * 001: to be submitted
+ * UNUSED, we could map: 011: submitted, completion still pending
+ * 110: completed ok
+ * 010: completed with error
+ */
+ __RQ_LOCAL_PENDING,
+ __RQ_LOCAL_COMPLETED,
+ __RQ_LOCAL_OK,
+
+ /* 76543
+ * 00000: no network possible
+ * 00001: to be send
+ * 00011: to be send, on worker queue
+ * 00101: sent, expecting recv_ack (B) or write_ack (C)
+ * 11101: sent,
+ * recv_ack (B) or implicit "ack" (A),
+ * still waiting for the barrier ack.
+ * master_bio may already be completed and invalidated.
+ * 11100: write_acked (C),
+ * data_received (for remote read, any protocol)
+ * or finally the barrier ack has arrived (B,A)...
+ * request can be freed
+ * 01100: neg-acked (write, protocol C)
+ * or neg-d-acked (read, any protocol)
+ * or killed from the transfer log
+ * during cleanup after connection loss
+ * request can be freed
+ * 01000: canceled or send failed...
+ * request can be freed
+ */
+
+ /* if "SENT" is not set, yet, this can still fail or be canceled.
+ * if "SENT" is set already, we still wait for an Ack packet.
+ * when cleared, the master_bio may be completed.
+ * in (B,A) the request object may still linger on the transaction log
+ * until the corresponding barrier ack comes in */
+ __RQ_NET_PENDING,
+
+ /* If it is QUEUED, and it is a WRITE, it is also registered in the
+ * transfer log. Currently we need this flag to avoid conflicts between
+ * worker canceling the request and tl_clear_barrier killing it from
+ * transfer log. We should restructure the code so this conflict does
+ * no longer occur. */
+ __RQ_NET_QUEUED,
+
+ /* well, actually only "handed over to the network stack".
+ *
+ * TODO can potentially be dropped because of the similar meaning
+ * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+ * however it is not exactly the same. before we drop it
+ * we must ensure that we can tell a request with network part
+ * from a request without, regardless of what happens to it. */
+ __RQ_NET_SENT,
+
+ /* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+ * basically this means the corresponding P_BARRIER_ACK was received */
+ __RQ_NET_DONE,
+
+ /* whether or not we know (C) or pretend (B,A) that the write
+ * was successfully written on the peer.
+ */
+ __RQ_NET_OK,
+
+ /* peer called drbd_set_in_sync() for this write */
+ __RQ_NET_SIS,
+
+ /* keep this last, its for the RQ_NET_MASK */
+ __RQ_NET_MAX,
+};
+
+#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
+
+#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+
+#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
+
+/* 0x1f8 */
+#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+/* epoch entries */
+static inline
+struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+ BUG_ON(mdev->ee_hash_s == 0);
+ return mdev->ee_hash +
+ ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
+}
+
+/* transfer log (drbd_request objects) */
+static inline
+struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+ BUG_ON(mdev->tl_hash_s == 0);
+ return mdev->tl_hash +
+ ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
+}
+
+/* application reads (drbd_request objects) */
+static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+ return mdev->app_reads_hash
+ + ((unsigned int)(sector) % APP_R_HSIZE);
+}
+
+/* when we receive the answer for a read request,
+ * verify that we actually know about it */
+static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
+ u64 id, sector_t sector)
+{
+ struct hlist_head *slot = ar_hash_slot(mdev, sector);
+ struct hlist_node *n;
+ struct drbd_request *req;
+
+ hlist_for_each_entry(req, n, slot, colision) {
+ if ((unsigned long)req == (unsigned long)id) {
+ D_ASSERT(req->sector == sector);
+ return req;
+ }
+ }
+ return NULL;
+}
+
+static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
+ struct bio *bio_src)
+{
+ struct bio *bio;
+ struct drbd_request *req =
+ mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ if (likely(req)) {
+ bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+ req->rq_state = 0;
+ req->mdev = mdev;
+ req->master_bio = bio_src;
+ req->private_bio = bio;
+ req->epoch = 0;
+ req->sector = bio->bi_sector;
+ req->size = bio->bi_size;
+ req->start_time = jiffies;
+ INIT_HLIST_NODE(&req->colision);
+ INIT_LIST_HEAD(&req->tl_requests);
+ INIT_LIST_HEAD(&req->w.list);
+
+ bio->bi_private = req;
+ bio->bi_end_io = drbd_endio_pri;
+ bio->bi_next = NULL;
+ }
+ return req;
+}
+
+static inline void drbd_req_free(struct drbd_request *req)
+{
+ mempool_free(req, drbd_request_mempool);
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+ return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+ struct bio *bio;
+ int error;
+};
+
+extern void _req_may_be_done(struct drbd_request *req,
+ struct bio_and_error *m);
+extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_conf *mdev,
+ struct bio_and_error *m);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+ struct drbd_conf *mdev = req->mdev;
+ struct bio_and_error m;
+
+ /* __req_mod possibly frees req, do not touch req after that! */
+ __req_mod(req, what, &m);
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+}
+
+/* completion of master bio is outside of spinlock.
+ * If you need it irqsave, do it your self! */
+static inline void req_mod(struct drbd_request *req,
+ enum drbd_req_event what)
+{
+ struct drbd_conf *mdev = req->mdev;
+ struct bio_and_error m;
+ spin_lock_irq(&mdev->req_lock);
+ __req_mod(req, what, &m);
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+}
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000000..76863e3f05be
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
+/*
+ drbd.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+
+static const char *drbd_conn_s_names[] = {
+ [C_STANDALONE] = "StandAlone",
+ [C_DISCONNECTING] = "Disconnecting",
+ [C_UNCONNECTED] = "Unconnected",
+ [C_TIMEOUT] = "Timeout",
+ [C_BROKEN_PIPE] = "BrokenPipe",
+ [C_NETWORK_FAILURE] = "NetworkFailure",
+ [C_PROTOCOL_ERROR] = "ProtocolError",
+ [C_WF_CONNECTION] = "WFConnection",
+ [C_WF_REPORT_PARAMS] = "WFReportParams",
+ [C_TEAR_DOWN] = "TearDown",
+ [C_CONNECTED] = "Connected",
+ [C_STARTING_SYNC_S] = "StartingSyncS",
+ [C_STARTING_SYNC_T] = "StartingSyncT",
+ [C_WF_BITMAP_S] = "WFBitMapS",
+ [C_WF_BITMAP_T] = "WFBitMapT",
+ [C_WF_SYNC_UUID] = "WFSyncUUID",
+ [C_SYNC_SOURCE] = "SyncSource",
+ [C_SYNC_TARGET] = "SyncTarget",
+ [C_PAUSED_SYNC_S] = "PausedSyncS",
+ [C_PAUSED_SYNC_T] = "PausedSyncT",
+ [C_VERIFY_S] = "VerifyS",
+ [C_VERIFY_T] = "VerifyT",
+};
+
+static const char *drbd_role_s_names[] = {
+ [R_PRIMARY] = "Primary",
+ [R_SECONDARY] = "Secondary",
+ [R_UNKNOWN] = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+ [D_DISKLESS] = "Diskless",
+ [D_ATTACHING] = "Attaching",
+ [D_FAILED] = "Failed",
+ [D_NEGOTIATING] = "Negotiating",
+ [D_INCONSISTENT] = "Inconsistent",
+ [D_OUTDATED] = "Outdated",
+ [D_UNKNOWN] = "DUnknown",
+ [D_CONSISTENT] = "Consistent",
+ [D_UP_TO_DATE] = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+ [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+ [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
+ [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+ [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+ [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+ [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+ [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+ [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+ [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+ [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+ [-SS_DEVICE_IN_USE] = "Device is held open by someone",
+ [-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+ [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+ [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+ [-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+ [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+ [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+ [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+ /* enums are unsigned... */
+ return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+ return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+ return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
+{
+ return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+ err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+ : drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000000..fc824006e721
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+ drbd_receiver.c
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and stroage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transfered bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ * * simple byte-based
+ * * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgress,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total). The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix data bits max val NÂș data bits
+0 x 0x2 1
+10 x 0x4 1
+110 xx 0x8 2
+1110 xxx 0x10 3
+11110 xxx xx 0x30 5
+111110 xx xxxxxx 0x130 8
+11111100 xxxxxxxx xxxxx 0x2130 13
+11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
+11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
+11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted x 0.29
+ as plaintext x ........................
+ x ........................
+ x ........................
+ x 0.59 0.21........................
+ x ........................................................
+ x .. c ...................................................
+ x 0.44.. o ...................................................
+ x .......... d ...................................................
+ x .......... e ...................................................
+ X............. ...................................................
+ x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ........................... plain bits ..........
+-+-----------------------------------------------------------------------
+ 1 16 32 64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+ LEVEL( 2, 1, 0x00); \
+ LEVEL( 3, 2, 0x01); \
+ LEVEL( 5, 3, 0x03); \
+ LEVEL( 7, 4, 0x07); \
+ LEVEL(10, 5, 0x0f); \
+ LEVEL(14, 6, 0x1f); \
+ LEVEL(21, 8, 0x3f); \
+ LEVEL(29, 8, 0x7f); \
+ LEVEL(42, 8, 0xbf); \
+ LEVEL(64, 8, 0xff); \
+ } while (0)
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+ u64 adj = 1;
+
+#define LEVEL(t,b,v) \
+ do { \
+ if ((in & ((1 << b) -1)) == v) { \
+ *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
+ return t; \
+ } \
+ adj += 1ULL << (t - b); \
+ } while (0)
+
+ VLI_L_1_1();
+
+ /* NOT REACHED, if VLI_LEVELS code table is defined properly */
+ BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+ u64 max = 0;
+ u64 adj = 1;
+
+ if (in == 0)
+ return -EINVAL;
+
+#define LEVEL(t,b,v) do { \
+ max += 1ULL << (t - b); \
+ if (in <= max) { \
+ if (out) \
+ *out = ((in - adj) << b) | v; \
+ return t; \
+ } \
+ adj = max + 1; \
+ } while (0)
+
+ VLI_L_1_1();
+
+ return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independend of actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+ /* the current byte */
+ u8 *b;
+ /* the current bit within *b, nomalized: 0..7 */
+ unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+ cur->b = s;
+ cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+ bits += cur->bit;
+ cur->b = cur->b + (bits >> 3);
+ cur->bit = bits & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+ struct bitstream_cursor cur;
+ unsigned char *buf;
+ size_t buf_len; /* in bytes */
+
+ /* for input stream:
+ * number of trailing 0 bits for padding
+ * total number of valid bits in stream: buf_len * 8 - pad_bits */
+ unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+ bs->buf = s;
+ bs->buf_len = len;
+ bs->pad_bits = pad_bits;
+ bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+ bitstream_cursor_reset(&bs->cur, bs->buf);
+ memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+ unsigned char *b = bs->cur.b;
+ unsigned int tmp;
+
+ if (bits == 0)
+ return 0;
+
+ if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+ return -ENOBUFS;
+
+ /* paranoia: strip off hi bits; they should not be set anyways. */
+ if (bits < 64)
+ val &= ~0ULL >> (64 - bits);
+
+ *b++ |= (val & 0xff) << bs->cur.bit;
+
+ for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+ *b++ |= (val >> tmp) & 0xff;
+
+ bitstream_cursor_advance(&bs->cur, bits);
+ return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
+ *
+ * If there are less than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+ u64 val;
+ unsigned int n;
+
+ if (bits > 64)
+ return -EINVAL;
+
+ if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+ bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+ - bs->cur.bit - bs->pad_bits;
+
+ if (bits == 0) {
+ *out = 0;
+ return 0;
+ }
+
+ /* get the high bits */
+ val = 0;
+ n = (bs->cur.bit + bits + 7) >> 3;
+ /* n may be at most 9, if cur.bit + bits > 64 */
+ /* which means this copies at most 8 byte */
+ if (n) {
+ memcpy(&val, bs->cur.b+1, n - 1);
+ val = le64_to_cpu(val) << (8 - bs->cur.bit);
+ }
+
+ /* we still need the low bits */
+ val |= bs->cur.b[0] >> bs->cur.bit;
+
+ /* and mask out bits we don't want */
+ val &= ~0ULL >> (64 - bits);
+
+ bitstream_cursor_advance(&bs->cur, bits);
+ *out = val;
+
+ return bits;
+}
+
+/* encodes @in as vli into @bs;
+
+ * return values
+ * > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+ u64 code = code;
+ int bits = __vli_encode_bits(&code, in);
+
+ if (bits <= 0)
+ return bits;
+
+ return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000000..ed8796f1112d
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1512 @@
+/*
+ drbd_worker.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+#define SLEEP_TIME (HZ/10)
+
+static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
+
+
+
+/* defined here:
+ drbd_md_io_complete
+ drbd_endio_write_sec
+ drbd_endio_read_sec
+ drbd_endio_pri
+
+ * more endio handlers:
+ atodb_endio in drbd_actlog.c
+ drbd_bm_async_io_complete in drbd_bitmap.c
+
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+ Each state transition on an device holds a read lock. In case we have
+ to evaluate the sync after dependencies, we grab a write lock, because
+ we need stable states on all devices for that. */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_io_complete(struct bio *bio, int error)
+{
+ struct drbd_md_io *md_io;
+
+ md_io = (struct drbd_md_io *)bio->bi_private;
+ md_io->error = error;
+
+ complete(&md_io->event);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
+{
+ unsigned long flags = 0;
+ struct drbd_epoch_entry *e = NULL;
+ struct drbd_conf *mdev;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+ e = bio->bi_private;
+ mdev = e->mdev;
+
+ if (error)
+ dev_warn(DEV, "read: error=%d s=%llus\n", error,
+ (unsigned long long)e->sector);
+ if (!error && !uptodate) {
+ dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
+ (unsigned long long)e->sector);
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+ D_ASSERT(e->block_id != ID_VACANT);
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ mdev->read_cnt += e->size >> 9;
+ list_del(&e->w.list);
+ if (list_empty(&mdev->read_ee))
+ wake_up(&mdev->ee_wait);
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ drbd_chk_io_error(mdev, error, FALSE);
+ drbd_queue_work(&mdev->data.work, &e->w);
+ put_ldev(mdev);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
+{
+ unsigned long flags = 0;
+ struct drbd_epoch_entry *e = NULL;
+ struct drbd_conf *mdev;
+ sector_t e_sector;
+ int do_wake;
+ int is_syncer_req;
+ int do_al_complete_io;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+ int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+
+ e = bio->bi_private;
+ mdev = e->mdev;
+
+ if (error)
+ dev_warn(DEV, "write: error=%d s=%llus\n", error,
+ (unsigned long long)e->sector);
+ if (!error && !uptodate) {
+ dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
+ (unsigned long long)e->sector);
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+ /* error == -ENOTSUPP would be a better test,
+ * alas it is not reliable */
+ if (error && is_barrier && e->flags & EE_IS_BARRIER) {
+ drbd_bump_write_ordering(mdev, WO_bdev_flush);
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ list_del(&e->w.list);
+ e->w.cb = w_e_reissue;
+ /* put_ldev actually happens below, once we come here again. */
+ __release(local);
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+ drbd_queue_work(&mdev->data.work, &e->w);
+ return;
+ }
+
+ D_ASSERT(e->block_id != ID_VACANT);
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ mdev->writ_cnt += e->size >> 9;
+ is_syncer_req = is_syncer_block_id(e->block_id);
+
+ /* after we moved e to done_ee,
+ * we may no longer access it,
+ * it may be freed/reused already!
+ * (as soon as we release the req_lock) */
+ e_sector = e->sector;
+ do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+
+ list_del(&e->w.list); /* has been on active_ee or sync_ee */
+ list_add_tail(&e->w.list, &mdev->done_ee);
+
+ /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
+ * neither did we wake possibly waiting conflicting requests.
+ * done from "drbd_process_done_ee" within the appropriate w.cb
+ * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+
+ do_wake = is_syncer_req
+ ? list_empty(&mdev->sync_ee)
+ : list_empty(&mdev->active_ee);
+
+ if (error)
+ __drbd_chk_io_error(mdev, FALSE);
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ if (is_syncer_req)
+ drbd_rs_complete_io(mdev, e_sector);
+
+ if (do_wake)
+ wake_up(&mdev->ee_wait);
+
+ if (do_al_complete_io)
+ drbd_al_complete_io(mdev, e_sector);
+
+ wake_asender(mdev);
+ put_ldev(mdev);
+
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_endio_pri(struct bio *bio, int error)
+{
+ unsigned long flags;
+ struct drbd_request *req = bio->bi_private;
+ struct drbd_conf *mdev = req->mdev;
+ struct bio_and_error m;
+ enum drbd_req_event what;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+ if (error)
+ dev_warn(DEV, "p %s: error=%d\n",
+ bio_data_dir(bio) == WRITE ? "write" : "read", error);
+ if (!error && !uptodate) {
+ dev_warn(DEV, "p %s: setting error to -EIO\n",
+ bio_data_dir(bio) == WRITE ? "write" : "read");
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+ /* to avoid recursion in __req_mod */
+ if (unlikely(error)) {
+ what = (bio_data_dir(bio) == WRITE)
+ ? write_completed_with_error
+ : (bio_rw(bio) == READA)
+ ? read_completed_with_error
+ : read_ahead_completed_with_error;
+ } else
+ what = completed_ok;
+
+ bio_put(req->private_bio);
+ req->private_bio = ERR_PTR(error);
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ __req_mod(req, what, &m);
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+}
+
+int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+ /* NOTE: mdev->ldev can be NULL by the time we get here! */
+ /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
+
+ /* the only way this callback is scheduled is from _req_may_be_done,
+ * when it is done and had a local write error, see comments there */
+ drbd_req_free(req);
+
+ return TRUE;
+}
+
+int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+ /* We should not detach for read io-error,
+ * but try to WRITE the P_DATA_REPLY to the failed location,
+ * to give the disk the chance to relocate that block */
+
+ spin_lock_irq(&mdev->req_lock);
+ if (cancel ||
+ mdev->state.conn < C_CONNECTED ||
+ mdev->state.pdsk <= D_INCONSISTENT) {
+ _req_mod(req, send_canceled);
+ spin_unlock_irq(&mdev->req_lock);
+ dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
+ return 1;
+ }
+ spin_unlock_irq(&mdev->req_lock);
+
+ return w_send_read_req(mdev, w, 0);
+}
+
+int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ ERR_IF(cancel) return 1;
+ dev_err(DEV, "resync inactive, but callback triggered??\n");
+ return 1; /* Simply ignore this! */
+}
+
+void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct bio_vec *bvec;
+ int i;
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(&sg, 1);
+ crypto_hash_init(&desc);
+
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
+ crypto_hash_update(&desc, &sg, sg.length);
+ }
+ crypto_hash_final(&desc, digest);
+}
+
+static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ int digest_size;
+ void *digest;
+ int ok;
+
+ D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+
+ if (unlikely(cancel)) {
+ drbd_free_ee(mdev, e);
+ return 1;
+ }
+
+ if (likely(drbd_bio_uptodate(e->private_bio))) {
+ digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (digest) {
+ drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+
+ inc_rs_pending(mdev);
+ ok = drbd_send_drequest_csum(mdev,
+ e->sector,
+ e->size,
+ digest,
+ digest_size,
+ P_CSUM_RS_REQUEST);
+ kfree(digest);
+ } else {
+ dev_err(DEV, "kmalloc() of digest failed.\n");
+ ok = 0;
+ }
+ } else
+ ok = 1;
+
+ drbd_free_ee(mdev, e);
+
+ if (unlikely(!ok))
+ dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
+ return ok;
+}
+
+#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
+{
+ struct drbd_epoch_entry *e;
+
+ if (!get_ldev(mdev))
+ return 0;
+
+ /* GFP_TRY, because if there is no memory available right now, this may
+ * be rescheduled for later. It is "only" background resync, after all. */
+ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
+ if (!e) {
+ put_ldev(mdev);
+ return 2;
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ list_add(&e->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->req_lock);
+
+ e->private_bio->bi_end_io = drbd_endio_read_sec;
+ e->private_bio->bi_rw = READ;
+ e->w.cb = w_e_send_csum;
+
+ mdev->read_cnt += size >> 9;
+ drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
+
+ return 1;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+ unsigned long flags;
+ struct drbd_conf *mdev = (struct drbd_conf *) data;
+ int queue;
+
+ spin_lock_irqsave(&mdev->req_lock, flags);
+
+ if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
+ queue = 1;
+ if (mdev->state.conn == C_VERIFY_S)
+ mdev->resync_work.cb = w_make_ov_request;
+ else
+ mdev->resync_work.cb = w_make_resync_request;
+ } else {
+ queue = 0;
+ mdev->resync_work.cb = w_resync_inactive;
+ }
+
+ spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+ /* harmless race: list_empty outside data.work.q_lock */
+ if (list_empty(&mdev->resync_work.list) && queue)
+ drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+}
+
+int w_make_resync_request(struct drbd_conf *mdev,
+ struct drbd_work *w, int cancel)
+{
+ unsigned long bit;
+ sector_t sector;
+ const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+ int max_segment_size = queue_max_segment_size(mdev->rq_queue);
+ int number, i, size, pe, mx;
+ int align, queued, sndbuf;
+
+ if (unlikely(cancel))
+ return 1;
+
+ if (unlikely(mdev->state.conn < C_CONNECTED)) {
+ dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
+ return 0;
+ }
+
+ if (mdev->state.conn != C_SYNC_TARGET)
+ dev_err(DEV, "%s in w_make_resync_request\n",
+ drbd_conn_str(mdev->state.conn));
+
+ if (!get_ldev(mdev)) {
+ /* Since we only need to access mdev->rsync a
+ get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
+ to continue resync with a broken disk makes no sense at
+ all */
+ dev_err(DEV, "Disk broke down during resync!\n");
+ mdev->resync_work.cb = w_resync_inactive;
+ return 1;
+ }
+
+ number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+ pe = atomic_read(&mdev->rs_pending_cnt);
+
+ mutex_lock(&mdev->data.mutex);
+ if (mdev->data.socket)
+ mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
+ else
+ mx = 1;
+ mutex_unlock(&mdev->data.mutex);
+
+ /* For resync rates >160MB/sec, allow more pending RS requests */
+ if (number > mx)
+ mx = number;
+
+ /* Limit the number of pending RS requests to no more than the peer's receive buffer */
+ if ((pe + number) > mx) {
+ number = mx - pe;
+ }
+
+ for (i = 0; i < number; i++) {
+ /* Stop generating RS requests, when half of the send buffer is filled */
+ mutex_lock(&mdev->data.mutex);
+ if (mdev->data.socket) {
+ queued = mdev->data.socket->sk->sk_wmem_queued;
+ sndbuf = mdev->data.socket->sk->sk_sndbuf;
+ } else {
+ queued = 1;
+ sndbuf = 0;
+ }
+ mutex_unlock(&mdev->data.mutex);
+ if (queued > sndbuf / 2)
+ goto requeue;
+
+next_sector:
+ size = BM_BLOCK_SIZE;
+ bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
+
+ if (bit == -1UL) {
+ mdev->bm_resync_fo = drbd_bm_bits(mdev);
+ mdev->resync_work.cb = w_resync_inactive;
+ put_ldev(mdev);
+ return 1;
+ }
+
+ sector = BM_BIT_TO_SECT(bit);
+
+ if (drbd_try_rs_begin_io(mdev, sector)) {
+ mdev->bm_resync_fo = bit;
+ goto requeue;
+ }
+ mdev->bm_resync_fo = bit + 1;
+
+ if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
+ drbd_rs_complete_io(mdev, sector);
+ goto next_sector;
+ }
+
+#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
+ /* try to find some adjacent bits.
+ * we stop if we have already the maximum req size.
+ *
+ * Additionally always align bigger requests, in order to
+ * be prepared for all stripe sizes of software RAIDs.
+ *
+ * we _do_ care about the agreed-upon q->max_segment_size
+ * here, as splitting up the requests on the other side is more
+ * difficult. the consequence is, that on lvm and md and other
+ * "indirect" devices, this is dead code, since
+ * q->max_segment_size will be PAGE_SIZE.
+ */
+ align = 1;
+ for (;;) {
+ if (size + BM_BLOCK_SIZE > max_segment_size)
+ break;
+
+ /* Be always aligned */
+ if (sector & ((1<<(align+3))-1))
+ break;
+
+ /* do not cross extent boundaries */
+ if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+ break;
+ /* now, is it actually dirty, after all?
+ * caution, drbd_bm_test_bit is tri-state for some
+ * obscure reason; ( b == 0 ) would get the out-of-band
+ * only accidentally right because of the "oddly sized"
+ * adjustment below */
+ if (drbd_bm_test_bit(mdev, bit+1) != 1)
+ break;
+ bit++;
+ size += BM_BLOCK_SIZE;
+ if ((BM_BLOCK_SIZE << align) <= size)
+ align++;
+ i++;
+ }
+ /* if we merged some,
+ * reset the offset to start the next drbd_bm_find_next from */
+ if (size > BM_BLOCK_SIZE)
+ mdev->bm_resync_fo = bit + 1;
+#endif
+
+ /* adjust very last sectors, in case we are oddly sized */
+ if (sector + (size>>9) > capacity)
+ size = (capacity-sector)<<9;
+ if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+ switch (read_for_csum(mdev, sector, size)) {
+ case 0: /* Disk failure*/
+ put_ldev(mdev);
+ return 0;
+ case 2: /* Allocation failed */
+ drbd_rs_complete_io(mdev, sector);
+ mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ goto requeue;
+ /* case 1: everything ok */
+ }
+ } else {
+ inc_rs_pending(mdev);
+ if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
+ sector, size, ID_SYNCER)) {
+ dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
+ dec_rs_pending(mdev);
+ put_ldev(mdev);
+ return 0;
+ }
+ }
+ }
+
+ if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {