From 82f59cc6353889b426cf13b6596d5a3d100fa09e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Sat, 16 Oct 2010 12:13:47 +0200
Subject: drbd: fix potential deadlock on detach

If we have contention in drbd_al_begin_iod (heavy randon IO),
an administrative request to detach the disk may deadlock
for similar reasons as the recently fixed deadlock if detaching
because of IO-error.

The approach taken here is to either go through the intermediate
cleanup state D_FAILED, or first lock out application io,
don't just go directly to D_DISKLESS.

We need an additional state bit (WAS_IO_ERROR) to distinguish
the -> D_FAILED because of IO-error from other failures.

Sanitize D_ATTACHING -> D_FAILED to D_ATTACHING -> D_DISKLESS.
If only attaching, ldev may be missing still, but would be referenced
from within the after_state_ch for -> D_FAILED, potentially
dereferencing a NULL pointer.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/block/drbd/drbd_receiver.c')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6ec922c623a1..04a823b01da5 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3363,7 +3363,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 		if (ns.conn == C_MASK) {
 			ns.conn = C_CONNECTED;
 			if (mdev->state.disk == D_NEGOTIATING) {
-				drbd_force_state(mdev, NS(disk, D_DISKLESS));
+				drbd_force_state(mdev, NS(disk, D_FAILED));
 			} else if (peer_state.disk == D_NEGOTIATING) {
 				dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
 				peer_state.disk = D_DISKLESS;
-- 
cgit v1.2.3-59-g8ed1b


From 6719fb036cea56a5ee9d0ac912ed8c7cabb27f49 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 18 Oct 2010 23:04:07 +0200
Subject: drbd: fix potential data divergence after multiple failures

If we get an IO-error during an activity log transaction,
if we failed to write the bitmap of the evicted extent,
we must not write the transaction itself.
If we failed to write the transaction,
we must not even submit the corresponding bio,
as its extent is not yet marked in the activity log.

Otherwise, if this was a disconneted Primary (degraded cluster), which
now lost its disk as well, and we later re-attach the same backend
storage, we possibly "forget" to resync some parts of the disk that
potentially have been changed.

On the receiving side, when receiving from a peer with unhealthy disk,
checking for pdsk == D_DISKLESS is not enough, we need to set out of
sync and do AL transactions for everything pdsk < D_INCONSISTENT on the
receiving side.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c   | 26 +++++++++++++++++++++-----
 drivers/block/drbd/drbd_receiver.c |  3 ++-
 drivers/block/drbd/drbd_req.c      | 19 ++++++++++++++-----
 3 files changed, 37 insertions(+), 11 deletions(-)

(limited to 'drivers/block/drbd/drbd_receiver.c')

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ac04ef97eac2..bd925180a2b0 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -284,18 +284,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	u32 xor_sum = 0;
 
 	if (!get_ldev(mdev)) {
-		dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+		dev_err(DEV,
+			"disk is %s, cannot start al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
 		complete(&((struct update_al_work *)w)->event);
 		return 1;
 	}
 	/* do we have to do a bitmap write, first?
 	 * TODO reduce maximum latency:
 	 * submit both bios, then wait for both,
-	 * instead of doing two synchronous sector writes. */
+	 * instead of doing two synchronous sector writes.
+	 * For now, we must not write the transaction,
+	 * if we cannot write out the bitmap of the evicted extent. */
 	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
 		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
 
-	mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+	/* The bitmap write may have failed, causing a state change. */
+	if (mdev->state.disk < D_INCONSISTENT) {
+		dev_err(DEV,
+			"disk is %s, cannot write al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
+
+	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
 	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
 
 	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@@ -739,7 +753,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
 	unsigned int enr;
 	unsigned long add = 0;
 	char ppb[10];
-	int i;
+	int i, tmp;
 
 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 
@@ -747,7 +761,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
 		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
 		if (enr == LC_FREE)
 			continue;
-		add += drbd_bm_ALe_set_all(mdev, enr);
+		tmp = drbd_bm_ALe_set_all(mdev, enr);
+		dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
+		add += tmp;
 	}
 
 	lc_unlock(mdev->act_log);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 04a823b01da5..1146faa7ae38 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1995,10 +1995,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 		break;
 	}
 
-	if (mdev->state.pdsk == D_DISKLESS) {
+	if (mdev->state.pdsk < D_INCONSISTENT) {
 		/* In case we have the only disk of the cluster, */
 		drbd_set_out_of_sync(mdev, e->sector, e->size);
 		e->flags |= EE_CALL_AL_COMPLETE_IO;
+		e->flags &= ~EE_MAY_SET_IN_SYNC;
 		drbd_al_begin_io(mdev, e->sector);
 	}
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9e91a2545fc8..d26b213dbf15 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -942,12 +942,21 @@ allocate_barrier:
 	if (local) {
 		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
 
-		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-				     : rw == READ  ? DRBD_FAULT_DT_RD
-				     :               DRBD_FAULT_DT_RA))
+		/* State may have changed since we grabbed our reference on the
+		 * mdev->ldev member. Double check, and short-circuit to endio.
+		 * In case the last activity log transaction failed to get on
+		 * stable storage, and this is a WRITE, we may not even submit
+		 * this bio. */
+		if (get_ldev(mdev)) {
+			if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+					     : rw == READ  ? DRBD_FAULT_DT_RD
+					     :               DRBD_FAULT_DT_RA))
+				bio_endio(req->private_bio, -EIO);
+			else
+				generic_make_request(req->private_bio);
+			put_ldev(mdev);
+		} else
 			bio_endio(req->private_bio, -EIO);
-		else
-			generic_make_request(req->private_bio);
 	}
 
 	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
-- 
cgit v1.2.3-59-g8ed1b


From 2451fc3b2bd3a7205270da75a21dde0d5d7c96a2 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 24 Aug 2010 13:43:11 +0200
Subject: drbd: Removed the BIO_RW_BARRIER support form the receiver/epoch code

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  12 ---
 drivers/block/drbd/drbd_main.c     |   2 +-
 drivers/block/drbd/drbd_nl.c       |   4 +-
 drivers/block/drbd/drbd_proc.c     |   1 -
 drivers/block/drbd/drbd_receiver.c | 212 ++++++-------------------------------
 drivers/block/drbd/drbd_worker.c   |  21 ----
 6 files changed, 33 insertions(+), 219 deletions(-)

(limited to 'drivers/block/drbd/drbd_receiver.c')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 03c15e317c37..1b915fd9278f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -749,17 +749,12 @@ struct drbd_epoch {
 
 /* drbd_epoch flag bits */
 enum {
-	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
-	DE_BARRIER_IN_NEXT_EPOCH_DONE,
-	DE_CONTAINS_A_BARRIER,
 	DE_HAVE_BARRIER_NUMBER,
-	DE_IS_FINISHING,
 };
 
 enum epoch_event {
 	EV_PUT,
 	EV_GOT_BARRIER_NR,
-	EV_BARRIER_DONE,
 	EV_BECAME_LAST,
 	EV_CLEANUP = 32, /* used as flag */
 };
@@ -801,11 +796,6 @@ enum {
 	__EE_CALL_AL_COMPLETE_IO,
 	__EE_MAY_SET_IN_SYNC,
 
-	/* This epoch entry closes an epoch using a barrier.
-	 * On sucessful completion, the epoch is released,
-	 * and the P_BARRIER_ACK send. */
-	__EE_IS_BARRIER,
-
 	/* In case a barrier failed,
 	 * we need to resubmit without the barrier flag. */
 	__EE_RESUBMITTED,
@@ -820,7 +810,6 @@ enum {
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
-#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
 #define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
@@ -948,7 +937,6 @@ enum write_ordering_e {
 	WO_none,
 	WO_drain_io,
 	WO_bdev_flush,
-	WO_bio_barrier
 };
 
 struct fifo_buffer {
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 992c3aecdf7e..8e0d707df23d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2858,7 +2858,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
 
 	mdev->agreed_pro_version = PRO_VERSION_MAX;
-	mdev->write_ordering = WO_bio_barrier;
+	mdev->write_ordering = WO_bdev_flush;
 	mdev->resync_wenr = LC_FREE;
 }
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0cba7d3d2b5d..899878fcf97d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1117,8 +1117,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	nbc = NULL;
 	resync_lru = NULL;
 
-	mdev->write_ordering = WO_bio_barrier;
-	drbd_bump_write_ordering(mdev, WO_bio_barrier);
+	mdev->write_ordering = WO_bdev_flush;
+	drbd_bump_write_ordering(mdev, WO_bdev_flush);
 
 	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &mdev->flags);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index ad325c5d0ce1..7e6ac307e2de 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -158,7 +158,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 		[WO_none] = 'n',
 		[WO_drain_io] = 'd',
 		[WO_bdev_flush] = 'f',
-		[WO_bio_barrier] = 'b',
 	};
 
 	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 1146faa7ae38..2952c1277b1d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -49,11 +49,6 @@
 
 #include "drbd_vli.h"
 
-struct flush_work {
-	struct drbd_work w;
-	struct drbd_epoch *epoch;
-};
-
 enum finish_epoch {
 	FE_STILL_LIVE,
 	FE_DESTROYED,
@@ -66,16 +61,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
 static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
 
-static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
-{
-	struct drbd_epoch *prev;
-	spin_lock(&mdev->epoch_lock);
-	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
-	if (prev == epoch || prev == mdev->current_epoch)
-		prev = NULL;
-	spin_unlock(&mdev->epoch_lock);
-	return prev;
-}
 
 #define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
 
@@ -981,7 +966,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
 	return TRUE;
 }
 
-static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+static void drbd_flush(struct drbd_conf *mdev)
 {
 	int rv;
 
@@ -997,24 +982,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 		}
 		put_ldev(mdev);
 	}
-
-	return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-}
-
-static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-	struct flush_work *fw = (struct flush_work *)w;
-	struct drbd_epoch *epoch = fw->epoch;
-
-	kfree(w);
-
-	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
-		drbd_flush_after_epoch(mdev, epoch);
-
-	drbd_may_finish_epoch(mdev, epoch, EV_PUT |
-			      (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
-
-	return 1;
 }
 
 /**
@@ -1027,15 +994,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 					       struct drbd_epoch *epoch,
 					       enum epoch_event ev)
 {
-	int finish, epoch_size;
+	int epoch_size;
 	struct drbd_epoch *next_epoch;
-	int schedule_flush = 0;
 	enum finish_epoch rv = FE_STILL_LIVE;
 
 	spin_lock(&mdev->epoch_lock);
 	do {
 		next_epoch = NULL;
-		finish = 0;
 
 		epoch_size = atomic_read(&epoch->epoch_size);
 
@@ -1045,16 +1010,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 			break;
 		case EV_GOT_BARRIER_NR:
 			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
-
-			/* Special case: If we just switched from WO_bio_barrier to
-			   WO_bdev_flush we should not finish the current epoch */
-			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
-			    mdev->write_ordering != WO_bio_barrier &&
-			    epoch == mdev->current_epoch)
-				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
-			break;
-		case EV_BARRIER_DONE:
-			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
 			break;
 		case EV_BECAME_LAST:
 			/* nothing to do*/
@@ -1063,23 +1018,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 		if (epoch_size != 0 &&
 		    atomic_read(&epoch->active) == 0 &&
-		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
-		    epoch->list.prev == &mdev->current_epoch->list &&
-		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
-			/* Nearly all conditions are met to finish that epoch... */
-			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
-			    mdev->write_ordering == WO_none ||
-			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
-			    ev & EV_CLEANUP) {
-				finish = 1;
-				set_bit(DE_IS_FINISHING, &epoch->flags);
-			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
-				 mdev->write_ordering == WO_bio_barrier) {
-				atomic_inc(&epoch->active);
-				schedule_flush = 1;
-			}
-		}
-		if (finish) {
+		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
 			if (!(ev & EV_CLEANUP)) {
 				spin_unlock(&mdev->epoch_lock);
 				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1102,6 +1041,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 				/* atomic_set(&epoch->active, 0); is already zero */
 				if (rv == FE_STILL_LIVE)
 					rv = FE_RECYCLED;
+				wake_up(&mdev->ee_wait);
 			}
 		}
 
@@ -1113,22 +1053,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 	spin_unlock(&mdev->epoch_lock);
 
-	if (schedule_flush) {
-		struct flush_work *fw;
-		fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
-		if (fw) {
-			fw->w.cb = w_flush;
-			fw->epoch = epoch;
-			drbd_queue_work(&mdev->data.work, &fw->w);
-		} else {
-			dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
-			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
-			/* That is not a recursion, only one level */
-			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-			drbd_may_finish_epoch(mdev, epoch, EV_PUT);
-		}
-	}
-
 	return rv;
 }
 
@@ -1144,19 +1068,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
 		[WO_none] = "none",
 		[WO_drain_io] = "drain",
 		[WO_bdev_flush] = "flush",
-		[WO_bio_barrier] = "barrier",
 	};
 
 	pwo = mdev->write_ordering;
 	wo = min(pwo, wo);
-	if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
-		wo = WO_bdev_flush;
 	if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
 		wo = WO_drain_io;
 	if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
 		wo = WO_none;
 	mdev->write_ordering = wo;
-	if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+	if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
 		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
 }
 
@@ -1192,7 +1113,7 @@ next_bio:
 	bio->bi_sector = sector;
 	bio->bi_bdev = mdev->ldev->backing_bdev;
 	/* we special case some flags in the multi-bio case, see below
-	 * (REQ_UNPLUG, REQ_HARDBARRIER) */
+	 * (REQ_UNPLUG) */
 	bio->bi_rw = rw;
 	bio->bi_private = e;
 	bio->bi_end_io = drbd_endio_sec;
@@ -1226,11 +1147,6 @@ next_bio:
 			bio->bi_rw &= ~REQ_UNPLUG;
 
 		drbd_generic_make_request(mdev, fault_type, bio);
-
-		/* strip off REQ_HARDBARRIER,
-		 * unless it is the first or last bio */
-		if (bios && bios->bi_next)
-			bios->bi_rw &= ~REQ_HARDBARRIER;
 	} while (bios);
 	maybe_kick_lo(mdev);
 	return 0;
@@ -1244,45 +1160,9 @@ fail:
 	return -ENOMEM;
 }
 
-/**
- * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
- * @mdev:	DRBD device.
- * @w:		work object.
- * @cancel:	The connection will be closed anyways (unused in this callback)
- */
-int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
-{
-	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
-	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
-	   so that we can finish that epoch in drbd_may_finish_epoch().
-	   That is necessary if we already have a long chain of Epochs, before
-	   we realize that REQ_HARDBARRIER is actually not supported */
-
-	/* As long as the -ENOTSUPP on the barrier is reported immediately
-	   that will never trigger. If it is reported late, we will just
-	   print that warning and continue correctly for all future requests
-	   with WO_bdev_flush */
-	if (previous_epoch(mdev, e->epoch))
-		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
-
-	/* we still have a local reference,
-	 * get_ldev was done in receive_Data. */
-
-	e->w.cb = e_end_block;
-	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
-		/* drbd_submit_ee fails for one reason only:
-		 * if was not able to allocate sufficient bios.
-		 * requeue, try again later. */
-		e->w.cb = w_e_reissue;
-		drbd_queue_work(&mdev->data.work, &e->w);
-	}
-	return 1;
-}
-
 static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	int rv, issue_flush;
+	int rv;
 	struct p_barrier *p = &mdev->data.rbuf.barrier;
 	struct drbd_epoch *epoch;
 
@@ -1300,44 +1180,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
 	 * Therefore we must send the barrier_ack after the barrier request was
 	 * completed. */
 	switch (mdev->write_ordering) {
-	case WO_bio_barrier:
 	case WO_none:
 		if (rv == FE_RECYCLED)
 			return TRUE;
-		break;
+
+		/* receiver context, in the writeout path of the other node.
+		 * avoid potential distributed deadlock */
+		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+		if (epoch)
+			break;
+		else
+			dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+			/* Fall through */
 
 	case WO_bdev_flush:
 	case WO_drain_io:
-		if (rv == FE_STILL_LIVE) {
-			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
-			drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
-		}
-		if (rv == FE_RECYCLED)
-			return TRUE;
-
-		/* The asender will send all the ACKs and barrier ACKs out, since
-		   all EEs moved from the active_ee to the done_ee. We need to
-		   provide a new epoch object for the EEs that come in soon */
-		break;
-	}
-
-	/* receiver context, in the writeout path of the other node.
-	 * avoid potential distributed deadlock */
-	epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
-	if (!epoch) {
-		dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
-		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
 		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-		if (issue_flush) {
-			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
-			if (rv == FE_RECYCLED)
-				return TRUE;
+		drbd_flush(mdev);
+
+		if (atomic_read(&mdev->current_epoch->epoch_size)) {
+			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+			if (epoch)
+				break;
 		}
 
-		drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+		epoch = mdev->current_epoch;
+		wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
+
+		D_ASSERT(atomic_read(&epoch->active) == 0);
+		D_ASSERT(epoch->flags == 0);
 
 		return TRUE;
+	default:
+		dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
+		return FALSE;
 	}
 
 	epoch->flags = 0;
@@ -1652,15 +1528,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
 	sector_t sector = e->sector;
-	struct drbd_epoch *epoch;
 	int ok = 1, pcmd;
 
-	if (e->flags & EE_IS_BARRIER) {
-		epoch = previous_epoch(mdev, e->epoch);
-		if (epoch)
-			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
-	}
-
 	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
 		if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1817,27 +1686,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	e->epoch = mdev->current_epoch;
 	atomic_inc(&e->epoch->epoch_size);
 	atomic_inc(&e->epoch->active);
-
-	if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
-		struct drbd_epoch *epoch;
-		/* Issue a barrier if we start a new epoch, and the previous epoch
-		   was not a epoch containing a single request which already was
-		   a Barrier. */
-		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
-		if (epoch == e->epoch) {
-			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-			rw |= REQ_HARDBARRIER;
-			e->flags |= EE_IS_BARRIER;
-		} else {
-			if (atomic_read(&epoch->epoch_size) > 1 ||
-			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
-				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
-				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-				rw |= REQ_HARDBARRIER;
-				e->flags |= EE_IS_BARRIER;
-			}
-		}
-	}
 	spin_unlock(&mdev->epoch_lock);
 
 	dp_flags = be32_to_cpu(p->dp_flags);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 1a2d2f0759b2..b0551ba7ad0c 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -102,12 +102,6 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
 	put_ldev(mdev);
 }
 
-static int is_failed_barrier(int ee_flags)
-{
-	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
-			== (EE_IS_BARRIER|EE_WAS_ERROR);
-}
-
 /* writes on behalf of the partner, or resync writes,
  * "submitted" by the receiver, final stage.  */
 static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
@@ -119,21 +113,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
 	int is_syncer_req;
 	int do_al_complete_io;
 
-	/* if this is a failed barrier request, disable use of barriers,
-	 * and schedule for resubmission */
-	if (is_failed_barrier(e->flags)) {
-		drbd_bump_write_ordering(mdev, WO_bdev_flush);
-		spin_lock_irqsave(&mdev->req_lock, flags);
-		list_del(&e->w.list);
-		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
-		e->w.cb = w_e_reissue;
-		/* put_ldev actually happens below, once we come here again. */
-		__release(local);
-		spin_unlock_irqrestore(&mdev->req_lock, flags);
-		drbd_queue_work(&mdev->data.work, &e->w);
-		return;
-	}
-
 	D_ASSERT(e->block_id != ID_VACANT);
 
 	/* after we moved e to done_ee,
-- 
cgit v1.2.3-59-g8ed1b