[SCSI] pm8001: deficient responses to IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT

IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT are deficient of the required actions as outlined in the programming manual for the pm8001. Due to the overlapping code requirements of these recovery responses, we found it necessary to bundle them together into one patch. When a break is received during the command phase (ssp_completion), this is a result of a timeout or interruption on the bus. Logic suggests that we should retry the command. When a break is received during the data-phase (ssp_event), the task must be aborted on the target or it will retain a data-phase lock turning the target reticent to all future media commands yet will successfully respond to TUR, INQUIRY and ABORT leading eventually to target failure through several abort-cycle loops. The open retry interval is exceedingly short resulting in occasional target drop-off during expander resets or when targets push-back during bad-block remapping. Increased effective timeout from 130ms to 1.5 seconds for each try so as to trigger after the administrative inquiry/tur timeout in the scsi subsystem to keep error-recovery harmonics to a minimum. When an open retry timeout event is received, the action required by the targets is to issue an abort for the outstanding command then logic suggests we retry the command as this state is usually an indication of a credit block or busy condition on the target. We hijacked the pm8001_handle_event work queue handler so that it will handle task as an argument instead of device for the workers in support of the deferred handling outlined above. Moderate to Heavy bad-path testing on a 2.6.32 vintage kernel, compile-testing on scsi-misc-2.6 kernel ... Signed-off-by: Mark Salyzyn <mark_salyzyn@xyratex.com> Acked-by: Jack Wang <jack_wang@usish.com> Signed-off-by: James Bottomley <JBottomley@Parallels.com>
author: Mark Salyzyn <mark_salyzyn@xyratex.com> 2012-01-17 11:52:24 -0500
committer: James Bottomley <JBottomley@Parallels.com> 2012-02-19 08:08:52 -0600
commit: 5954d7380f627371c4d8d7c59c08f9596aa2c674 (patch)
tree: 0908dd4a832ba0ef07c79237f9f6ab3f2ad20a36 /drivers/scsi/pm8001/pm8001_sas.c
parent: [SCSI] pm8001: Add FUNC_GET_EVENTS (diff)
download: linux-dev-5954d7380f627371c4d8d7c59c08f9596aa2c674.tar.xz
linux-dev-5954d7380f627371c4d8d7c59c08f9596aa2c674.zip
1 files changed, 72 insertions, 0 deletions
diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index ab0704e39040..9589fc941a8b 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -538,6 +538,7 @@ void pm8001_ccb_task_free(struct pm8001_hba_info *pm8001_ha,
 	task->lldd_task = NULL;
 	ccb->task = NULL;
 	ccb->ccb_tag = 0xFFFFFFFF;
+	ccb->open_retry = 0;
 	pm8001_ccb_free(pm8001_ha, ccb_idx);
 }
 
@@ -882,6 +883,77 @@ static int pm8001_issue_ssp_tmf(struct domain_device *dev,
 		tmf);
 }
 
+/* retry commands by ha, by task and/or by device */
+void pm8001_open_reject_retry(
+	struct pm8001_hba_info *pm8001_ha,
+	struct sas_task *task_to_close,
+	struct pm8001_device *device_to_close)
+{
+	int i;
+	unsigned long flags;
+
+	if (pm8001_ha == NULL)
+		return;
+
+	spin_lock_irqsave(&pm8001_ha->lock, flags);
+
+	for (i = 0; i < PM8001_MAX_CCB; i++) {
+		struct sas_task *task;
+		struct task_status_struct *ts;
+		struct pm8001_device *pm8001_dev;
+		unsigned long flags1;
+		u32 tag;
+		struct pm8001_ccb_info *ccb = &pm8001_ha->ccb_info[i];
+
+		pm8001_dev = ccb->device;
+		if (!pm8001_dev || (pm8001_dev->dev_type == NO_DEVICE))
+			continue;
+		if (!device_to_close) {
+			uintptr_t d = (uintptr_t)pm8001_dev
+					- (uintptr_t)&pm8001_ha->devices;
+			if (((d % sizeof(*pm8001_dev)) != 0)
+			 || ((d / sizeof(*pm8001_dev)) >= PM8001_MAX_DEVICES))
+				continue;
+		} else if (pm8001_dev != device_to_close)
+			continue;
+		tag = ccb->ccb_tag;
+		if (!tag || (tag == 0xFFFFFFFF))
+			continue;
+		task = ccb->task;
+		if (!task || !task->task_done)
+			continue;
+		if (task_to_close && (task != task_to_close))
+			continue;
+		ts = &task->task_status;
+		ts->resp = SAS_TASK_COMPLETE;
+		/* Force the midlayer to retry */
+		ts->stat = SAS_OPEN_REJECT;
+		ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
+		if (pm8001_dev)
+			pm8001_dev->running_req--;
+		spin_lock_irqsave(&task->task_state_lock, flags1);
+		task->task_state_flags &= ~SAS_TASK_STATE_PENDING;
+		task->task_state_flags &= ~SAS_TASK_AT_INITIATOR;
+		task->task_state_flags |= SAS_TASK_STATE_DONE;
+		if (unlikely((task->task_state_flags
+				& SAS_TASK_STATE_ABORTED))) {
+			spin_unlock_irqrestore(&task->task_state_lock,
+				flags1);
+			pm8001_ccb_task_free(pm8001_ha, task, ccb, tag);
+		} else {
+			spin_unlock_irqrestore(&task->task_state_lock,
+				flags1);
+			pm8001_ccb_task_free(pm8001_ha, task, ccb, tag);
+			mb();/* in order to force CPU ordering */
+			spin_unlock_irqrestore(&pm8001_ha->lock, flags);
+			task->task_done(task);
+			spin_lock_irqsave(&pm8001_ha->lock, flags);
+		}
+	}
+
+	spin_unlock_irqrestore(&pm8001_ha->lock, flags);
+}
+
 /**
   * Standard mandates link reset for ATA  (type 0) and hard reset for
   * SSP (type 1) , only for RECOVERY
author	Mark Salyzyn <mark_salyzyn@xyratex.com>	2012-01-17 11:52:24 -0500
committer	James Bottomley <JBottomley@Parallels.com>	2012-02-19 08:08:52 -0600
commit	5954d7380f627371c4d8d7c59c08f9596aa2c674 (patch)
tree	0908dd4a832ba0ef07c79237f9f6ab3f2ad20a36 /drivers/scsi/pm8001/pm8001_sas.c
parent	[SCSI] pm8001: Add FUNC_GET_EVENTS (diff)
download	linux-dev-5954d7380f627371c4d8d7c59c08f9596aa2c674.tar.xz linux-dev-5954d7380f627371c4d8d7c59c08f9596aa2c674.zip