aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/scsi/lpfc/lpfc_init.c
diff options
context:
space:
mode:
authorJames Smart <jsmart2021@gmail.com>2022-03-16 20:27:34 -0700
committerMartin K. Petersen <martin.petersen@oracle.com>2022-03-29 23:19:37 -0400
commit35ed9613d83f3c1f011877d591fd7d36f2666106 (patch)
tree6714d1973151a917f9281dccb18f86abbdde6ef1 /drivers/scsi/lpfc/lpfc_init.c
parentscsi: target: tcmu: Fix possible page UAF (diff)
downloadlinux-dev-35ed9613d83f3c1f011877d591fd7d36f2666106.tar.xz
linux-dev-35ed9613d83f3c1f011877d591fd7d36f2666106.zip
scsi: lpfc: Improve PCI EEH Error and Recovery Handling
Following EEH errors, the driver can crash or hang when deleting the localport or when attempting to unload. The EEH handlers in the driver did not notify the NVMe-FC transport before tearing the driver down. This was delayed until the resume steps. This worked for SCSI because lpfc_block_scsi() would notify the scsi_fc_transport that the target was not available but it would not clean up all the references to the ndlp. The SLI3 prep for dev reset handler did the lpfc_offline_prep() and lpfc_offline() calls to get the port stopped before restarting. The SLI4 version of the prep for dev reset just destroyed the queues and did not stop NVMe from continuing. Also because the port was not really stopped the localport destroy would hang because the transport was still waiting for I/O. Additionally, a devloss tmo can fire and post events to a stopped worker thread creating another hang condition. lpfc_sli4_prep_dev_for_reset() is modified to call lpfc_offline_prep() and lpfc_offline() rather than just lpfc_scsi_dev_block() to ensure both SCSI and NVMe transports are notified to block I/O to the driver. Logic is added to devloss handler and worker thread to clean up ndlp references and quiesce appropriately. Link: https://lore.kernel.org/r/20220317032737.45308-2-jsmart2021@gmail.com Co-developed-by: Justin Tee <justin.tee@broadcom.com> Signed-off-by: Justin Tee <justin.tee@broadcom.com> Signed-off-by: James Smart <jsmart2021@gmail.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Diffstat (limited to 'drivers/scsi/lpfc/lpfc_init.c')
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c60
1 files changed, 38 insertions, 22 deletions
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index eed6464bd880..b8ab6dcbadc5 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -1642,7 +1642,7 @@ lpfc_sli4_offline_eratt(struct lpfc_hba *phba)
{
spin_lock_irq(&phba->hbalock);
if (phba->link_state == LPFC_HBA_ERROR &&
- phba->hba_flag & HBA_PCI_ERR) {
+ test_bit(HBA_PCI_ERR, &phba->bit_flags)) {
spin_unlock_irq(&phba->hbalock);
return;
}
@@ -3682,7 +3682,8 @@ lpfc_offline_prep(struct lpfc_hba *phba, int mbx_action)
struct lpfc_vport **vports;
struct Scsi_Host *shost;
int i;
- int offline = 0;
+ int offline;
+ bool hba_pci_err;
if (vport->fc_flag & FC_OFFLINE_MODE)
return;
@@ -3692,6 +3693,7 @@ lpfc_offline_prep(struct lpfc_hba *phba, int mbx_action)
lpfc_linkdown(phba);
offline = pci_channel_offline(phba->pcidev);
+ hba_pci_err = test_bit(HBA_PCI_ERR, &phba->bit_flags);
/* Issue an unreg_login to all nodes on all vports */
vports = lpfc_create_vport_work_array(phba);
@@ -3715,11 +3717,14 @@ lpfc_offline_prep(struct lpfc_hba *phba, int mbx_action)
ndlp->nlp_flag &= ~NLP_NPR_ADISC;
spin_unlock_irq(&ndlp->lock);
- if (offline) {
+ if (offline || hba_pci_err) {
spin_lock_irq(&ndlp->lock);
ndlp->nlp_flag &= ~(NLP_UNREG_INP |
NLP_RPI_REGISTERED);
spin_unlock_irq(&ndlp->lock);
+ if (phba->sli_rev == LPFC_SLI_REV4)
+ lpfc_sli_rpi_release(vports[i],
+ ndlp);
} else {
lpfc_unreg_rpi(vports[i], ndlp);
}
@@ -13374,15 +13379,12 @@ lpfc_sli4_hba_unset(struct lpfc_hba *phba)
/* Disable FW logging to host memory */
lpfc_ras_stop_fwlog(phba);
- /* Unset the queues shared with the hardware then release all
- * allocated resources.
- */
- lpfc_sli4_queue_unset(phba);
- lpfc_sli4_queue_destroy(phba);
-
/* Reset SLI4 HBA FCoE function */
lpfc_pci_function_reset(phba);
+ /* release all queue allocated resources. */
+ lpfc_sli4_queue_destroy(phba);
+
/* Free RAS DMA memory */
if (phba->ras_fwlog.ras_enabled)
lpfc_sli4_ras_dma_free(phba);
@@ -15057,24 +15059,28 @@ lpfc_sli4_prep_dev_for_recover(struct lpfc_hba *phba)
static void
lpfc_sli4_prep_dev_for_reset(struct lpfc_hba *phba)
{
- lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
- "2826 PCI channel disable preparing for reset\n");
+ int offline = pci_channel_offline(phba->pcidev);
+
+ lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+ "2826 PCI channel disable preparing for reset offline"
+ " %d\n", offline);
/* Block any management I/Os to the device */
lpfc_block_mgmt_io(phba, LPFC_MBX_NO_WAIT);
- /* Block all SCSI devices' I/Os on the host */
- lpfc_scsi_dev_block(phba);
+ /* HBA_PCI_ERR was set in io_error_detect */
+ lpfc_offline_prep(phba, LPFC_MBX_NO_WAIT);
/* Flush all driver's outstanding I/Os as we are to reset */
lpfc_sli_flush_io_rings(phba);
+ lpfc_offline(phba);
/* stop all timers */
lpfc_stop_hba_timers(phba);
+ lpfc_sli4_queue_destroy(phba);
/* Disable interrupt and pci device */
lpfc_sli4_disable_intr(phba);
- lpfc_sli4_queue_destroy(phba);
pci_disable_device(phba->pcidev);
}
@@ -15123,6 +15129,7 @@ lpfc_io_error_detected_s4(struct pci_dev *pdev, pci_channel_state_t state)
{
struct Scsi_Host *shost = pci_get_drvdata(pdev);
struct lpfc_hba *phba = ((struct lpfc_vport *)shost->hostdata)->phba;
+ bool hba_pci_err;
switch (state) {
case pci_channel_io_normal:
@@ -15130,17 +15137,24 @@ lpfc_io_error_detected_s4(struct pci_dev *pdev, pci_channel_state_t state)
lpfc_sli4_prep_dev_for_recover(phba);
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
- phba->hba_flag |= HBA_PCI_ERR;
+ hba_pci_err = test_and_set_bit(HBA_PCI_ERR, &phba->bit_flags);
/* Fatal error, prepare for slot reset */
- lpfc_sli4_prep_dev_for_reset(phba);
+ if (!hba_pci_err)
+ lpfc_sli4_prep_dev_for_reset(phba);
+ else
+ lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+ "2832 Already handling PCI error "
+ "state: x%x\n", state);
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
- phba->hba_flag |= HBA_PCI_ERR;
+ set_bit(HBA_PCI_ERR, &phba->bit_flags);
/* Permanent failure, prepare for device down */
lpfc_sli4_prep_dev_for_perm_failure(phba);
return PCI_ERS_RESULT_DISCONNECT;
default:
- phba->hba_flag |= HBA_PCI_ERR;
+ hba_pci_err = test_and_set_bit(HBA_PCI_ERR, &phba->bit_flags);
+ if (!hba_pci_err)
+ lpfc_sli4_prep_dev_for_reset(phba);
/* Unknown state, prepare and request slot reset */
lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
"2825 Unknown PCI error state: x%x\n", state);
@@ -15174,17 +15188,21 @@ lpfc_io_slot_reset_s4(struct pci_dev *pdev)
struct lpfc_hba *phba = ((struct lpfc_vport *)shost->hostdata)->phba;
struct lpfc_sli *psli = &phba->sli;
uint32_t intr_mode;
+ bool hba_pci_err;
dev_printk(KERN_INFO, &pdev->dev, "recovering from a slot reset.\n");
if (pci_enable_device_mem(pdev)) {
printk(KERN_ERR "lpfc: Cannot re-enable "
- "PCI device after reset.\n");
+ "PCI device after reset.\n");
return PCI_ERS_RESULT_DISCONNECT;
}
pci_restore_state(pdev);
- phba->hba_flag &= ~HBA_PCI_ERR;
+ hba_pci_err = test_and_clear_bit(HBA_PCI_ERR, &phba->bit_flags);
+ if (!hba_pci_err)
+ dev_info(&pdev->dev,
+ "hba_pci_err was not set, recovering slot reset.\n");
/*
* As the new kernel behavior of pci_restore_state() API call clears
* device saved_state flag, need to save the restored state again.
@@ -15239,8 +15257,6 @@ lpfc_io_resume_s4(struct pci_dev *pdev)
*/
if (!(phba->sli.sli_flag & LPFC_SLI_ACTIVE)) {
/* Perform device reset */
- lpfc_offline_prep(phba, LPFC_MBX_WAIT);
- lpfc_offline(phba);
lpfc_sli_brdrestart(phba);
/* Bring the device back online */
lpfc_online(phba);