Skip to content
This repository was archived by the owner on Sep 11, 2019. It is now read-only.

Commit 5386a4e

Browse files
Quinn Tran, martinkpetersen
Quinn Tran
authored and committed
scsi: qla2xxx: Add cleanup for PCI EEH recovery
During EEH error recovery testing it was discovered that the driver's reset() callback only partially frees the resources used by the driver, leaving some stale memory behind. After reset() is done, the driver's resume() callback uses this old data, which results in an error that leaves the adapter disabled due to the PCIe error. This patch cleans up the EEH recovery code path and prevents the adapter from getting disabled. Signed-off-by: Quinn Tran <[email protected]> Signed-off-by: Himanshu Madhani <[email protected]> Reviewed-by: Ewan D. Milne <[email protected]> Signed-off-by: Martin K. Petersen <[email protected]>
1 parent d4023db commit 5386a4e

File tree

1 file changed

+82
-139
lines changed

1 file changed

+82
-139
lines changed

drivers/scsi/qla2xxx/qla_os.c

+82-139
Original file line numberDiff line numberDiff line change
@@ -6826,6 +6826,78 @@ qla2x00_release_firmware(void)
68266826
mutex_unlock(&qla_fw_lock);
68276827
}
68286828

6829+
/*
 * qla_pci_error_cleanup() - Quiesce driver state after a PCI channel error.
 * @vha: host on which the error was reported; the adapter is vha->hw.
 *
 * Called from qla2xxx_pci_error_detected() for the pci_channel_io_frozen
 * state. In order, it:
 *   - bumps ha->chip_reset and copies the new value to the base qpair and
 *     to every populated queue_pair_map[] entry;
 *   - wakes a mailbox waiter and polls until the pending-mailbox counters
 *     reach zero (bounded wait);
 *   - marks every qpair on base_vha->qp_list offline;
 *   - marks all devices lost on @vha and on every vport in ha->vp_list;
 *   - clears FCF_LOGIN_NEEDED and FCF_ASYNC_SENT on the fcports of @vha and
 *     of every vport.
 */
static void qla_pci_error_cleanup(scsi_qla_host_t *vha)
6830+
{
6831+
struct qla_hw_data *ha = vha->hw;
6832+
scsi_qla_host_t *base_vha = pci_get_drvdata(ha->pdev);
6833+
struct qla_qpair *qpair = NULL;
6834+
struct scsi_qla_host *vp;
6835+
fc_port_t *fcport;
6836+
int i;
6837+
unsigned long flags;
6838+
6839+
/* Advance the adapter-wide reset counter, then mirror it into each qpair. */
ha->chip_reset++;
6840+
6841+
ha->base_qpair->chip_reset = ha->chip_reset;
6842+
for (i = 0; i < ha->max_qpairs; i++) {
6843+
/* queue_pair_map[] may contain empty slots; skip them. */
if (ha->queue_pair_map[i])
6844+
ha->queue_pair_map[i]->chip_reset =
6845+
ha->base_qpair->chip_reset;
6846+
}
6847+
6848+
/* purge MBox commands */
6849+
/*
 * If a mailbox command is counted in stage 3, clear MBX_INTR_WAIT and
 * signal mbx_intr_comp so that a thread blocked on it can proceed.
 */
if (atomic_read(&ha->num_pend_mbx_stage3)) {
6850+
clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
6851+
complete(&ha->mbx_intr_comp);
6852+
}
6853+
6854+
i = 0;
6855+
6856+
/*
 * Poll until all three pending-mailbox counters are zero. The loop gives
 * up after at most 51 sleeps of 20 ms each (i > 50), whether or not the
 * counters have drained.
 */
while (atomic_read(&ha->num_pend_mbx_stage3) ||
6857+
atomic_read(&ha->num_pend_mbx_stage2) ||
6858+
atomic_read(&ha->num_pend_mbx_stage1)) {
6859+
msleep(20);
6860+
i++;
6861+
if (i > 50)
6862+
break;
6863+
}
6864+
6865+
/*
 * NOTE(review): purge_mbox is cleared here but never set within this
 * function - confirm it is expected to have been set elsewhere.
 */
ha->flags.purge_mbox = 0;
6866+
6867+
/* Take every queue pair offline; the qp_list walk is done under mq_lock. */
mutex_lock(&ha->mq_lock);
6868+
list_for_each_entry(qpair, &base_vha->qp_list, qp_list_elem)
6869+
qpair->online = 0;
6870+
mutex_unlock(&ha->mq_lock);
6871+
6872+
qla2x00_mark_all_devices_lost(vha, 0);
6873+
6874+
/*
 * Repeat for every vport. vref_count is raised before vport_slock is
 * dropped so that qla2x00_mark_all_devices_lost() runs without the
 * spinlock held; the lock is re-taken before the count is dropped and the
 * list walk continues. Presumably the elevated count keeps vp from being
 * freed in the meantime - verify against the vport teardown path.
 */
spin_lock_irqsave(&ha->vport_slock, flags);
6875+
list_for_each_entry(vp, &ha->vp_list, list) {
6876+
atomic_inc(&vp->vref_count);
6877+
spin_unlock_irqrestore(&ha->vport_slock, flags);
6878+
qla2x00_mark_all_devices_lost(vp, 0);
6879+
spin_lock_irqsave(&ha->vport_slock, flags);
6880+
atomic_dec(&vp->vref_count);
6881+
}
6882+
spin_unlock_irqrestore(&ha->vport_slock, flags);
6883+
6884+
/* Clear all async request states across all VPs. */
6885+
/*
 * First the fcports of @vha itself. NOTE(review): no lock is taken in
 * this function around this walk - confirm that is safe here.
 */
list_for_each_entry(fcport, &vha->vp_fcports, list)
6886+
fcport->flags &= ~(FCF_LOGIN_NEEDED | FCF_ASYNC_SENT);
6887+
6888+
/* Then the fcports of each vport, using the same ref/unlock pattern. */
spin_lock_irqsave(&ha->vport_slock, flags);
6889+
list_for_each_entry(vp, &ha->vp_list, list) {
6890+
atomic_inc(&vp->vref_count);
6891+
spin_unlock_irqrestore(&ha->vport_slock, flags);
6892+
list_for_each_entry(fcport, &vp->vp_fcports, list)
6893+
fcport->flags &= ~(FCF_LOGIN_NEEDED | FCF_ASYNC_SENT);
6894+
spin_lock_irqsave(&ha->vport_slock, flags);
6895+
atomic_dec(&vp->vref_count);
6896+
}
6897+
spin_unlock_irqrestore(&ha->vport_slock, flags);
6898+
}
6899+
6900+
68296901
static pci_ers_result_t
68306902
qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
68316903
{
@@ -6851,20 +6923,7 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
68516923
return PCI_ERS_RESULT_CAN_RECOVER;
68526924
case pci_channel_io_frozen:
68536925
ha->flags.eeh_busy = 1;
6854-
/* For ISP82XX complete any pending mailbox cmd */
6855-
if (IS_QLA82XX(ha)) {
6856-
ha->flags.isp82xx_fw_hung = 1;
6857-
ql_dbg(ql_dbg_aer, vha, 0x9001, "Pci channel io frozen\n");
6858-
qla82xx_clear_pending_mbx(vha);
6859-
}
6860-
qla2x00_free_irqs(vha);
6861-
pci_disable_device(pdev);
6862-
/* Return back all IOs */
6863-
qla2x00_abort_all_cmds(vha, DID_RESET << 16);
6864-
if (ql2xmqsupport || ql2xnvmeenable) {
6865-
set_bit(QPAIR_ONLINE_CHECK_NEEDED, &vha->dpc_flags);
6866-
qla2xxx_wake_dpc(vha);
6867-
}
6926+
qla_pci_error_cleanup(vha);
68686927
return PCI_ERS_RESULT_NEED_RESET;
68696928
case pci_channel_io_perm_failure:
68706929
ha->flags.pci_channel_io_perm_failure = 1;
@@ -6918,122 +6977,14 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev)
69186977
return PCI_ERS_RESULT_RECOVERED;
69196978
}
69206979

6921-
/*
 * qla82xx_error_recovery() - ISP82xx-specific recovery step for slot reset.
 * @base_vha: base host of the adapter being recovered.
 *
 * This function is deleted by the commit shown here (every line carries a
 * '-' marker); it was called from qla2xxx_pci_slot_reset() for IS_QLA82XX
 * adapters.
 *
 * ABORT_ISP_ACTIVE is set in dpc_flags for the duration of the call. If
 * the host is online, outstanding commands are aborted first. The function
 * then scans the lower-numbered PCI functions of the same bus/slot for one
 * that is enabled, and takes the "reset owner" branch when fn ends up 0.
 *
 * Return: QLA_FUNCTION_FAILED by default; otherwise the result of
 * qla82xx_start_firmware() / qla82xx_restart_isp(), or QLA_SUCCESS when
 * drv_active reads 0 and the ISP restart succeeds.
 */
static uint32_t
6922-
qla82xx_error_recovery(scsi_qla_host_t *base_vha)
6923-
{
6924-
uint32_t rval = QLA_FUNCTION_FAILED;
6925-
uint32_t drv_active = 0;
6926-
struct qla_hw_data *ha = base_vha->hw;
6927-
int fn;
6928-
struct pci_dev *other_pdev = NULL;
6929-
6930-
ql_dbg(ql_dbg_aer, base_vha, 0x9006,
6931-
"Entered %s.\n", __func__);
6932-
6933-
set_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags);
6934-
6935-
if (base_vha->flags.online) {
6936-
/* Abort all outstanding commands,
6937-
* so as to be requeued later */
6938-
qla2x00_abort_isp_cleanup(base_vha);
6939-
}
6940-
6941-
6942-
/*
 * Walk downward from this device's PCI function number, looking up each
 * lower-numbered function on the same domain/bus/slot. Stop at the first
 * one whose enable_cnt is non-zero. Every successful lookup is balanced
 * with pci_dev_put().
 */
fn = PCI_FUNC(ha->pdev->devfn);
6943-
while (fn > 0) {
6944-
fn--;
6945-
ql_dbg(ql_dbg_aer, base_vha, 0x9007,
6946-
"Finding pci device at function = 0x%x.\n", fn);
6947-
other_pdev =
6948-
pci_get_domain_bus_and_slot(pci_domain_nr(ha->pdev->bus),
6949-
ha->pdev->bus->number, PCI_DEVFN(PCI_SLOT(ha->pdev->devfn),
6950-
fn));
6951-
6952-
if (!other_pdev)
6953-
continue;
6954-
if (atomic_read(&other_pdev->enable_cnt)) {
6955-
ql_dbg(ql_dbg_aer, base_vha, 0x9008,
6956-
"Found PCI func available and enable at 0x%x.\n",
6957-
fn);
6958-
pci_dev_put(other_pdev);
6959-
break;
6960-
}
6961-
pci_dev_put(other_pdev);
6962-
}
6963-
6964-
/*
 * NOTE(review): fn is 0 here both when no enabled lower function was
 * found and when the loop broke on an enabled function 0, so the two
 * cases are not distinguished - confirm that was intended.
 */
if (!fn) {
6965-
/* Reset owner */
6966-
ql_dbg(ql_dbg_aer, base_vha, 0x9009,
6967-
"This devfn is reset owner = 0x%x.\n",
6968-
ha->pdev->devfn);
6969-
qla82xx_idc_lock(ha);
6970-
6971-
qla82xx_wr_32(ha, QLA82XX_CRB_DEV_STATE,
6972-
QLA8XXX_DEV_INITIALIZING);
6973-
6974-
qla82xx_wr_32(ha, QLA82XX_CRB_DRV_IDC_VERSION,
6975-
QLA82XX_IDC_VERSION);
6976-
6977-
drv_active = qla82xx_rd_32(ha, QLA82XX_CRB_DRV_ACTIVE);
6978-
ql_dbg(ql_dbg_aer, base_vha, 0x900a,
6979-
"drv_active = 0x%x.\n", drv_active);
6980-
6981-
/* The IDC lock is released while firmware is (re)started below. */
qla82xx_idc_unlock(ha);
6982-
/* Reset if device is not already reset
6983-
* drv_active would be 0 if a reset has already been done
6984-
*/
6985-
if (drv_active)
6986-
rval = qla82xx_start_firmware(base_vha);
6987-
else
6988-
rval = QLA_SUCCESS;
6989-
qla82xx_idc_lock(ha);
6990-
6991-
if (rval != QLA_SUCCESS) {
6992-
ql_log(ql_log_info, base_vha, 0x900b,
6993-
"HW State: FAILED.\n");
6994-
qla82xx_clear_drv_active(ha);
6995-
qla82xx_wr_32(ha, QLA82XX_CRB_DEV_STATE,
6996-
QLA8XXX_DEV_FAILED);
6997-
} else {
6998-
ql_log(ql_log_info, base_vha, 0x900c,
6999-
"HW State: READY.\n");
7000-
qla82xx_wr_32(ha, QLA82XX_CRB_DEV_STATE,
7001-
QLA8XXX_DEV_READY);
7002-
/* Drop the IDC lock across the ISP restart, then re-take it. */
qla82xx_idc_unlock(ha);
7003-
ha->flags.isp82xx_fw_hung = 0;
7004-
rval = qla82xx_restart_isp(base_vha);
7005-
qla82xx_idc_lock(ha);
7006-
/* Clear driver state register */
7007-
qla82xx_wr_32(ha, QLA82XX_CRB_DRV_STATE, 0);
7008-
qla82xx_set_drv_active(base_vha);
7009-
}
7010-
qla82xx_idc_unlock(ha);
7011-
} else {
7012-
ql_dbg(ql_dbg_aer, base_vha, 0x900d,
7013-
"This devfn is not reset owner = 0x%x.\n",
7014-
ha->pdev->devfn);
7015-
/*
 * Not the reset owner: restart the ISP only if the device state
 * register already reads READY; otherwise rval keeps its initial
 * QLA_FUNCTION_FAILED value.
 */
if ((qla82xx_rd_32(ha, QLA82XX_CRB_DEV_STATE) ==
7016-
QLA8XXX_DEV_READY)) {
7017-
ha->flags.isp82xx_fw_hung = 0;
7018-
rval = qla82xx_restart_isp(base_vha);
7019-
qla82xx_idc_lock(ha);
7020-
qla82xx_set_drv_active(base_vha);
7021-
qla82xx_idc_unlock(ha);
7022-
}
7023-
}
7024-
clear_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags);
7025-
7026-
return rval;
7027-
}
7028-
70296980
static pci_ers_result_t
70306981
qla2xxx_pci_slot_reset(struct pci_dev *pdev)
70316982
{
70326983
pci_ers_result_t ret = PCI_ERS_RESULT_DISCONNECT;
70336984
scsi_qla_host_t *base_vha = pci_get_drvdata(pdev);
70346985
struct qla_hw_data *ha = base_vha->hw;
7035-
struct rsp_que *rsp;
7036-
int rc, retries = 10;
6986+
int rc;
6987+
struct qla_qpair *qpair = NULL;
70376988

70386989
ql_dbg(ql_dbg_aer, base_vha, 0x9004,
70396990
"Slot Reset.\n");
@@ -7062,24 +7013,16 @@ qla2xxx_pci_slot_reset(struct pci_dev *pdev)
70627013
goto exit_slot_reset;
70637014
}
70647015

7065-
rsp = ha->rsp_q_map[0];
7066-
if (qla2x00_request_irqs(ha, rsp))
7067-
goto exit_slot_reset;
70687016

70697017
if (ha->isp_ops->pci_config(base_vha))
70707018
goto exit_slot_reset;
70717019

7072-
if (IS_QLA82XX(ha)) {
7073-
if (qla82xx_error_recovery(base_vha) == QLA_SUCCESS) {
7074-
ret = PCI_ERS_RESULT_RECOVERED;
7075-
goto exit_slot_reset;
7076-
} else
7077-
goto exit_slot_reset;
7078-
}
7079-
7080-
while (ha->flags.mbox_busy && retries--)
7081-
msleep(1000);
7020+
mutex_lock(&ha->mq_lock);
7021+
list_for_each_entry(qpair, &base_vha->qp_list, qp_list_elem)
7022+
qpair->online = 1;
7023+
mutex_unlock(&ha->mq_lock);
70827024

7025+
base_vha->flags.online = 1;
70837026
set_bit(ABORT_ISP_ACTIVE, &base_vha->dpc_flags);
70847027
if (ha->isp_ops->abort_isp(base_vha) == QLA_SUCCESS)
70857028
ret = PCI_ERS_RESULT_RECOVERED;
@@ -7103,13 +7046,13 @@ qla2xxx_pci_resume(struct pci_dev *pdev)
71037046
ql_dbg(ql_dbg_aer, base_vha, 0x900f,
71047047
"pci_resume.\n");
71057048

7049+
ha->flags.eeh_busy = 0;
7050+
71067051
ret = qla2x00_wait_for_hba_online(base_vha);
71077052
if (ret != QLA_SUCCESS) {
71087053
ql_log(ql_log_fatal, base_vha, 0x9002,
71097054
"The device failed to resume I/O from slot/link_reset.\n");
71107055
}
7111-
7112-
ha->flags.eeh_busy = 0;
71137056
}
71147057

71157058
static void

0 commit comments

Comments
 (0)