diff options
author | Shannon Nelson <snelson@pensando.io> | 2022-01-24 10:53:11 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2022-01-25 11:15:09 +0000 |
commit | ec8ee714736e6b975b45b896ab66bbfa48fbe3d8 (patch) | |
tree | bbd24142fea5559021b792cc0423d67740724531 /drivers/net/ethernet/pensando/ionic/ionic_main.c | |
parent | ionic: remove the dbid_inuse bitmap (diff) | |
download | linux-ec8ee714736e6b975b45b896ab66bbfa48fbe3d8.tar.xz linux-ec8ee714736e6b975b45b896ab66bbfa48fbe3d8.zip |
ionic: stretch heartbeat detection
The driver can be premature in detecting stalled firmware
when the heartbeat is not updated because the firmware can
occasionally take a long time (more than 2 seconds) to service
a request, and doesn't update the heartbeat during that time.
The firmware heartbeat is not necessarily a steady 1 second
periodic beat, but better described as something that should
progress at least once in every DECVMD_TIMEOUT period.
The single-threaded design in the FW means that if a devcmd
or adminq request launches a large internal job, it is stuck
waiting for that job to finish before it can get back to
updating the heartbeat. Since all requests are "guaranteed"
to finish within the DEVCMD_TIMEOUT period, the driver needs
to less aggressive in checking the heartbeat progress.
We change our current 2 second window to something bigger than
DEVCMD_TIMEOUT which should take care of most of the issue.
We stop checking for the heartbeat while waiting for a request,
as long as we're still watching for the FW status. Lastly,
we make sure our FW status is up to date before running a
devcmd request.
Once we do this, we need to not check the heartbeat on DEV
commands because it may be stalled while we're on the fw_down
path. Instead, we can rely on the is_fw_running check.
Fixes: b2b9a8d7ed13 ("ionic: avoid races in ionic_heartbeat_check")
Signed-off-by: Brett Creeley <brett@pensando.io>
Signed-off-by: Shannon Nelson <snelson@pensando.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/pensando/ionic/ionic_main.c')
-rw-r--r-- | drivers/net/ethernet/pensando/ionic/ionic_main.c | 34 |
1 files changed, 14 insertions, 20 deletions
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 78771663808a..4029b4e021f8 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -358,13 +358,14 @@ int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx, if (remaining) break; - /* interrupt the wait if FW stopped */ + /* force a check of FW status and break out if FW reset */ + (void)ionic_heartbeat_check(lif->ionic); if ((test_bit(IONIC_LIF_F_FW_RESET, lif->state) && !lif->ionic->idev.fw_status_ready) || test_bit(IONIC_LIF_F_FW_STOPPING, lif->state)) { if (do_msg) - netdev_err(netdev, "%s (%d) interrupted, FW in reset\n", - name, ctx->cmd.cmd.opcode); + netdev_warn(netdev, "%s (%d) interrupted, FW in reset\n", + name, ctx->cmd.cmd.opcode); ctx->comp.comp.status = IONIC_RC_ERROR; return -ENXIO; } @@ -425,9 +426,9 @@ static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds, unsigned long start_time; unsigned long max_wait; unsigned long duration; + int done = 0; + bool fw_up; int opcode; - int hb = 0; - int done; int err; /* Wait for dev cmd to complete, retrying if we get EAGAIN, @@ -437,31 +438,24 @@ static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds, try_again: opcode = readb(&idev->dev_cmd_regs->cmd.cmd.opcode); start_time = jiffies; - do { + for (fw_up = ionic_is_fw_running(idev); + !done && fw_up && time_before(jiffies, max_wait); + fw_up = ionic_is_fw_running(idev)) { done = ionic_dev_cmd_done(idev); if (done) break; usleep_range(100, 200); - - /* Don't check the heartbeat on FW_CONTROL commands as they are - * notorious for interrupting the firmware's heartbeat update. - */ - if (opcode != IONIC_CMD_FW_CONTROL) - hb = ionic_heartbeat_check(ionic); - } while (!done && !hb && time_before(jiffies, max_wait)); + } duration = jiffies - start_time; dev_dbg(ionic->dev, "DEVCMD %s (%d) done=%d took %ld secs (%ld jiffies)\n", ionic_opcode_to_str(opcode), opcode, done, duration / HZ, duration); - if (!done && hb) { - /* It is possible (but unlikely) that FW was busy and missed a - * heartbeat check but is still alive and will process this - * request, so don't clean the dev_cmd in this case. - */ - dev_dbg(ionic->dev, "DEVCMD %s (%d) failed - FW halted\n", - ionic_opcode_to_str(opcode), opcode); + if (!done && !fw_up) { + ionic_dev_cmd_clean(ionic); + dev_warn(ionic->dev, "DEVCMD %s (%d) interrupted - FW is down\n", + ionic_opcode_to_str(opcode), opcode); return -ENXIO; } |