aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author Gavin Li <gavinl@nvidia.com> 2022-03-27 17:36:44 +0300
committer Saeed Mahameed <saeedm@nvidia.com> 2022-05-09 22:54:00 -0700
commit 37ca95e62ee23fa6d2c2c64e3dc40b4a0c0146dc (patch)
tree 3c5957b3f12e084ad9b389f3fe772daf965a61ce /drivers
parent net/mlx5: Add exit route when waiting for FW (diff)
download linux-dev-37ca95e62ee23fa6d2c2c64e3dc40b4a0c0146dc.tar.xz
linux-dev-37ca95e62ee23fa6d2c2c64e3dc40b4a0c0146dc.zip
net/mlx5: Increase FW pre-init timeout for health recovery
Currently, health recovery reloads the driver to recover it from fatal errors. During the driver's load process, it waits up to 120 seconds for FW to set the pre-init bit; beyond this threshold it aborts the load process. In some cases, such as a FW upgrade on the DPU, this timeout period is insufficient, and the user has no way to recover the host device.

To solve this issue, introduce a new FW pre-init timeout for health recovery, which is set to 2 hours.

The timeout for devlink reload and probe will use the original one because they are user-triggered flows, and therefore should not have a significantly long timeout, during which the user command would hang.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Diffstat (limited to '')
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/devlink.c 4
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c 2
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c 1
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h 1
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/main.c 23
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h 2
6 files changed, 20 insertions, 13 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index e8789e6d7e7b..f85166e587f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
*actions_performed = BIT(action);
switch (action) {
case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
break;
/* On fw_activate action, also driver is reloaded and reinit performed */
*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
default:
/* Unsupported action should not get to this function */
WARN_ON(1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index ca1aba845dd6..84df0d56a2b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
complete(&fw_reset->done);
} else {
- mlx5_load_one(dev);
+ mlx5_load_one(dev, false);
devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
index c1df0d3595d8..d758848d34d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
@@ -10,6 +10,7 @@ struct mlx5_timeouts {
static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
+ [MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
[MLX5_TO_FW_INIT_MS] = 2000,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
index 1c42ead782fa..257c03eeab36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
@@ -7,6 +7,7 @@
enum mlx5_timeouts_types {
/* pre init timeouts (not read from FW) */
MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
+ MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
MLX5_TO_FW_PRE_INIT_WAIT_MS,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f28a3526aafa..84f75aa25214 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_devcom_unregister_device(dev->priv.devcom);
}
-static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
+static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
{
int err;
@@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
/* wait for firmware to accept initialization segments configurations
*/
- err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
+ err = wait_fw_init(dev, timeout,
mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
if (err) {
mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
- mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
+ timeout);
return err;
}
@@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP;
- err = mlx5_function_setup(dev, true);
+ err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err)
goto err_function;
@@ -1336,9 +1336,10 @@ out:
mutex_unlock(&dev->intf_state_mutex);
}
-int mlx5_load_one(struct mlx5_core_dev *dev)
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
{
int err = 0;
+ u64 timeout;
mutex_lock(&dev->intf_state_mutex);
if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
/* remove any previous indication of internal error */
dev->state = MLX5_DEVICE_STATE_UP;
- err = mlx5_function_setup(dev, false);
+ if (recovery)
+ timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
+ else
+ timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
+ err = mlx5_function_setup(dev, timeout);
if (err)
goto err_function;
@@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
mlx5_pci_trace(dev, "Enter, loading driver..\n");
- err = mlx5_load_one(dev);
+ err = mlx5_load_one(dev, false);
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
!err ? "recovered" : "Failed");
@@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
}
static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
return -EIO;
}
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, true);
}
static struct pci_driver mlx5_core_driver = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a9b2d6ead542..9026be1d6223 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
int mlx5_init_one(struct mlx5_core_dev *dev);
void mlx5_uninit_one(struct mlx5_core_dev *dev);
void mlx5_unload_one(struct mlx5_core_dev *dev);
-int mlx5_load_one(struct mlx5_core_dev *dev);
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);