aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/mellanox/mlx5/core/health.c
diff options
context:
space:
mode:
authorMohamad Haj Yahia <mohamad@mellanox.com>2016-10-25 18:36:34 +0300
committerDavid S. Miller <davem@davemloft.net>2016-10-29 12:00:39 -0400
commit04c0c1ab38e95105d950db5b84e727637e149ce7 (patch)
tree74621d574ebfad08b9a8fc9c0ded5c03f67d01fe /drivers/net/ethernet/mellanox/mlx5/core/health.c
parentnet/mlx5: Fix race between PCI error handlers and health work (diff)
downloadlinux-dev-04c0c1ab38e95105d950db5b84e727637e149ce7.tar.xz
linux-dev-04c0c1ab38e95105d950db5b84e727637e149ce7.zip
net/mlx5: PCI error recovery health care simulation
In case that the kernel PCI error handlers are not called, we will trigger our own recovery flow. The health work will give priority to the kernel pci error handlers to recover the PCI by waiting for a small period, if the pci error handlers are not triggered the manual recovery flow will be executed. We don't save pci state in case of manual recovery because it will ruin the pci configuration space and we will lose dma sync. Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver') Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/health.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c45
1 files changed, 41 insertions, 4 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2d00022c92d8..5bcf93422ee0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -61,14 +61,15 @@ enum {
enum {
MLX5_NIC_IFC_FULL = 0,
MLX5_NIC_IFC_DISABLED = 1,
- MLX5_NIC_IFC_NO_DRAM_NIC = 2
+ MLX5_NIC_IFC_NO_DRAM_NIC = 2,
+ MLX5_NIC_IFC_INVALID = 3
};
enum {
MLX5_DROP_NEW_HEALTH_WORK,
};
-static u8 get_nic_interface(struct mlx5_core_dev *dev)
+static u8 get_nic_state(struct mlx5_core_dev *dev)
{
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
}
@@ -101,7 +102,7 @@ static int in_fatal(struct mlx5_core_dev *dev)
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
- if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
+ if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
return 1;
if (ioread32be(&h->fw_ver) == 0xffffffff)
@@ -131,7 +132,7 @@ unlock:
static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
{
- u8 nic_interface = get_nic_interface(dev);
+ u8 nic_interface = get_nic_state(dev);
switch (nic_interface) {
case MLX5_NIC_IFC_FULL:
@@ -153,8 +154,34 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
mlx5_disable_device(dev);
}
+static void health_recover(struct work_struct *work)
+{
+ struct mlx5_core_health *health;
+ struct delayed_work *dwork;
+ struct mlx5_core_dev *dev;
+ struct mlx5_priv *priv;
+ u8 nic_state;
+
+ dwork = container_of(work, struct delayed_work, work);
+ health = container_of(dwork, struct mlx5_core_health, recover_work);
+ priv = container_of(health, struct mlx5_priv, health);
+ dev = container_of(priv, struct mlx5_core_dev, priv);
+
+ nic_state = get_nic_state(dev);
+ if (nic_state == MLX5_NIC_IFC_INVALID) {
+ dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
+ return;
+ }
+
+ dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+ mlx5_recover_device(dev);
+}
+
+/* How much time to wait until health resetting the driver (in msecs) */
+#define MLX5_RECOVERY_DELAY_MSECS 60000
static void health_care(struct work_struct *work)
{
+ unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS);
struct mlx5_core_health *health;
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
@@ -164,6 +191,14 @@ static void health_care(struct work_struct *work)
dev = container_of(priv, struct mlx5_core_dev, priv);
mlx5_core_warn(dev, "handling bad device here\n");
mlx5_handle_bad_state(dev);
+
+ spin_lock(&health->wq_lock);
+ if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+ schedule_delayed_work(&health->recover_work, recover_delay);
+ else
+ dev_err(&dev->pdev->dev,
+ "new health works are not permitted at this stage\n");
+ spin_unlock(&health->wq_lock);
}
static const char *hsynd_str(u8 synd)
@@ -316,6 +351,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
spin_lock(&health->wq_lock);
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock(&health->wq_lock);
+ cancel_delayed_work_sync(&health->recover_work);
cancel_work_sync(&health->work);
}
@@ -344,6 +380,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
return -ENOMEM;
spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care);
+ INIT_DELAYED_WORK(&health->recover_work, health_recover);
return 0;
}