diff options
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/lag')
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c | 182 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 899 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 68 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c | 57 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c | 100 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h | 26 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 131 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h | 15 |
9 files changed, 1159 insertions, 326 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c new file mode 100644 index 000000000000..b8feaf0f5c4c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "lag.h" + +static char *get_str_mode_type(struct mlx5_lag *ldev) +{ + switch (ldev->mode) { + case MLX5_LAG_MODE_ROCE: return "roce"; + case MLX5_LAG_MODE_SRIOV: return "switchdev"; + case MLX5_LAG_MODE_MULTIPATH: return "multipath"; + case MLX5_LAG_MODE_MPESW: return "multiport_eswitch"; + default: return "invalid"; + } + + return NULL; +} + +static int type_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + char *mode = NULL; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + if (__mlx5_lag_is_active(ldev)) + mode = get_str_mode_type(ldev); + mutex_unlock(&ldev->lock); + if (!mode) + return -EINVAL; + seq_printf(file, "%s\n", mode); + + return 0; +} + +static int port_sel_mode_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + int ret = 0; + char *mode; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + if (__mlx5_lag_is_active(ldev)) + mode = mlx5_get_str_port_sel_mode(ldev->mode, ldev->mode_flags); + else + ret = -EINVAL; + mutex_unlock(&ldev->lock); + if (ret) + return ret; + + seq_printf(file, "%s\n", mode); + return 0; +} + +static int state_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + bool active; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + active = __mlx5_lag_is_active(ldev); + mutex_unlock(&ldev->lock); + seq_printf(file, "%s\n", active ? "active" : "disabled"); + return 0; +} + +static int flags_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + bool fdb_sel_mode_native; + struct mlx5_lag *ldev; + bool shared_fdb; + bool lag_active; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + lag_active = __mlx5_lag_is_active(ldev); + if (!lag_active) + goto unlock; + + shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); + fdb_sel_mode_native = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, + &ldev->mode_flags); + +unlock: + mutex_unlock(&ldev->lock); + if (!lag_active) + return -EINVAL; + + seq_printf(file, "%s:%s\n", "shared_fdb", shared_fdb ? "on" : "off"); + seq_printf(file, "%s:%s\n", "fdb_selection_mode", + fdb_sel_mode_native ? "native" : "affinity"); + return 0; +} + +static int mapping_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + u8 ports[MLX5_MAX_PORTS] = {}; + struct mlx5_lag *ldev; + bool hash = false; + bool lag_active; + int num_ports; + int i; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + lag_active = __mlx5_lag_is_active(ldev); + if (lag_active) { + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) { + mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, ports, + &num_ports); + hash = true; + } else { + for (i = 0; i < ldev->ports; i++) + ports[i] = ldev->v2p_map[i]; + num_ports = ldev->ports; + } + } + mutex_unlock(&ldev->lock); + if (!lag_active) + return -EINVAL; + + for (i = 0; i < num_ports; i++) { + if (hash) + seq_printf(file, "%d\n", ports[i] + 1); + else + seq_printf(file, "%d:%d\n", i + 1, ports[i]); + } + + return 0; +} + +static int members_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + int i; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + for (i = 0; i < ldev->ports; i++) { + if (!ldev->pf[i].dev) + continue; + seq_printf(file, "%s\n", dev_name(ldev->pf[i].dev->device)); + } + mutex_unlock(&ldev->lock); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(type); +DEFINE_SHOW_ATTRIBUTE(port_sel_mode); +DEFINE_SHOW_ATTRIBUTE(state); +DEFINE_SHOW_ATTRIBUTE(flags); +DEFINE_SHOW_ATTRIBUTE(mapping); +DEFINE_SHOW_ATTRIBUTE(members); + +void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev) +{ + struct dentry *dbg; + + dbg = debugfs_create_dir("lag", mlx5_debugfs_get_dev_root(dev)); + dev->priv.dbg.lag_debugfs = dbg; + + debugfs_create_file("type", 0444, dbg, dev, &type_fops); + debugfs_create_file("port_sel_mode", 0444, dbg, dev, &port_sel_mode_fops); + debugfs_create_file("state", 0444, dbg, dev, &state_fops); + debugfs_create_file("flags", 0444, dbg, dev, &flags_fops); + debugfs_create_file("mapping", 0444, dbg, dev, &mapping_fops); + debugfs_create_file("members", 0444, dbg, dev, &members_fops); +} + +void mlx5_ldev_remove_debugfs(struct dentry *dbg) +{ + debugfs_remove_recursive(dbg); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 4ddf6b330a44..a9f4ede4a9bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -31,14 +31,22 @@ */ #include <linux/netdevice.h> +#include <net/bonding.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/eswitch.h> #include <linux/mlx5/vport.h> #include "lib/devcom.h" #include "mlx5_core.h" #include "eswitch.h" +#include "esw/acl/ofld.h" #include "lag.h" #include "mp.h" +#include "mpesw.h" + +enum { + MLX5_LAG_EGRESS_PORT_1 = 1, + MLX5_LAG_EGRESS_PORT_2, +}; /* General purpose, use for short periods of time. * Beware of lock dependencies (preferably, no locks should be acquired @@ -46,28 +54,67 @@ */ static DEFINE_SPINLOCK(lag_lock); -static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1, - u8 remap_port2, bool shared_fdb, u8 flags) +static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags) { + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) + return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT; + + if (mode == MLX5_LAG_MODE_MPESW) + return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW; + + return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY; +} + +static u8 lag_active_port_bits(struct mlx5_lag *ldev) +{ + u8 enabled_ports[MLX5_MAX_PORTS] = {}; + u8 active_port = 0; + int num_enabled; + int idx; + + mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports, + &num_enabled); + for (idx = 0; idx < num_enabled; idx++) + active_port |= BIT_MASK(enabled_ports[idx]); + + return active_port; +} + +static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode, + unsigned long flags) +{ + bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, + &flags); + int port_sel_mode = get_port_sel_mode(mode, flags); u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {}; - void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); + void *lag_ctx; + lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG); + MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode); - MLX5_SET(lagc, lag_ctx, fdb_selection_mode, shared_fdb); - if (!(flags & MLX5_LAG_FLAG_HASH_BASED)) { - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); - } else { - MLX5_SET(lagc, lag_ctx, port_select_mode, - MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT); + switch (port_sel_mode) { + case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]); + break; + case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: + if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass)) + break; + + MLX5_SET(lagc, lag_ctx, active_port, + lag_active_port_bits(mlx5_lag_dev(dev))); + break; + default: + break; } + MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode); return mlx5_cmd_exec_in(dev, create_lag, in); } -static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1, - u8 remap_port2) +static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports, + u8 *ports) { u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); @@ -75,8 +122,8 @@ static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1, MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); MLX5_SET(modify_lag_in, in, field_select, 0x1); - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]); return mlx5_cmd_exec_in(dev, modify_lag, in); } @@ -101,6 +148,75 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) } EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); +static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports, + u8 *ports, int *num_disabled) +{ + int i; + + *num_disabled = 0; + for (i = 0; i < num_ports; i++) { + if (!tracker->netdev_state[i].tx_enabled || + !tracker->netdev_state[i].link_up) + ports[(*num_disabled)++] = i; + } +} + +void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, + u8 *ports, int *num_enabled) +{ + int i; + + *num_enabled = 0; + for (i = 0; i < num_ports; i++) { + if (tracker->netdev_state[i].tx_enabled && + tracker->netdev_state[i].link_up) + ports[(*num_enabled)++] = i; + } + + if (*num_enabled == 0) + mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled); +} + +static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev, + struct mlx5_lag *ldev, + struct lag_tracker *tracker, + unsigned long flags) +{ + char buf[MLX5_MAX_PORTS * 10 + 1] = {}; + u8 enabled_ports[MLX5_MAX_PORTS] = {}; + int written = 0; + int num_enabled; + int idx; + int err; + int i; + int j; + + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { + mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports, + &num_enabled); + for (i = 0; i < num_enabled; i++) { + err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1); + if (err != 3) + return; + written += err; + } + buf[written - 2] = 0; + mlx5_core_info(dev, "lag map active ports: %s\n", buf); + } else { + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + err = scnprintf(buf + written, 10, + " port %d:%d", i + 1, ldev->v2p_map[idx]); + if (err != 9) + return; + written += err; + } + } + mlx5_core_info(dev, "lag map:%s\n", buf); + } +} + static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr); static void mlx5_do_bond_work(struct work_struct *work); @@ -112,8 +228,10 @@ static void mlx5_ldev_free(struct kref *ref) if (ldev->nb.notifier_call) unregister_netdevice_notifier_net(&init_net, &ldev->nb); mlx5_lag_mp_cleanup(ldev); - cancel_delayed_work_sync(&ldev->bond_work); + mlx5_lag_mpesw_cleanup(ldev); + cancel_work_sync(&ldev->mpesw_work); destroy_workqueue(ldev->wq); + mutex_destroy(&ldev->lock); kfree(ldev); } @@ -143,6 +261,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) } kref_init(&ldev->ref); + mutex_init(&ldev->lock); INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); ldev->nb.notifier_call = mlx5_lag_netdev_event; @@ -150,12 +269,17 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) ldev->nb.notifier_call = NULL; mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); } + ldev->mode = MLX5_LAG_MODE_NONE; err = mlx5_lag_mp_init(ldev); if (err) mlx5_core_err(dev, "Failed to init multipath lag err=%d\n", err); + mlx5_lag_mpesw_init(ldev); + ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports); + ldev->buckets = 1; + return ldev; } @@ -164,7 +288,7 @@ int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, { int i; - for (i = 0; i < MLX5_MAX_PORTS; i++) + for (i = 0; i < ldev->ports; i++) if (ldev->pf[i].netdev == ndev) return i; @@ -173,108 +297,278 @@ int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev) { - return !!(ldev->flags & MLX5_LAG_FLAG_ROCE); + return ldev->mode == MLX5_LAG_MODE_ROCE; } static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev) { - return !!(ldev->flags & MLX5_LAG_FLAG_SRIOV); + return ldev->mode == MLX5_LAG_MODE_SRIOV; } +/* Create a mapping between steering slots and active ports. + * As we have ldev->buckets slots per port first assume the native + * mapping should be used. + * If there are ports that are disabled fill the relevant slots + * with mapping that points to active ports. + */ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, - u8 *port1, u8 *port2) + u8 num_ports, + u8 buckets, + u8 *ports) { - bool p1en; - bool p2en; + int disabled[MLX5_MAX_PORTS] = {}; + int enabled[MLX5_MAX_PORTS] = {}; + int disabled_ports_num = 0; + int enabled_ports_num = 0; + int idx; + u32 rand; + int i; + int j; - p1en = tracker->netdev_state[MLX5_LAG_P1].tx_enabled && - tracker->netdev_state[MLX5_LAG_P1].link_up; + for (i = 0; i < num_ports; i++) { + if (tracker->netdev_state[i].tx_enabled && + tracker->netdev_state[i].link_up) + enabled[enabled_ports_num++] = i; + else + disabled[disabled_ports_num++] = i; + } - p2en = tracker->netdev_state[MLX5_LAG_P2].tx_enabled && - tracker->netdev_state[MLX5_LAG_P2].link_up; + /* Use native mapping by default where each port's buckets + * point the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc + */ + for (i = 0; i < num_ports; i++) + for (j = 0; j < buckets; j++) { + idx = i * buckets + j; + ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i; + } - *port1 = 1; - *port2 = 2; - if ((!p1en && !p2en) || (p1en && p2en)) + /* If all ports are disabled/enabled keep native mapping */ + if (enabled_ports_num == num_ports || + disabled_ports_num == num_ports) return; - if (p1en) - *port2 = 1; - else - *port1 = 2; + /* Go over the disabled ports and for each assign a random active port */ + for (i = 0; i < disabled_ports_num; i++) { + for (j = 0; j < buckets; j++) { + get_random_bytes(&rand, 4); + ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1; + } + } +} + +static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].has_drop) + return true; + return false; } -static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 v2p_port1, u8 v2p_port2) +static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) { + if (!ldev->pf[i].has_drop) + continue; + + mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch, + MLX5_VPORT_UPLINK); + ldev->pf[i].has_drop = false; + } +} + +static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev, + struct lag_tracker *tracker) +{ + u8 disabled_ports[MLX5_MAX_PORTS] = {}; + struct mlx5_core_dev *dev; + int disabled_index; + int num_disabled; + int err; + int i; + + /* First delete the current drop rule so there won't be any dropped + * packets + */ + mlx5_lag_drop_rule_cleanup(ldev); + + if (!ldev->tracker.has_inactive) + return; + + mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled); + + for (i = 0; i < num_disabled; i++) { + disabled_index = disabled_ports[i]; + dev = ldev->pf[disabled_index].dev; + err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch, + MLX5_VPORT_UPLINK); + if (!err) + ldev->pf[disabled_index].has_drop = true; + else + mlx5_core_err(dev, + "Failed to create lag drop rule, error: %d", err); + } +} + +static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports) +{ + u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; + void *lag_ctx; + + lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); + + MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); + MLX5_SET(modify_lag_in, in, field_select, 0x2); + + MLX5_SET(lagc, lag_ctx, active_port, ports); + + return mlx5_cmd_exec_in(dev, modify_lag, in); +} + +static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + u8 active_ports; + int ret; + + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) { + ret = mlx5_lag_port_sel_modify(ldev, ports); + if (ret || + !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table_bypass)) + return ret; - if (ldev->flags & MLX5_LAG_FLAG_HASH_BASED) - return mlx5_lag_port_sel_modify(ldev, v2p_port1, v2p_port2); - return mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2); + active_ports = lag_active_port_bits(ldev); + + return mlx5_cmd_modify_active_port(dev0, active_ports); + } + return mlx5_cmd_modify_lag(dev0, ldev->ports, ports); } void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker) { + u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {}; struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; - u8 v2p_port1, v2p_port2; + int idx; int err; + int i; + int j; - mlx5_infer_tx_affinity_mapping(tracker, &v2p_port1, - &v2p_port2); + mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports); - if (v2p_port1 != ldev->v2p_map[MLX5_LAG_P1] || - v2p_port2 != ldev->v2p_map[MLX5_LAG_P2]) { - err = _mlx5_modify_lag(ldev, v2p_port1, v2p_port2); - if (err) { - mlx5_core_err(dev0, - "Failed to modify LAG (%d)\n", - err); - return; + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + if (ports[idx] == ldev->v2p_map[idx]) + continue; + err = _mlx5_modify_lag(ldev, ports); + if (err) { + mlx5_core_err(dev0, + "Failed to modify LAG (%d)\n", + err); + return; + } + memcpy(ldev->v2p_map, ports, sizeof(ports)); + + mlx5_lag_print_mapping(dev0, ldev, tracker, + ldev->mode_flags); + break; } - ldev->v2p_map[MLX5_LAG_P1] = v2p_port1; - ldev->v2p_map[MLX5_LAG_P2] = v2p_port2; - mlx5_core_info(dev0, "modify lag map port 1:%d port 2:%d", - ldev->v2p_map[MLX5_LAG_P1], - ldev->v2p_map[MLX5_LAG_P2]); } + + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + !(ldev->mode == MLX5_LAG_MODE_ROCE)) + mlx5_lag_drop_rule_setup(ldev, tracker); +} + +static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev, + unsigned long *flags) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + + if (!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table)) { + if (ldev->ports > 2) + return -EINVAL; + return 0; + } + + if (ldev->ports > 2) + ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS; + + set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags); + + return 0; } -static void mlx5_lag_set_port_sel_mode(struct mlx5_lag *ldev, - struct lag_tracker *tracker, u8 *flags) +static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + enum mlx5_lag_mode mode, + unsigned long *flags) { - bool roce_lag = !!(*flags & MLX5_LAG_FLAG_ROCE); struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1]; - if (roce_lag || - !MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) || - tracker->tx_type != NETDEV_LAG_TX_TYPE_HASH) + if (mode == MLX5_LAG_MODE_MPESW) return; - *flags |= MLX5_LAG_FLAG_HASH_BASED; + + if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) && + tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH) + set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags); +} + +static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode, + struct lag_tracker *tracker, bool shared_fdb, + unsigned long *flags) +{ + bool roce_lag = mode == MLX5_LAG_MODE_ROCE; + + *flags = 0; + if (shared_fdb) { + set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags); + set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags); + } + + if (mode == MLX5_LAG_MODE_MPESW) + set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags); + + if (roce_lag) + return mlx5_lag_set_port_sel_mode_roce(ldev, flags); + + mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags); + return 0; } -static char *get_str_port_sel_mode(u8 flags) +char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags) { - if (flags & MLX5_LAG_FLAG_HASH_BASED) - return "hash"; - return "queue_affinity"; + int port_sel_mode = get_port_sel_mode(mode, flags); + + switch (port_sel_mode) { + case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity"; + case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash"; + case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw"; + default: return "invalid"; + } } static int mlx5_create_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker, - bool shared_fdb, u8 flags) + enum mlx5_lag_mode mode, + unsigned long flags) { + bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags); struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; int err; - mlx5_core_info(dev0, "lag map port 1:%d port 2:%d shared_fdb:%d mode:%s", - ldev->v2p_map[MLX5_LAG_P1], ldev->v2p_map[MLX5_LAG_P2], - shared_fdb, get_str_port_sel_mode(flags)); + if (tracker) + mlx5_lag_print_mapping(dev0, ldev, tracker, flags); + mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n", + shared_fdb, mlx5_get_str_port_sel_mode(mode, flags)); - err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[MLX5_LAG_P1], - ldev->v2p_map[MLX5_LAG_P2], shared_fdb, flags); + err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags); if (err) { mlx5_core_err(dev0, "Failed to create LAG (%d)\n", @@ -303,31 +597,35 @@ static int mlx5_create_lag(struct mlx5_lag *ldev, int mlx5_activate_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker, - u8 flags, + enum mlx5_lag_mode mode, bool shared_fdb) { - bool roce_lag = !!(flags & MLX5_LAG_FLAG_ROCE); + bool roce_lag = mode == MLX5_LAG_MODE_ROCE; struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + unsigned long flags = 0; int err; - mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[MLX5_LAG_P1], - &ldev->v2p_map[MLX5_LAG_P2]); - mlx5_lag_set_port_sel_mode(ldev, tracker, &flags); - if (flags & MLX5_LAG_FLAG_HASH_BASED) { - err = mlx5_lag_port_sel_create(ldev, tracker->hash_type, - ldev->v2p_map[MLX5_LAG_P1], - ldev->v2p_map[MLX5_LAG_P2]); - if (err) { - mlx5_core_err(dev0, - "Failed to create LAG port selection(%d)\n", - err); - return err; + err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags); + if (err) + return err; + + if (mode != MLX5_LAG_MODE_MPESW) { + mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map); + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { + err = mlx5_lag_port_sel_create(ldev, tracker->hash_type, + ldev->v2p_map); + if (err) { + mlx5_core_err(dev0, + "Failed to create LAG port selection(%d)\n", + err); + return err; + } } } - err = mlx5_create_lag(ldev, tracker, shared_fdb, flags); + err = mlx5_create_lag(ldev, tracker, mode, flags); if (err) { - if (flags & MLX5_LAG_FLAG_HASH_BASED) + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) mlx5_lag_port_sel_destroy(ldev); if (roce_lag) mlx5_core_err(dev0, @@ -339,26 +637,32 @@ int mlx5_activate_lag(struct mlx5_lag *ldev, return err; } - ldev->flags |= flags; - ldev->shared_fdb = shared_fdb; + if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + !roce_lag) + mlx5_lag_drop_rule_setup(ldev, tracker); + + ldev->mode = mode; + ldev->mode_flags = flags; return 0; } static int mlx5_deactivate_lag(struct mlx5_lag *ldev) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; bool roce_lag = __mlx5_lag_is_roce(ldev); - u8 flags = ldev->flags; + unsigned long flags = ldev->mode_flags; int err; - ldev->flags &= ~MLX5_LAG_MODE_FLAGS; + ldev->mode = MLX5_LAG_MODE_NONE; + ldev->mode_flags = 0; mlx5_lag_mp_reset(ldev); - if (ldev->shared_fdb) { - mlx5_eswitch_offloads_destroy_single_fdb(ldev->pf[MLX5_LAG_P1].dev->priv.eswitch, - ldev->pf[MLX5_LAG_P2].dev->priv.eswitch); - ldev->shared_fdb = false; + if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) { + mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch, + dev1->priv.eswitch); + clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags); } MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); @@ -372,32 +676,55 @@ static int mlx5_deactivate_lag(struct mlx5_lag *ldev) "Failed to deactivate VF LAG; driver restart required\n" "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n"); } - } else if (flags & MLX5_LAG_FLAG_HASH_BASED) { - mlx5_lag_port_sel_destroy(ldev); + return err; } - return err; + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) + mlx5_lag_port_sel_destroy(ldev); + if (mlx5_lag_has_drop_rule(ldev)) + mlx5_lag_drop_rule_cleanup(ldev); + + return 0; } +#define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2 static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) { - if (!ldev->pf[MLX5_LAG_P1].dev || !ldev->pf[MLX5_LAG_P2].dev) - return false; +#ifdef CONFIG_MLX5_ESWITCH + struct mlx5_core_dev *dev; + u8 mode; +#endif + int i; + + for (i = 0; i < ldev->ports; i++) + if (!ldev->pf[i].dev) + return false; #ifdef CONFIG_MLX5_ESWITCH - return mlx5_esw_lag_prereq(ldev->pf[MLX5_LAG_P1].dev, - ldev->pf[MLX5_LAG_P2].dev); + dev = ldev->pf[MLX5_LAG_P1].dev; + if ((mlx5_sriov_is_enabled(dev)) && !is_mdev_switchdev_mode(dev)) + return false; + + mode = mlx5_eswitch_mode(dev); + for (i = 0; i < ldev->ports; i++) + if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode) + return false; + + if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS) + return false; #else - return (!mlx5_sriov_is_enabled(ldev->pf[MLX5_LAG_P1].dev) && - !mlx5_sriov_is_enabled(ldev->pf[MLX5_LAG_P2].dev)); + for (i = 0; i < ldev->ports; i++) + if (mlx5_sriov_is_enabled(ldev->pf[i].dev)) + return false; #endif + return true; } static void mlx5_lag_add_devices(struct mlx5_lag *ldev) { int i; - for (i = 0; i < MLX5_MAX_PORTS; i++) { + for (i = 0; i < ldev->ports; i++) { if (!ldev->pf[i].dev) continue; @@ -414,7 +741,7 @@ static void mlx5_lag_remove_devices(struct mlx5_lag *ldev) { int i; - for (i = 0; i < MLX5_MAX_PORTS; i++) { + for (i = 0; i < ldev->ports; i++) { if (!ldev->pf[i].dev) continue; @@ -427,13 +754,14 @@ static void mlx5_lag_remove_devices(struct mlx5_lag *ldev) } } -static void mlx5_disable_lag(struct mlx5_lag *ldev) +void mlx5_disable_lag(struct mlx5_lag *ldev) { + bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; - bool shared_fdb = ldev->shared_fdb; bool roce_lag; int err; + int i; roce_lag = __mlx5_lag_is_roce(ldev); @@ -444,7 +772,8 @@ static void mlx5_disable_lag(struct mlx5_lag *ldev) dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); } - mlx5_nic_vport_disable_roce(dev1); + for (i = 1; i < ldev->ports; i++) + mlx5_nic_vport_disable_roce(ldev->pf[i].dev); } err = mlx5_deactivate_lag(ldev); @@ -462,7 +791,7 @@ static void mlx5_disable_lag(struct mlx5_lag *ldev) } } -static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) +bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; @@ -481,13 +810,42 @@ static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) return false; } +static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev) +{ + bool roce_lag = true; + int i; + + for (i = 0; i < ldev->ports; i++) + roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev); + +#ifdef CONFIG_MLX5_ESWITCH + for (i = 0; i < ldev->ports; i++) + roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev); +#endif + + return roce_lag; +} + +static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond) +{ + return do_bond && __mlx5_lag_is_active(ldev) && + ldev->mode != MLX5_LAG_MODE_MPESW; +} + +static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond) +{ + return !do_bond && __mlx5_lag_is_active(ldev) && + ldev->mode != MLX5_LAG_MODE_MPESW; +} + static void mlx5_do_bond(struct mlx5_lag *ldev) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; - struct lag_tracker tracker; + struct lag_tracker tracker = { }; bool do_bond, roce_lag; int err; + int i; if (!mlx5_lag_is_ready(ldev)) { do_bond = false; @@ -504,21 +862,14 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) if (do_bond && !__mlx5_lag_is_active(ldev)) { bool shared_fdb = mlx5_shared_fdb_supported(ldev); - roce_lag = !mlx5_sriov_is_enabled(dev0) && - !mlx5_sriov_is_enabled(dev1); - -#ifdef CONFIG_MLX5_ESWITCH - roce_lag = roce_lag && - dev0->priv.eswitch->mode == MLX5_ESWITCH_NONE && - dev1->priv.eswitch->mode == MLX5_ESWITCH_NONE; -#endif + roce_lag = mlx5_lag_is_roce_lag(ldev); if (shared_fdb || roce_lag) mlx5_lag_remove_devices(ldev); err = mlx5_activate_lag(ldev, &tracker, - roce_lag ? MLX5_LAG_FLAG_ROCE : - MLX5_LAG_FLAG_SRIOV, + roce_lag ? MLX5_LAG_MODE_ROCE : + MLX5_LAG_MODE_SRIOV, shared_fdb); if (err) { if (shared_fdb || roce_lag) @@ -528,7 +879,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) } else if (roce_lag) { dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - mlx5_nic_vport_enable_roce(dev1); + for (i = 1; i < ldev->ports; i++) + mlx5_nic_vport_enable_roce(ldev->pf[i].dev); } else if (shared_fdb) { dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); @@ -548,9 +900,9 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) return; } } - } else if (do_bond && __mlx5_lag_is_active(ldev)) { + } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); - } else if (!do_bond && __mlx5_lag_is_active(ldev)) { + } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) { mlx5_disable_lag(ldev); } } @@ -560,31 +912,11 @@ static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) queue_delayed_work(ldev->wq, &ldev->bond_work, delay); } -static void mlx5_lag_lock_eswitches(struct mlx5_core_dev *dev0, - struct mlx5_core_dev *dev1) -{ - if (dev0) - mlx5_esw_lock(dev0->priv.eswitch); - if (dev1) - mlx5_esw_lock(dev1->priv.eswitch); -} - -static void mlx5_lag_unlock_eswitches(struct mlx5_core_dev *dev0, - struct mlx5_core_dev *dev1) -{ - if (dev1) - mlx5_esw_unlock(dev1->priv.eswitch); - if (dev0) - mlx5_esw_unlock(dev0->priv.eswitch); -} - static void mlx5_do_bond_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag, bond_work); - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; - struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; int status; status = mlx5_dev_list_trylock(); @@ -593,27 +925,29 @@ static void mlx5_do_bond_work(struct work_struct *work) return; } + mutex_lock(&ldev->lock); if (ldev->mode_changes_in_progress) { + mutex_unlock(&ldev->lock); mlx5_dev_list_unlock(); mlx5_queue_bond_work(ldev, HZ); return; } - mlx5_lag_lock_eswitches(dev0, dev1); mlx5_do_bond(ldev); - mlx5_lag_unlock_eswitches(dev0, dev1); + mutex_unlock(&ldev->lock); mlx5_dev_list_unlock(); } static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, struct lag_tracker *tracker, - struct net_device *ndev, struct netdev_notifier_changeupper_info *info) { struct net_device *upper = info->upper_dev, *ndev_tmp; struct netdev_lag_upper_info *lag_upper_info = NULL; bool is_bonded, is_in_lag, mode_supported; - int bond_status = 0; + bool has_inactive = 0; + struct slave *slave; + u8 bond_status = 0; int num_slaves = 0; int changed = 0; int idx; @@ -632,15 +966,19 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, rcu_read_lock(); for_each_netdev_in_bond_rcu(upper, ndev_tmp) { idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); - if (idx >= 0) + if (idx >= 0) { + slave = bond_slave_get_rcu(ndev_tmp); + if (slave) + has_inactive |= bond_is_slave_inactive(slave); bond_status |= (1 << idx); + } num_slaves++; } rcu_read_unlock(); /* None of this lagdev's netdevs are slaves of this master. */ - if (!(bond_status & 0x3)) + if (!(bond_status & GENMASK(ldev->ports - 1, 0))) return 0; if (lag_upper_info) { @@ -648,11 +986,13 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, tracker->hash_type = lag_upper_info->hash_type; } + tracker->has_inactive = has_inactive; /* Determine bonding status: * A device is considered bonded if both its physical ports are slaves * of the same lag master, and only them. */ - is_in_lag = num_slaves == MLX5_MAX_PORTS && bond_status == 0x3; + is_in_lag = num_slaves == ldev->ports && + bond_status == GENMASK(ldev->ports - 1, 0); /* Lag mode must be activebackup or hash. */ mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP || @@ -704,6 +1044,39 @@ static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev, return 1; } +static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev) +{ + struct net_device *ndev_tmp; + struct slave *slave; + bool has_inactive = 0; + int idx; + + if (!netif_is_lag_master(ndev)) + return 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(ndev, ndev_tmp) { + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); + if (idx < 0) + continue; + + slave = bond_slave_get_rcu(ndev_tmp); + if (slave) + has_inactive |= bond_is_slave_inactive(slave); + } + rcu_read_unlock(); + + if (tracker->has_inactive == has_inactive) + return 0; + + tracker->has_inactive = has_inactive; + + return 1; +} + +/* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -712,7 +1085,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, struct mlx5_lag *ldev; int changed = 0; - if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE)) + if (event != NETDEV_CHANGEUPPER && + event != NETDEV_CHANGELOWERSTATE && + event != NETDEV_CHANGEINFODATA) return NOTIFY_DONE; ldev = container_of(this, struct mlx5_lag, nb); @@ -721,13 +1096,15 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, switch (event) { case NETDEV_CHANGEUPPER: - changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev, - ptr); + changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr); break; case NETDEV_CHANGELOWERSTATE: changed = mlx5_handle_changelowerstate_event(ldev, &tracker, ndev, ptr); break; + case NETDEV_CHANGEINFODATA: + changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev); + break; } ldev->tracker = tracker; @@ -743,30 +1120,32 @@ static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev, struct net_device *netdev) { unsigned int fn = mlx5_get_dev_index(dev); + unsigned long flags; - if (fn >= MLX5_MAX_PORTS) + if (fn >= ldev->ports) return; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev->pf[fn].netdev = netdev; ldev->tracker.netdev_state[fn].link_up = 0; ldev->tracker.netdev_state[fn].tx_enabled = 0; - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); } static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev, struct net_device *netdev) { + unsigned long flags; int i; - spin_lock(&lag_lock); - for (i = 0; i < MLX5_MAX_PORTS; i++) { + spin_lock_irqsave(&lag_lock, flags); + for (i = 0; i < ldev->ports; i++) { if (ldev->pf[i].netdev == netdev) { ldev->pf[i].netdev = NULL; break; } } - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); } static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, @@ -774,24 +1153,23 @@ static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, { unsigned int fn = mlx5_get_dev_index(dev); - if (fn >= MLX5_MAX_PORTS) + if (fn >= ldev->ports) return; ldev->pf[fn].dev = dev; dev->priv.lag = ldev; } -/* Must be called with intf_mutex held */ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev) { int i; - for (i = 0; i < MLX5_MAX_PORTS; i++) + for (i = 0; i < ldev->ports; i++) if (ldev->pf[i].dev == dev) break; - if (i == MLX5_MAX_PORTS) + if (i == ldev->ports) return; ldev->pf[i].dev = NULL; @@ -804,12 +1182,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) struct mlx5_lag *ldev = NULL; struct mlx5_core_dev *tmp_dev; - if (!MLX5_CAP_GEN(dev, vport_group_manager) || - !MLX5_CAP_GEN(dev, lag_master) || - MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS) - return 0; - - tmp_dev = mlx5_get_next_phys_dev(dev); + tmp_dev = mlx5_get_next_phys_dev_lag(dev); if (tmp_dev) ldev = tmp_dev->priv.lag; @@ -819,13 +1192,18 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) mlx5_core_err(dev, "Failed to alloc lag dev\n"); return 0; } - } else { - if (ldev->mode_changes_in_progress) - return -EAGAIN; - mlx5_ldev_get(ldev); + mlx5_ldev_add_mdev(ldev, dev); + return 0; } + mutex_lock(&ldev->lock); + if (ldev->mode_changes_in_progress) { + mutex_unlock(&ldev->lock); + return -EAGAIN; + } + mlx5_ldev_get(ldev); mlx5_ldev_add_mdev(ldev, dev); + mutex_unlock(&ldev->lock); return 0; } @@ -838,15 +1216,19 @@ void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev) if (!ldev) return; + /* mdev is being removed, might as well remove debugfs + * as early as possible. + */ + mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs); recheck: - mlx5_dev_list_lock(); + mutex_lock(&ldev->lock); if (ldev->mode_changes_in_progress) { - mlx5_dev_list_unlock(); + mutex_unlock(&ldev->lock); msleep(100); goto recheck; } mlx5_ldev_remove_mdev(ldev, dev); - mlx5_dev_list_unlock(); + mutex_unlock(&ldev->lock); mlx5_ldev_put(ldev); } @@ -854,35 +1236,45 @@ void mlx5_lag_add_mdev(struct mlx5_core_dev *dev) { int err; + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + !MLX5_CAP_GEN(dev, lag_master) || + (MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS || + MLX5_CAP_GEN(dev, num_lag_ports) <= 1)) + return; + recheck: mlx5_dev_list_lock(); err = __mlx5_lag_dev_add_mdev(dev); + mlx5_dev_list_unlock(); + if (err) { - mlx5_dev_list_unlock(); msleep(100); goto recheck; } - mlx5_dev_list_unlock(); + mlx5_ldev_add_debugfs(dev); } -/* Must be called with intf_mutex held */ void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, struct net_device *netdev) { struct mlx5_lag *ldev; + bool lag_is_active; ldev = mlx5_lag_dev(dev); if (!ldev) return; + mutex_lock(&ldev->lock); mlx5_ldev_remove_netdev(ldev, netdev); - ldev->flags &= ~MLX5_LAG_FLAG_READY; + clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags); - if (__mlx5_lag_is_active(ldev)) + lag_is_active = __mlx5_lag_is_active(ldev); + mutex_unlock(&ldev->lock); + + if (lag_is_active) mlx5_queue_bond_work(ldev, 0); } -/* Must be called with intf_mutex held */ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, struct net_device *netdev) { @@ -893,26 +1285,29 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, if (!ldev) return; + mutex_lock(&ldev->lock); mlx5_ldev_add_netdev(ldev, dev, netdev); - for (i = 0; i < MLX5_MAX_PORTS; i++) - if (!ldev->pf[i].dev) + for (i = 0; i < ldev->ports; i++) + if (!ldev->pf[i].netdev) break; - if (i >= MLX5_MAX_PORTS) - ldev->flags |= MLX5_LAG_FLAG_READY; + if (i >= ldev->ports) + set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags); + mutex_unlock(&ldev->lock); mlx5_queue_bond_work(ldev, 0); } bool mlx5_lag_is_roce(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; + unsigned long flags; bool res; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); res = ldev && __mlx5_lag_is_roce(ldev); - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return res; } @@ -921,27 +1316,45 @@ EXPORT_SYMBOL(mlx5_lag_is_roce); bool mlx5_lag_is_active(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; + unsigned long flags; bool res; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); res = ldev && __mlx5_lag_is_active(ldev); - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return res; } EXPORT_SYMBOL(mlx5_lag_is_active); +bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res = 0; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (ldev) + res = test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags); + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_mode_is_hash); + bool mlx5_lag_is_master(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; + unsigned long flags; bool res; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); res = ldev && __mlx5_lag_is_active(ldev) && dev == ldev->pf[MLX5_LAG_P1].dev; - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return res; } @@ -950,12 +1363,13 @@ EXPORT_SYMBOL(mlx5_lag_is_master); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; + unsigned long flags; bool res; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); res = ldev && __mlx5_lag_is_sriov(ldev); - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return res; } @@ -964,12 +1378,14 @@ EXPORT_SYMBOL(mlx5_lag_is_sriov); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; + unsigned long flags; bool res; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); - res = ldev && __mlx5_lag_is_sriov(ldev) && ldev->shared_fdb; - spin_unlock(&lag_lock); + res = ldev && __mlx5_lag_is_sriov(ldev) && + test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); + spin_unlock_irqrestore(&lag_lock, flags); return res; } @@ -977,8 +1393,6 @@ EXPORT_SYMBOL(mlx5_lag_is_shared_fdb); void mlx5_lag_disable_change(struct mlx5_core_dev *dev) { - struct mlx5_core_dev *dev0; - struct mlx5_core_dev *dev1; struct mlx5_lag *ldev; ldev = mlx5_lag_dev(dev); @@ -986,16 +1400,13 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev) return; mlx5_dev_list_lock(); - - dev0 = ldev->pf[MLX5_LAG_P1].dev; - dev1 = ldev->pf[MLX5_LAG_P2].dev; + mutex_lock(&ldev->lock); ldev->mode_changes_in_progress++; - if (__mlx5_lag_is_active(ldev)) { - mlx5_lag_lock_eswitches(dev0, dev1); + if (__mlx5_lag_is_active(ldev)) mlx5_disable_lag(ldev); - mlx5_lag_unlock_eswitches(dev0, dev1); - } + + mutex_unlock(&ldev->lock); mlx5_dev_list_unlock(); } @@ -1007,9 +1418,9 @@ void mlx5_lag_enable_change(struct mlx5_core_dev *dev) if (!ldev) return; - mlx5_dev_list_lock(); + mutex_lock(&ldev->lock); ldev->mode_changes_in_progress--; - mlx5_dev_list_unlock(); + mutex_unlock(&ldev->lock); mlx5_queue_bond_work(ldev, 0); } @@ -1017,17 +1428,21 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) { struct net_device *ndev = NULL; struct mlx5_lag *ldev; + unsigned long flags; + int i; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); if (!(ldev && __mlx5_lag_is_roce(ldev))) goto unlock; if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { - ndev = ldev->tracker.netdev_state[MLX5_LAG_P1].tx_enabled ? - ldev->pf[MLX5_LAG_P1].netdev : - ldev->pf[MLX5_LAG_P2].netdev; + for (i = 0; i < ldev->ports; i++) + if (ldev->tracker.netdev_state[i].tx_enabled) + ndev = ldev->pf[i].netdev; + if (!ndev) + ndev = ldev->pf[ldev->ports - 1].netdev; } else { ndev = ldev->pf[MLX5_LAG_P1].netdev; } @@ -1035,7 +1450,7 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) dev_hold(ndev); unlock: - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return ndev; } @@ -1045,32 +1460,49 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, struct net_device *slave) { struct mlx5_lag *ldev; + unsigned long flags; u8 port = 0; + int i; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); if (!(ldev && __mlx5_lag_is_roce(ldev))) goto unlock; - if (ldev->pf[MLX5_LAG_P1].netdev == slave) - port = MLX5_LAG_P1; - else - port = MLX5_LAG_P2; + for (i = 0; i < ldev->ports; i++) { + if (ldev->pf[MLX5_LAG_P1].netdev == slave) { + port = i; + break; + } + } - port = ldev->v2p_map[port]; + port = ldev->v2p_map[port * ldev->buckets]; unlock: - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return port; } EXPORT_SYMBOL(mlx5_lag_get_slave_port); +u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return 0; + + return ldev->ports; +} +EXPORT_SYMBOL(mlx5_lag_get_num_ports); + struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev) { struct mlx5_core_dev *peer_dev = NULL; struct mlx5_lag *ldev; + unsigned long flags; - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); if (!ldev) goto unlock; @@ -1080,7 +1512,7 @@ struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev) ldev->pf[MLX5_LAG_P1].dev; unlock: - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); return peer_dev; } EXPORT_SYMBOL(mlx5_lag_get_peer_mdev); @@ -1091,8 +1523,9 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, size_t *offsets) { int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); - struct mlx5_core_dev *mdev[MLX5_MAX_PORTS]; + struct mlx5_core_dev **mdev; struct mlx5_lag *ldev; + unsigned long flags; int num_ports; int ret, i, j; void *out; @@ -1101,19 +1534,25 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, if (!out) return -ENOMEM; + mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL); + if (!mdev) { + ret = -ENOMEM; + goto free_out; + } + memset(values, 0, sizeof(*values) * num_counters); - spin_lock(&lag_lock); + spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); if (ldev && __mlx5_lag_is_active(ldev)) { - num_ports = MLX5_MAX_PORTS; - mdev[MLX5_LAG_P1] = ldev->pf[MLX5_LAG_P1].dev; - mdev[MLX5_LAG_P2] = ldev->pf[MLX5_LAG_P2].dev; + num_ports = ldev->ports; + for (i = 0; i < ldev->ports; i++) + mdev[i] = ldev->pf[i].dev; } else { num_ports = 1; mdev[MLX5_LAG_P1] = dev; } - spin_unlock(&lag_lock); + spin_unlock_irqrestore(&lag_lock, flags); for (i = 0; i < num_ports; ++i) { u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {}; @@ -1123,13 +1562,15 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in, out); if (ret) - goto free; + goto free_mdev; for (j = 0; j < num_counters; ++j) values[j] += be64_to_cpup((__be64 *)(out + offsets[j])); } -free: +free_mdev: + kvfree(mdev); +free_out: kvfree(out); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index e5d231c31b54..ce2ce8ccbd70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -4,9 +4,13 @@ #ifndef __MLX5_LAG_H__ #define __MLX5_LAG_H__ +#include <linux/debugfs.h> + +#define MLX5_LAG_MAX_HASH_BUCKETS 16 #include "mlx5_core.h" #include "mp.h" #include "port_sel.h" +#include "mpesw.h" enum { MLX5_LAG_P1, @@ -14,20 +18,27 @@ enum { }; enum { - MLX5_LAG_FLAG_ROCE = 1 << 0, - MLX5_LAG_FLAG_SRIOV = 1 << 1, - MLX5_LAG_FLAG_MULTIPATH = 1 << 2, - MLX5_LAG_FLAG_READY = 1 << 3, - MLX5_LAG_FLAG_HASH_BASED = 1 << 4, + MLX5_LAG_FLAG_NDEVS_READY, }; -#define MLX5_LAG_MODE_FLAGS (MLX5_LAG_FLAG_ROCE | MLX5_LAG_FLAG_SRIOV |\ - MLX5_LAG_FLAG_MULTIPATH | \ - MLX5_LAG_FLAG_HASH_BASED) +enum { + MLX5_LAG_MODE_FLAG_HASH_BASED, + MLX5_LAG_MODE_FLAG_SHARED_FDB, + MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, +}; + +enum mlx5_lag_mode { + MLX5_LAG_MODE_NONE, + MLX5_LAG_MODE_ROCE, + MLX5_LAG_MODE_SRIOV, + MLX5_LAG_MODE_MULTIPATH, + MLX5_LAG_MODE_MPESW, +}; struct lag_func { struct mlx5_core_dev *dev; struct net_device *netdev; + bool has_drop; }; /* Used for collection of netdev event info. */ @@ -35,6 +46,7 @@ struct lag_tracker { enum netdev_lag_tx_type tx_type; struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; unsigned int is_bonded:1; + unsigned int has_inactive:1; enum netdev_lag_hash hash_type; }; @@ -42,20 +54,37 @@ struct lag_tracker { * It serves both its phys functions. */ struct mlx5_lag { - u8 flags; + enum mlx5_lag_mode mode; + unsigned long mode_flags; + unsigned long state_flags; + u8 ports; + u8 buckets; int mode_changes_in_progress; - bool shared_fdb; - u8 v2p_map[MLX5_MAX_PORTS]; + u8 v2p_map[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS]; struct kref ref; struct lag_func pf[MLX5_MAX_PORTS]; struct lag_tracker tracker; struct workqueue_struct *wq; struct delayed_work bond_work; + struct work_struct mpesw_work; struct notifier_block nb; struct lag_mp lag_mp; struct mlx5_lag_port_sel port_sel; + /* Protect lag fields/state changes */ + struct mutex lock; + struct lag_mpesw lag_mpesw; }; +static inline bool mlx5_is_lag_supported(struct mlx5_core_dev *dev) +{ + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + !MLX5_CAP_GEN(dev, lag_master) || + MLX5_CAP_GEN(dev, num_lag_ports) < 2 || + MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS) + return false; + return true; +} + static inline struct mlx5_lag * mlx5_lag_dev(struct mlx5_core_dev *dev) { @@ -65,22 +94,33 @@ mlx5_lag_dev(struct mlx5_core_dev *dev) static inline bool __mlx5_lag_is_active(struct mlx5_lag *ldev) { - return !!(ldev->flags & MLX5_LAG_MODE_FLAGS); + return ldev->mode != MLX5_LAG_MODE_NONE; } static inline bool mlx5_lag_is_ready(struct mlx5_lag *ldev) { - return ldev->flags & MLX5_LAG_FLAG_READY; + return test_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags); } void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker); int mlx5_activate_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker, - u8 flags, + enum mlx5_lag_mode mode, bool shared_fdb); int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, struct net_device *ndev); +bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev); +void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev); +int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev); + +char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags); +void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, + u8 *ports, int *num_enabled); + +void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev); +void mlx5_ldev_remove_debugfs(struct dentry *dbg); +void mlx5_disable_lag(struct mlx5_lag *ldev); #endif /* __MLX5_LAG_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index bf4d3cbefa63..0259a149a64c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -11,7 +11,7 @@ static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev) { - return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH); + return ldev->mode == MLX5_LAG_MODE_MULTIPATH; } static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) @@ -50,7 +50,7 @@ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, enum mlx5_lag_port_affinity port) { - struct lag_tracker tracker; + struct lag_tracker tracker = {}; if (!__mlx5_lag_is_multipath(ldev)) return; @@ -100,6 +100,14 @@ static void mlx5_lag_fib_event_flush(struct notifier_block *nb) flush_workqueue(mp->wq); } +static void mlx5_lag_fib_set(struct lag_mp *mp, struct fib_info *fi, u32 dst, int dst_len) +{ + mp->fib.mfi = fi; + mp->fib.priority = fi->fib_priority; + mp->fib.dst = dst; + mp->fib.dst_len = dst_len; +} + struct mlx5_fib_event_work { struct work_struct work; struct mlx5_lag *ldev; @@ -110,10 +118,10 @@ struct mlx5_fib_event_work { }; }; -static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, - unsigned long event, - struct fib_info *fi) +static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, + struct fib_entry_notifier_info *fen_info) { + struct fib_info *fi = fen_info->fi; struct lag_mp *mp = &ldev->lag_mp; struct fib_nh *fib_nh0, *fib_nh1; unsigned int nhs; @@ -121,11 +129,17 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, /* Handle delete event */ if (event == FIB_EVENT_ENTRY_DEL) { /* stop track */ - if (mp->mfi == fi) - mp->mfi = NULL; + if (mp->fib.mfi == fi) + mp->fib.mfi = NULL; return; } + /* Handle multipath entry with lower priority value */ + if (mp->fib.mfi && mp->fib.mfi != fi && + (mp->fib.dst != fen_info->dst || mp->fib.dst_len != fen_info->dst_len) && + fi->fib_priority >= mp->fib.priority) + return; + /* Handle add/replace event */ nhs = fib_info_num_path(fi); if (nhs == 1) { @@ -135,12 +149,13 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev); if (i < 0) - i = MLX5_LAG_NORMAL_AFFINITY; - else - ++i; + return; + i++; mlx5_lag_set_port_affinity(ldev, i); + mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); } + return; } @@ -160,15 +175,15 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, } /* First time we see multipath route */ - if (!mp->mfi && !__mlx5_lag_is_active(ldev)) { + if (!mp->fib.mfi && !__mlx5_lag_is_active(ldev)) { struct lag_tracker tracker; tracker = ldev->tracker; - mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH, false); + mlx5_activate_lag(ldev, &tracker, MLX5_LAG_MODE_MULTIPATH, false); } mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY); - mp->mfi = fi; + mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); } static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, @@ -179,7 +194,7 @@ static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, struct lag_mp *mp = &ldev->lag_mp; /* Check the nh event is related to the route */ - if (!mp->mfi || mp->mfi != fi) + if (!mp->fib.mfi || mp->fib.mfi != fi) return; /* nh added/removed */ @@ -209,7 +224,7 @@ static void mlx5_lag_fib_update(struct work_struct *work) case FIB_EVENT_ENTRY_REPLACE: case FIB_EVENT_ENTRY_DEL: mlx5_lag_fib_route_event(ldev, fib_work->event, - fib_work->fen_info.fi); + &fib_work->fen_info); fib_info_put(fib_work->fen_info.fi); break; case FIB_EVENT_NH_ADD: @@ -268,10 +283,8 @@ static int mlx5_lag_fib_event(struct notifier_block *nb, fen_info = container_of(info, struct fib_entry_notifier_info, info); fi = fen_info->fi; - if (fi->nh) { - NL_SET_ERR_MSG_MOD(info->extack, "IPv4 route with nexthop objects is not supported"); - return notifier_from_errno(-EINVAL); - } + if (fi->nh) + return NOTIFY_DONE; fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; if (fib_dev != ldev->pf[MLX5_LAG_P1].netdev && fib_dev != ldev->pf[MLX5_LAG_P2].netdev) { @@ -310,7 +323,7 @@ void mlx5_lag_mp_reset(struct mlx5_lag *ldev) /* Clear mfi, as it might become stale when a route delete event * has been missed, see mlx5_lag_fib_route_event(). */ - ldev->lag_mp.mfi = NULL; + ldev->lag_mp.fib.mfi = NULL; } int mlx5_lag_mp_init(struct mlx5_lag *ldev) @@ -321,7 +334,7 @@ int mlx5_lag_mp_init(struct mlx5_lag *ldev) /* always clear mfi, as it might become stale when a route delete event * has been missed */ - mp->mfi = NULL; + mp->fib.mfi = NULL; if (mp->fib_nb.notifier_call) return 0; @@ -351,5 +364,5 @@ void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) unregister_fib_notifier(&init_net, &mp->fib_nb); destroy_workqueue(mp->wq); mp->fib_nb.notifier_call = NULL; - mp->mfi = NULL; + mp->fib.mfi = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h index 57af962cad29..056a066da604 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h @@ -15,7 +15,12 @@ enum mlx5_lag_port_affinity { struct lag_mp { struct notifier_block fib_nb; - struct fib_info *mfi; /* used in tracking fib events */ + struct { + const void *mfi; /* used in tracking fib events */ + u32 priority; + u32 dst; + int dst_len; + } fib; struct workqueue_struct *wq; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c new file mode 100644 index 000000000000..f643202b29c6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include <linux/netdevice.h> +#include <net/nexthop.h> +#include "lag/lag.h" +#include "eswitch.h" +#include "lib/mlx5.h" + +void mlx5_mpesw_work(struct work_struct *work) +{ + struct mlx5_lag *ldev = container_of(work, struct mlx5_lag, mpesw_work); + + mutex_lock(&ldev->lock); + mlx5_disable_lag(ldev); + mutex_unlock(&ldev->lock); +} + +static void mlx5_lag_disable_mpesw(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = dev->priv.lag; + + if (!queue_work(ldev->wq, &ldev->mpesw_work)) + mlx5_core_warn(dev, "failed to queue work\n"); +} + +void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = dev->priv.lag; + + if (!ldev) + return; + + mutex_lock(&ldev->lock); + if (!atomic_dec_return(&ldev->lag_mpesw.mpesw_rule_count) && + ldev->mode == MLX5_LAG_MODE_MPESW) + mlx5_lag_disable_mpesw(dev); + mutex_unlock(&ldev->lock); +} + +int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = dev->priv.lag; + int err = 0; + + if (!ldev) + return 0; + + mutex_lock(&ldev->lock); + if (atomic_add_return(1, &ldev->lag_mpesw.mpesw_rule_count) != 1) + goto out; + + if (ldev->mode != MLX5_LAG_MODE_NONE) { + err = -EINVAL; + goto out; + } + + err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, false); + if (err) + mlx5_core_warn(dev, "Failed to create LAG in MPESW mode (%d)\n", err); + +out: + mutex_unlock(&ldev->lock); + return err; +} + +int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev) +{ + struct mlx5_lag *ldev = mdev->priv.lag; + + if (!netif_is_bond_master(out_dev) || !ldev) + return 0; + + mutex_lock(&ldev->lock); + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + mutex_unlock(&ldev->lock); + return -EOPNOTSUPP; + } + mutex_unlock(&ldev->lock); + return 0; +} + +bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev) +{ + bool ret; + + ret = dev->priv.lag && dev->priv.lag->mode == MLX5_LAG_MODE_MPESW; + return ret; +} + +void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) +{ + INIT_WORK(&ldev->mpesw_work, mlx5_mpesw_work); + atomic_set(&ldev->lag_mpesw.mpesw_rule_count, 0); +} + +void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev) +{ + cancel_delayed_work_sync(&ldev->bond_work); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h new file mode 100644 index 000000000000..be4abcb8fcd5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_LAG_MPESW_H__ +#define __MLX5_LAG_MPESW_H__ + +#include "lag.h" +#include "mlx5_core.h" + +struct lag_mpesw { + struct work_struct mpesw_work; + atomic_t mpesw_rule_count; +}; + +void mlx5_mpesw_work(struct work_struct *work); +int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev); +bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev); +#if IS_ENABLED(CONFIG_MLX5_ESWITCH) +void mlx5_lag_mpesw_init(struct mlx5_lag *ldev); +void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev); +#else +static inline void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) {} +static inline void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev) {} +#endif + +#endif /* __MLX5_LAG_MPESW_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index a6592f9c3c05..d3a3fe4ce670 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -12,7 +12,8 @@ enum { static struct mlx5_flow_group * mlx5_create_hash_flow_group(struct mlx5_flow_table *ft, - struct mlx5_flow_definer *definer) + struct mlx5_flow_definer *definer, + u8 rules) { int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct mlx5_flow_group *fg; @@ -25,7 +26,7 @@ mlx5_create_hash_flow_group(struct mlx5_flow_table *ft, MLX5_SET(create_flow_group_in, in, match_definer_id, mlx5_get_match_definer_id(definer)); MLX5_SET(create_flow_group_in, in, start_flow_index, 0); - MLX5_SET(create_flow_group_in, in, end_flow_index, MLX5_MAX_PORTS - 1); + MLX5_SET(create_flow_group_in, in, end_flow_index, rules - 1); MLX5_SET(create_flow_group_in, in, group_type, MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_HASH_SPLIT); @@ -36,7 +37,7 @@ mlx5_create_hash_flow_group(struct mlx5_flow_table *ft, static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, struct mlx5_lag_definer *lag_definer, - u8 port1, u8 port2) + u8 *ports) { struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_flow_table_attr ft_attr = {}; @@ -44,8 +45,10 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_namespace *ns; int err, i; + int idx; + int j; - ft_attr.max_fte = MLX5_MAX_PORTS; + ft_attr.max_fte = ldev->ports * ldev->buckets; ft_attr.level = MLX5_LAG_FT_LEVEL_DEFINER; ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_PORT_SEL); @@ -61,7 +64,8 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, } lag_definer->fg = mlx5_create_hash_flow_group(lag_definer->ft, - lag_definer->definer); + lag_definer->definer, + ft_attr.max_fte); if (IS_ERR(lag_definer->fg)) { err = PTR_ERR(lag_definer->fg); goto destroy_ft; @@ -70,19 +74,25 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.flags |= FLOW_ACT_NO_APPEND; - for (i = 0; i < MLX5_MAX_PORTS; i++) { - u8 affinity = i == 0 ? port1 : port2; - - dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[affinity - 1].dev, - vhca_id); - lag_definer->rules[i] = mlx5_add_flow_rules(lag_definer->ft, - NULL, &flow_act, - &dest, 1); - if (IS_ERR(lag_definer->rules[i])) { - err = PTR_ERR(lag_definer->rules[i]); - while (i--) - mlx5_del_flow_rules(lag_definer->rules[i]); - goto destroy_fg; + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + u8 affinity; + + idx = i * ldev->buckets + j; + affinity = ports[idx]; + + dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[affinity - 1].dev, + vhca_id); + lag_definer->rules[idx] = mlx5_add_flow_rules(lag_definer->ft, + NULL, &flow_act, + &dest, 1); + if (IS_ERR(lag_definer->rules[idx])) { + err = PTR_ERR(lag_definer->rules[idx]); + while (i--) + while (j--) + mlx5_del_flow_rules(lag_definer->rules[idx]); + goto destroy_fg; + } } } @@ -279,8 +289,7 @@ static int mlx5_lag_set_definer(u32 *match_definer_mask, static struct mlx5_lag_definer * mlx5_lag_create_definer(struct mlx5_lag *ldev, enum netdev_lag_hash hash, - enum mlx5_traffic_types tt, bool tunnel, u8 port1, - u8 port2) + enum mlx5_traffic_types tt, bool tunnel, u8 *ports) { struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_lag_definer *lag_definer; @@ -308,7 +317,7 @@ mlx5_lag_create_definer(struct mlx5_lag *ldev, enum netdev_lag_hash hash, goto free_mask; } - err = mlx5_lag_create_port_sel_table(ldev, lag_definer, port1, port2); + err = mlx5_lag_create_port_sel_table(ldev, lag_definer, ports); if (err) goto destroy_match_definer; @@ -329,10 +338,16 @@ static void mlx5_lag_destroy_definer(struct mlx5_lag *ldev, struct mlx5_lag_definer *lag_definer) { struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int idx; int i; + int j; - for (i = 0; i < MLX5_MAX_PORTS; i++) - mlx5_del_flow_rules(lag_definer->rules[i]); + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + mlx5_del_flow_rules(lag_definer->rules[idx]); + } + } mlx5_destroy_flow_group(lag_definer->fg); mlx5_destroy_flow_table(lag_definer->ft); mlx5_destroy_match_definer(dev, lag_definer->definer); @@ -356,7 +371,7 @@ static void mlx5_lag_destroy_definers(struct mlx5_lag *ldev) static int mlx5_lag_create_definers(struct mlx5_lag *ldev, enum netdev_lag_hash hash_type, - u8 port1, u8 port2) + u8 *ports) { struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; struct mlx5_lag_definer *lag_definer; @@ -364,7 +379,7 @@ static int mlx5_lag_create_definers(struct mlx5_lag *ldev, for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { lag_definer = mlx5_lag_create_definer(ldev, hash_type, tt, - false, port1, port2); + false, ports); if (IS_ERR(lag_definer)) { err = PTR_ERR(lag_definer); goto destroy_definers; @@ -376,7 +391,7 @@ static int mlx5_lag_create_definers(struct mlx5_lag *ldev, lag_definer = mlx5_lag_create_definer(ldev, hash_type, tt, - true, port1, port2); + true, ports); if (IS_ERR(lag_definer)) { err = PTR_ERR(lag_definer); goto destroy_definers; @@ -505,7 +520,7 @@ static int mlx5_lag_create_inner_ttc_table(struct mlx5_lag *ldev) struct ttc_params ttc_params = {}; mlx5_lag_set_inner_ttc_params(ldev, &ttc_params); - port_sel->inner.ttc = mlx5_create_ttc_table(dev, &ttc_params); + port_sel->inner.ttc = mlx5_create_inner_ttc_table(dev, &ttc_params); if (IS_ERR(port_sel->inner.ttc)) return PTR_ERR(port_sel->inner.ttc); @@ -513,13 +528,13 @@ static int mlx5_lag_create_inner_ttc_table(struct mlx5_lag *ldev) } int mlx5_lag_port_sel_create(struct mlx5_lag *ldev, - enum netdev_lag_hash hash_type, u8 port1, u8 port2) + enum netdev_lag_hash hash_type, u8 *ports) { struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; int err; set_tt_map(port_sel, hash_type); - err = mlx5_lag_create_definers(ldev, hash_type, port1, port2); + err = mlx5_lag_create_definers(ldev, hash_type, ports); if (err) return err; @@ -543,52 +558,62 @@ destroy_definers: return err; } -static int -mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, - struct mlx5_lag_definer **definers, - u8 port1, u8 port2) +static int __mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, + struct mlx5_lag_definer *def, + u8 *ports) { - struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; struct mlx5_flow_destination dest = {}; + int idx; int err; - int tt; + int i; + int j; dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; - for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { - struct mlx5_flow_handle **rules = definers[tt]->rules; + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + if (ldev->v2p_map[i] == ports[i]) + continue; - if (ldev->v2p_map[MLX5_LAG_P1] != port1) { - dest.vport.vhca_id = - MLX5_CAP_GEN(ldev->pf[port1 - 1].dev, vhca_id); - err = mlx5_modify_rule_destination(rules[MLX5_LAG_P1], - &dest, NULL); + dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev, + vhca_id); + err = mlx5_modify_rule_destination(def->rules[idx], &dest, NULL); if (err) return err; } + } - if (ldev->v2p_map[MLX5_LAG_P2] != port2) { - dest.vport.vhca_id = - MLX5_CAP_GEN(ldev->pf[port2 - 1].dev, vhca_id); - err = mlx5_modify_rule_destination(rules[MLX5_LAG_P2], - &dest, NULL); - if (err) - return err; - } + return 0; +} + +static int +mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, + struct mlx5_lag_definer **definers, + u8 *ports) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + int err; + int tt; + + for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { + err = __mlx5_lag_modify_definers_destinations(ldev, definers[tt], ports); + if (err) + return err; } return 0; } -int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 port1, u8 port2) +int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports) { struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; int err; err = mlx5_lag_modify_definers_destinations(ldev, port_sel->outer.definers, - port1, port2); + ports); if (err) return err; @@ -597,7 +622,7 @@ int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 port1, u8 port2) return mlx5_lag_modify_definers_destinations(ldev, port_sel->inner.definers, - port1, port2); + ports); } void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h index 6d15b28a42fc..5ec3af2a3ecd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h @@ -10,7 +10,10 @@ struct mlx5_lag_definer { struct mlx5_flow_definer *definer; struct mlx5_flow_table *ft; struct mlx5_flow_group *fg; - struct mlx5_flow_handle *rules[MLX5_MAX_PORTS]; + /* Each port has ldev->buckets number of rules and they are arrange in + * [port * buckets .. port * buckets + buckets) locations + */ + struct mlx5_flow_handle *rules[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS]; }; struct mlx5_lag_ttc { @@ -27,22 +30,20 @@ struct mlx5_lag_port_sel { #ifdef CONFIG_MLX5_ESWITCH -int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 port1, u8 port2); +int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports); void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev); int mlx5_lag_port_sel_create(struct mlx5_lag *ldev, - enum netdev_lag_hash hash_type, u8 port1, - u8 port2); + enum netdev_lag_hash hash_type, u8 *ports); #else /* CONFIG_MLX5_ESWITCH */ static inline int mlx5_lag_port_sel_create(struct mlx5_lag *ldev, enum netdev_lag_hash hash_type, - u8 port1, u8 port2) + u8 *ports) { return 0; } -static inline int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 port1, - u8 port2) +static inline int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports) { return 0; } |