19 files changed, 675 insertions, 236 deletions
diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst
index 38e94ed65936..29ad304e6fba 100644
--- a/Documentation/networking/devlink/mlx5.rst
+++ b/Documentation/networking/devlink/mlx5.rst
@@ -17,6 +17,7 @@ Parameters
      - Validation
    * - ``enable_roce``
      - driverinit
+     - Type: Boolean
    * - ``io_eq_size``
      - driverinit
      - The range is between 64 and 4096.
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 91eb615b89ee..86842cd580ba 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1541,16 +1541,10 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 
 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
 		.nent = MLX5_IB_NUM_PF_EQE,
 	};
 	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
-	if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
-		err = -ENOMEM;
-		goto err_wq;
-	}
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
-	free_cpumask_var(param.affinity);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 33904bc87efa..fcfd38fa9e6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -109,7 +109,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
 #
 # SF device
 #
-mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o
+mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_affinity.o
 
 #
 # SF manager
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c5f959a9e14b..812e6810cb3b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -984,7 +984,7 @@ struct mlx5e_profile {
 };
 
 #define mlx5e_profile_feature_cap(profile, feature)	\
-	((profile)->features & (MLX5E_PROFILE_FEATURE_## feature))
+	((profile)->features & BIT(MLX5E_PROFILE_FEATURE_##feature))
 
 void mlx5e_build_ptys2ethtool_map(void);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
index 074ffa4fa5af..b4f3bd7d346e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -120,14 +120,14 @@ static void mlx5e_hv_vhca_stats_cleanup(struct mlx5_hv_vhca_agent *agent)
 	cancel_delayed_work_sync(&priv->stats_agent.work);
 }
 
-int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 {
 	int buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
 	struct mlx5_hv_vhca_agent *agent;
 
 	priv->stats_agent.buf = kvzalloc(buf_len, GFP_KERNEL);
 	if (!priv->stats_agent.buf)
-		return -ENOMEM;
+		return;
 
 	agent = mlx5_hv_vhca_agent_create(priv->mdev->hv_vhca,
 					  MLX5_HV_VHCA_AGENT_STATS,
@@ -142,13 +142,11 @@ int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 				    PTR_ERR(agent));
 
 		kvfree(priv->stats_agent.buf);
-		return IS_ERR_OR_NULL(agent);
+		return;
 	}
 
 	priv->stats_agent.agent = agent;
 	INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work);
-
-	return 0;
 }
 
 void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
index 664463faf77b..29c8c6d3260f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
@@ -7,19 +7,12 @@
 
 #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
 
-int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv);
+void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv);
 void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv);
 
 #else
-
-static inline int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
-{
-	return 0;
-}
-
-static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
-{
-}
+static inline void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv) {}
+static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv) {}
 #endif
 
 #endif /* __MLX5_EN_STATS_VHCA_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
index a0832b86016c..c614fc7fdc9c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
@@ -45,14 +45,10 @@ verify_uplink_forwarding(struct mlx5e_priv *priv,
 					termination_table_raw_traffic)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "devices are both uplink, can't offload forwarding");
-			pr_err("devices %s %s are both uplink, can't offload forwarding\n",
-			       priv->netdev->name, out_dev->name);
 			return -EOPNOTSUPP;
 	} else if (out_dev != rep_priv->netdev) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "devices are not the same uplink, can't offload forwarding");
-		pr_err("devices %s %s are both uplink but not the same, can't offload forwarding\n",
-		       priv->netdev->name, out_dev->name);
 		return -EOPNOTSUPP;
 	}
 	return 0;
@@ -160,10 +156,6 @@ tc_act_can_offload_mirred(struct mlx5e_tc_act_parse_state *parse_state,
 	}
 
 	NL_SET_ERR_MSG_MOD(extack, "devices are not on same switch HW, can't offload forwarding");
-	netdev_warn(priv->netdev,
-		    "devices %s %s not on same switch HW, can't offload forwarding\n",
-		    netdev_name(priv->netdev),
-		    out_dev->name);
 
 	return false;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 536fcb2c5e90..57d755db1cf5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1883,24 +1883,19 @@ static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable,
 				     bool is_rx_cq)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_params new_params;
-	bool mode_changed;
 	u8 cq_period_mode, current_cq_period_mode;
+	struct mlx5e_params new_params;
+
+	if (enable && !MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
+		return -EOPNOTSUPP;
+
+	cq_period_mode = cqe_mode_to_period_mode(enable);
 
-	cq_period_mode = enable ?
-		MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
-		MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 	current_cq_period_mode = is_rx_cq ?
 		priv->channels.params.rx_cq_moderation.cq_period_mode :
 		priv->channels.params.tx_cq_moderation.cq_period_mode;
-	mode_changed = cq_period_mode != current_cq_period_mode;
-
-	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE &&
-	    !MLX5_CAP_GEN(mdev, cq_period_start_from_cqe))
-		return -EOPNOTSUPP;
 
-	if (!mode_changed)
+	if (cq_period_mode == current_cq_period_mode)
 		return 0;
 
 	new_params = priv->channels.params;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 31c911182498..d36489363a26 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3605,11 +3605,6 @@ static int set_feature_hw_gro(struct net_device *netdev, bool enable)
 	new_params = priv->channels.params;
 
 	if (enable) {
-		if (MLX5E_GET_PFLAG(&new_params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
-			netdev_warn(netdev, "Can't set HW-GRO when CQE compress is active\n");
-			err = -EINVAL;
-			goto out;
-		}
 		new_params.packet_merge.type = MLX5E_PACKET_MERGE_SHAMPO;
 		new_params.packet_merge.shampo.match_criteria_type =
 			MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED;
@@ -3871,6 +3866,11 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 		features &= ~NETIF_F_RXHASH;
 		if (netdev->features & NETIF_F_RXHASH)
 			netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n");
+
+		if (features & NETIF_F_GRO_HW) {
+			netdev_warn(netdev, "Disabling HW-GRO, not supported when CQE compress is active\n");
+			features &= ~NETIF_F_GRO_HW;
+		}
 	}
 
 	if (mlx5e_is_uplink_rep(priv))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f09b57c31ed7..96e260fd7987 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1603,6 +1603,12 @@ static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	}
 }
 
+static void mlx5e_handle_rx_err_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	trigger_report(rq, cqe);
+	rq->stats->wqe_err++;
+}
+
 static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1616,8 +1622,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
-		rq->stats->wqe_err++;
+		mlx5e_handle_rx_err_cqe(rq, cqe);
 		goto free_wqe;
 	}
 
@@ -1670,7 +1675,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		rq->stats->wqe_err++;
+		mlx5e_handle_rx_err_cqe(rq, cqe);
 		goto free_wqe;
 	}
 
@@ -1719,8 +1724,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64
 	wi->consumed_strides += cstrides;
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
-		rq->stats->wqe_err++;
+		mlx5e_handle_rx_err_cqe(rq, cqe);
 		goto mpwrq_cqe_out;
 	}
 
@@ -1988,8 +1992,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
 	wi->consumed_strides += cstrides;
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
-		stats->wqe_err++;
+		mlx5e_handle_rx_err_cqe(rq, cqe);
 		goto mpwrq_cqe_out;
 	}
 
@@ -2058,8 +2061,7 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
 	wi->consumed_strides += cstrides;
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
-		trigger_report(rq, cqe);
-		rq->stats->wqe_err++;
+		mlx5e_handle_rx_err_cqe(rq, cqe);
 		goto mpwrq_cqe_out;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 73fcd9fb17dd..7e7c0c1019f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -35,6 +35,7 @@
 #include "en_accel/tls.h"
 #include "en_accel/en_accel.h"
 #include "en/ptp.h"
+#include "en/port.h"
 
 static unsigned int stats_grps_num(struct mlx5e_priv *priv)
 {
@@ -1158,12 +1159,99 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(phy)
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 }
 
-void mlx5e_stats_fec_get(struct mlx5e_priv *priv,
-			 struct ethtool_fec_stats *fec_stats)
+static int fec_num_lanes(struct mlx5_core_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(pmlp_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {};
+	int err;
+
+	MLX5_SET(pmlp_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_PMLP, 0, 0);
+	if (err)
+		return 0;
+
+	return MLX5_GET(pmlp_reg, out, width);
+}
+
+static int fec_active_mode(struct mlx5_core_dev *mdev)
+{
+	unsigned long fec_active_long;
+	u32 fec_active;
+
+	if (mlx5e_get_fec_mode(mdev, &fec_active, NULL))
+		return MLX5E_FEC_NOFEC;
+
+	fec_active_long = fec_active;
+	return find_first_bit(&fec_active_long, sizeof(unsigned long) * BITS_PER_BYTE);
+}
+
+#define MLX5E_STATS_SET_FEC_BLOCK(idx) ({ \
+	fec_stats->corrected_blocks.lanes[(idx)] = \
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, \
+				      fc_fec_corrected_blocks_lane##idx); \
+	fec_stats->uncorrectable_blocks.lanes[(idx)] = \
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, \
+				      fc_fec_uncorrectable_blocks_lane##idx); \
+})
+
+static void fec_set_fc_stats(struct ethtool_fec_stats *fec_stats,
+			     u32 *ppcnt, u8 lanes)
+{
+	if (lanes > 3) { /* 4 lanes */
+		MLX5E_STATS_SET_FEC_BLOCK(3);
+		MLX5E_STATS_SET_FEC_BLOCK(2);
+	}
+	if (lanes > 1) /* 2 lanes */
+		MLX5E_STATS_SET_FEC_BLOCK(1);
+	if (lanes > 0) /* 1 lane */
+		MLX5E_STATS_SET_FEC_BLOCK(0);
+}
+
+static void fec_set_rs_stats(struct ethtool_fec_stats *fec_stats, u32 *ppcnt)
+{
+	fec_stats->corrected_blocks.total =
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs,
+				      rs_fec_corrected_blocks);
+	fec_stats->uncorrectable_blocks.total =
+		MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs,
+				      rs_fec_uncorrectable_blocks);
+}
+
+static void fec_set_block_stats(struct mlx5e_priv *priv,
+				struct ethtool_fec_stats *fec_stats)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 out[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	int mode = fec_active_mode(mdev);
+
+	if (mode == MLX5E_FEC_NOFEC)
+		return;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
+	if (mlx5_core_access_reg(mdev, in, sz, outl, sz, MLX5_REG_PPCNT, 0, 0))
+		return;
+
+	switch (mode) {
+	case MLX5E_FEC_RS_528_514:
+	case MLX5E_FEC_RS_544_514:
+	case MLX5E_FEC_LLRS_272_257_1:
+		fec_set_rs_stats(fec_stats, out);
+		return;
+	case MLX5E_FEC_FIRECODE:
+		fec_set_fc_stats(fec_stats, out, fec_num_lanes(mdev));
+	}
+}
+
+static void fec_set_corrected_bits_total(struct mlx5e_priv *priv,
+					 struct ethtool_fec_stats *fec_stats)
 {
 	u32 ppcnt_phy_statistical[MLX5_ST_SZ_DW(ppcnt_reg)];
 	struct mlx5_core_dev *mdev = priv->mdev;
-	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
 	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 
 	if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group))
@@ -1181,6 +1269,13 @@ void mlx5e_stats_fec_get(struct mlx5e_priv *priv,
 				      phy_corrected_bits);
 }
 
+void mlx5e_stats_fec_get(struct mlx5e_priv *priv,
+			 struct ethtool_fec_stats *fec_stats)
+{
+	fec_set_corrected_bits_total(priv, fec_stats);
+	fec_set_block_stats(priv, fec_stats);
+}
+
 #define PPORT_ETH_EXT_OFF(c) \
 	MLX5_BYTE_OFF(ppcnt_reg, \
 		      counter_set.eth_extended_cntrs_grp_data_layout.c##_high)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index b695aad71ee1..48a45aa54a3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -59,6 +59,8 @@ struct mlx5_eq_table {
 	struct mutex            lock; /* sync async eqs creations */
 	int			num_comp_eqs;
 	struct mlx5_irq_table	*irq_table;
+	struct mlx5_irq         **comp_irqs;
+	struct mlx5_irq         *ctrl_irq;
 #ifdef CONFIG_RFS_ACCEL
 	struct cpu_rmap		*rmap;
 #endif
@@ -266,8 +268,8 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	u8 log_eq_stride = ilog2(MLX5_EQE_SIZE);
 	struct mlx5_priv *priv = &dev->priv;
-	u16 vecidx = param->irq_index;
 	__be64 *pas;
+	u16 vecidx;
 	void *eqc;
 	int inlen;
 	u32 *in;
@@ -289,20 +291,16 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc);
 	init_eq_buf(eq);
 
-	eq->irq = mlx5_irq_request(dev, vecidx, param->affinity);
-	if (IS_ERR(eq->irq)) {
-		err = PTR_ERR(eq->irq);
-		goto err_buf;
-	}
-
+	eq->irq = param->irq;
 	vecidx = mlx5_irq_get_index(eq->irq);
+
 	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
 		MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages;
 
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
 		err = -ENOMEM;
-		goto err_irq;
+		goto err_buf;
 	}
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
@@ -346,8 +344,6 @@ err_eq:
 err_in:
 	kvfree(in);
 
-err_irq:
-	mlx5_irq_release(eq->irq);
 err_buf:
 	mlx5_frag_buf_free(dev, &eq->frag_buf);
 	return err;
@@ -401,7 +397,6 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
 			       eq->eqn);
-	mlx5_irq_release(eq->irq);
 
 	mlx5_frag_buf_free(dev, &eq->frag_buf);
 	return err;
@@ -594,11 +589,8 @@ setup_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq_async *eq,
 
 	eq->irq_nb.notifier_call = mlx5_eq_async_int;
 	spin_lock_init(&eq->lock);
-	if (!zalloc_cpumask_var(&param->affinity, GFP_KERNEL))
-		return -ENOMEM;
 
 	err = create_async_eq(dev, &eq->core, param);
-	free_cpumask_var(param->affinity);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create %s EQ %d\n", name, err);
 		return err;
@@ -643,11 +635,18 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_param param = {};
 	int err;
 
+	/* All the async_eqs are using single IRQ, request one IRQ and share its
+	 * index among all the async_eqs of this device.
+	 */
+	table->ctrl_irq = mlx5_ctrl_irq_request(dev);
+	if (IS_ERR(table->ctrl_irq))
+		return PTR_ERR(table->ctrl_irq);
+
 	MLX5_NB_INIT(&table->cq_err_nb, cq_err_event_notifier, CQ_ERROR);
 	mlx5_eq_notifier_register(dev, &table->cq_err_nb);
 
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
+		.irq = table->ctrl_irq,
 		.nent = MLX5_NUM_CMD_EQE,
 		.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD,
 	};
@@ -660,7 +659,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);
 
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
+		.irq = table->ctrl_irq,
 		.nent = async_eq_depth_devlink_param_get(dev),
 	};
 
@@ -670,7 +669,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 		goto err2;
 
 	param = (struct mlx5_eq_param) {
-		.irq_index = MLX5_IRQ_EQ_CTRL,
+		.irq = table->ctrl_irq,
 		.nent = /* TODO: sriov max_vf + */ 1,
 		.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST,
 	};
@@ -689,6 +688,7 @@ err2:
 err1:
 	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
+	mlx5_ctrl_irq_release(table->ctrl_irq);
 	return err;
 }
 
@@ -703,6 +703,7 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 	cleanup_async_eq(dev, &table->cmd_eq, "cmd");
 	mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL);
 	mlx5_eq_notifier_unregister(dev, &table->cq_err_nb);
+	mlx5_ctrl_irq_release(table->ctrl_irq);
 }
 
 struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev)
@@ -730,12 +731,10 @@ mlx5_eq_create_generic(struct mlx5_core_dev *dev,
 	struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL);
 	int err;
 
-	if (!cpumask_available(param->affinity))
-		return ERR_PTR(-EINVAL);
-
 	if (!eq)
 		return ERR_PTR(-ENOMEM);
 
+	param->irq = dev->priv.eq_table->ctrl_irq;
 	err = create_async_eq(dev, eq, param);
 	if (err) {
 		kvfree(eq);
@@ -795,6 +794,54 @@ void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm)
 }
 EXPORT_SYMBOL(mlx5_eq_update_ci);
 
+static void comp_irqs_release(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+
+	if (mlx5_core_is_sf(dev))
+		mlx5_irq_affinity_irqs_release(dev, table->comp_irqs, table->num_comp_eqs);
+	else
+		mlx5_irqs_release_vectors(table->comp_irqs, table->num_comp_eqs);
+	kfree(table->comp_irqs);
+}
+
+static int comp_irqs_request(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = dev->priv.eq_table;
+	int ncomp_eqs = table->num_comp_eqs;
+	u16 *cpus;
+	int ret;
+	int i;
+
+	ncomp_eqs = table->num_comp_eqs;
+	table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
+	if (!table->comp_irqs)
+		return -ENOMEM;
+	if (mlx5_core_is_sf(dev)) {
+		ret = mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs);
+		if (ret < 0)
+			goto free_irqs;
+		return ret;
+	}
+
+	cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL);
+	if (!cpus) {
+		ret = -ENOMEM;
+		goto free_irqs;
+	}
+	for (i = 0; i < ncomp_eqs; i++)
+		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
+	kfree(cpus);
+	if (ret < 0)
+		goto free_irqs;
+	return ret;
+
+free_irqs:
+	kfree(table->comp_irqs);
+	return ret;
+}
+
 static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
@@ -809,6 +856,7 @@ static void destroy_comp_eqs(struct mlx5_core_dev *dev)
 		tasklet_disable(&eq->tasklet_ctx.task);
 		kfree(eq);
 	}
+	comp_irqs_release(dev);
 }
 
 static u16 comp_eq_depth_devlink_param_get(struct mlx5_core_dev *dev)
@@ -835,12 +883,13 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	int err;
 	int i;
 
+	ncomp_eqs = comp_irqs_request(dev);
+	if (ncomp_eqs < 0)
+		return ncomp_eqs;
 	INIT_LIST_HEAD(&table->comp_eqs_list);
-	ncomp_eqs = table->num_comp_eqs;
 	nent = comp_eq_depth_devlink_param_get(dev);
 	for (i = 0; i < ncomp_eqs; i++) {
 		struct mlx5_eq_param param = {};
-		int vecidx = i;
 
 		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
 		if (!eq) {
@@ -855,18 +904,11 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 
 		eq->irq_nb.notifier_call = mlx5_eq_comp_int;
 		param = (struct mlx5_eq_param) {
-			.irq_index = vecidx,
+			.irq = table->comp_irqs[i],
 			.nent = nent,
 		};
 
-		if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
-			err = -ENOMEM;
-			goto clean_eq;
-		}
-		cpumask_set_cpu(cpumask_local_spread(i, dev->priv.numa_node),
-				param.affinity);
 		err = create_map_eq(dev, &eq->core, &param);
-		free_cpumask_var(param.affinity);
 		if (err)
 			goto clean_eq;
 		err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb);
@@ -880,7 +922,9 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 		list_add_tail(&eq->list, &table->comp_eqs_list);
 	}
 
+	table->num_comp_eqs = ncomp_eqs;
 	return 0;
+
 clean_eq:
 	kfree(eq);
 clean:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c
index 2b52f7c09152..9d17206d1625 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c
@@ -431,7 +431,7 @@ int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
 	int err = 0;
 
 	if (!mlx5_esw_allowed(esw))
-		return -EPERM;
+		return vlan ? -EPERM : 0;
 
 	if (vlan || qos)
 		set_flags = SET_VLAN_STRIP | SET_VLAN_INSERT;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
new file mode 100644
index 000000000000..380a208ab137
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include "mlx5_core.h"
+#include "mlx5_irq.h"
+#include "pci_irq.h"
+
+static void cpu_put(struct mlx5_irq_pool *pool, int cpu)
+{
+	pool->irqs_per_cpu[cpu]--;
+}
+
+static void cpu_get(struct mlx5_irq_pool *pool, int cpu)
+{
+	pool->irqs_per_cpu[cpu]++;
+}
+
+/* Gets the least loaded CPU. e.g.: the CPU with least IRQs bound to it */
+static int cpu_get_least_loaded(struct mlx5_irq_pool *pool,
+				const struct cpumask *req_mask)
+{
+	int best_cpu = -1;
+	int cpu;
+
+	for_each_cpu_and(cpu, req_mask, cpu_online_mask) {
+		/* CPU has zero IRQs on it. No need to search any more CPUs. */
+		if (!pool->irqs_per_cpu[cpu]) {
+			best_cpu = cpu;
+			break;
+		}
+		if (best_cpu < 0)
+			best_cpu = cpu;
+		if (pool->irqs_per_cpu[cpu] < pool->irqs_per_cpu[best_cpu])
+			best_cpu = cpu;
+	}
+	if (best_cpu == -1) {
+		/* There isn't online CPUs in req_mask */
+		mlx5_core_err(pool->dev, "NO online CPUs in req_mask (%*pbl)\n",
+			      cpumask_pr_args(req_mask));
+		best_cpu = cpumask_first(cpu_online_mask);
+	}
+	pool->irqs_per_cpu[best_cpu]++;
+	return best_cpu;
+}
+
+/* Creating an IRQ from irq_pool */
+static struct mlx5_irq *
+irq_pool_request_irq(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	cpumask_var_t auto_mask;
+	struct mlx5_irq *irq;
+	u32 irq_index;
+	int err;
+
+	if (!zalloc_cpumask_var(&auto_mask, GFP_KERNEL))
+		return ERR_PTR(-ENOMEM);
+	err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs, GFP_KERNEL);
+	if (err)
+		return ERR_PTR(err);
+	if (pool->irqs_per_cpu) {
+		if (cpumask_weight(req_mask) > 1)
+			/* if req_mask contain more then one CPU, set the least loadad CPU
+			 * of req_mask
+			 */
+			cpumask_set_cpu(cpu_get_least_loaded(pool, req_mask), auto_mask);
+		else
+			cpu_get(pool, cpumask_first(req_mask));
+	}
+	irq = mlx5_irq_alloc(pool, irq_index, cpumask_empty(auto_mask) ? req_mask : auto_mask);
+	free_cpumask_var(auto_mask);
+	return irq;
+}
+
+/* Looking for the IRQ with the smallest refcount that fits req_mask.
+ * If pool is sf_comp_pool, then we are looking for an IRQ with any of the
+ * requested CPUs in req_mask.
+ * for example: req_mask = 0xf, irq0_mask = 0x10, irq1_mask = 0x1. irq0_mask
+ * isn't subset of req_mask, so we will skip it. irq1_mask is subset of req_mask,
+ * we don't skip it.
+ * If pool is sf_ctrl_pool, then all IRQs have the same mask, so any IRQ will
+ * fit. And since mask is subset of itself, we will pass the first if bellow.
+ */
+static struct mlx5_irq *
+irq_pool_find_least_loaded(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	int start = pool->xa_num_irqs.min;
+	int end = pool->xa_num_irqs.max;
+	struct mlx5_irq *irq = NULL;
+	struct mlx5_irq *iter;
+	int irq_refcount = 0;
+	unsigned long index;
+
+	lockdep_assert_held(&pool->lock);
+	xa_for_each_range(&pool->irqs, index, iter, start, end) {
+		struct cpumask *iter_mask = mlx5_irq_get_affinity_mask(iter);
+		int iter_refcount = mlx5_irq_read_locked(iter);
+
+		if (!cpumask_subset(iter_mask, req_mask))
+			/* skip IRQs with a mask which is not subset of req_mask */
+			continue;
+		if (iter_refcount < pool->min_threshold)
+			/* If we found an IRQ with less than min_thres, return it */
+			return iter;
+		if (!irq || iter_refcount < irq_refcount) {
+			/* In case we won't find an IRQ with less than min_thres,
+			 * keep a pointer to the least used IRQ
+			 */
+			irq_refcount = iter_refcount;
+			irq = iter;
+		}
+	}
+	return irq;
+}
+
+/**
+ * mlx5_irq_affinity_request - request an IRQ according to the given mask.
+ * @pool: IRQ pool to request from.
+ * @req_mask: cpumask requested for this IRQ.
+ *
+ * This function returns a pointer to IRQ, or ERR_PTR in case of error.
+ */
+struct mlx5_irq *
+mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	struct mlx5_irq *least_loaded_irq, *new_irq;
+
+	mutex_lock(&pool->lock);
+	least_loaded_irq = irq_pool_find_least_loaded(pool, req_mask);
+	if (least_loaded_irq &&
+	    mlx5_irq_read_locked(least_loaded_irq) < pool->min_threshold)
+		goto out;
+	/* We didn't find an IRQ with less than min_thres, try to allocate a new IRQ */
+	new_irq = irq_pool_request_irq(pool, req_mask);
+	if (IS_ERR(new_irq)) {
+		if (!least_loaded_irq) {
+			/* We failed to create an IRQ and we didn't find an IRQ */
+			mlx5_core_err(pool->dev, "Didn't find a matching IRQ. err = %ld\n",
+				      PTR_ERR(new_irq));
+			mutex_unlock(&pool->lock);
+			return new_irq;
+		}
+		/* We failed to create a new IRQ for the requested affinity,
+		 * sharing existing IRQ.
+		 */
+		goto out;
+	}
+	least_loaded_irq = new_irq;
+	goto unlock;
+out:
+	mlx5_irq_get_locked(least_loaded_irq);
+	if (mlx5_irq_read_locked(least_loaded_irq) > pool->max_threshold)
+		mlx5_core_dbg(pool->dev, "IRQ %u overloaded, pool_name: %s, %u EQs on this irq\n",
+			      pci_irq_vector(pool->dev->pdev,
+					     mlx5_irq_get_index(least_loaded_irq)), pool->name,
+			      mlx5_irq_read_locked(least_loaded_irq) / MLX5_EQ_REFS_PER_IRQ);
+unlock:
+	mutex_unlock(&pool->lock);
+	return least_loaded_irq;
+}
+
+void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+				    int num_irqs)
+{
+	struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+	int i;
+
+	for (i = 0; i < num_irqs; i++) {
+		int cpu = cpumask_first(mlx5_irq_get_affinity_mask(irqs[i]));
+
+		synchronize_irq(pci_irq_vector(pool->dev->pdev,
+					       mlx5_irq_get_index(irqs[i])));
+		if (mlx5_irq_put(irqs[i]))
+			if (pool->irqs_per_cpu)
+				cpu_put(pool, cpu);
+	}
+}
+
+/**
+ * mlx5_irq_affinity_irqs_request_auto - request one or more IRQs for mlx5 device.
+ * @dev: mlx5 device that is requesting the IRQs.
+ * @nirqs: number of IRQs to request.
+ * @irqs: an output array of IRQs pointers.
+ *
+ * Each IRQ is bounded to at most 1 CPU.
+ * This function is requesting IRQs according to the default assignment.
+ * The default assignment policy is:
+ * - in each iteration, request the least loaded IRQ which is not bound to any
+ *   CPU of the previous IRQs requested.
+ *
+ * This function returns the number of IRQs requested, (which might be smaller than
+ * @nirqs), if successful, or a negative error code in case of an error.
+ */
+int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
+					struct mlx5_irq **irqs)
+{
+	struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+	cpumask_var_t req_mask;
+	struct mlx5_irq *irq;
+	int i = 0;
+
+	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_copy(req_mask, cpu_online_mask);
+	for (i = 0; i < nirqs; i++) {
+		if (mlx5_irq_pool_is_sf_pool(pool))
+			irq = mlx5_irq_affinity_request(pool, req_mask);
+		else
+			/* In case SF pool doesn't exists, fallback to the PF IRQs.
+			 * The PF IRQs are already allocated and binded to CPU
+			 * at this point. Hence, only an index is needed.
+			 */
+			irq = mlx5_irq_request(dev, i, NULL);
+		if (IS_ERR(irq))
+			break;
+		irqs[i] = irq;
+		cpumask_clear_cpu(cpumask_first(mlx5_irq_get_affinity_mask(irq)), req_mask);
+		mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
+			      pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
+			      cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
+			      mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
+	}
+	free_cpumask_var(req_mask);
+	if (!i)
+		return PTR_ERR(irq);
+	return i;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6b225bb5751a..2c774f367199 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -98,6 +98,8 @@ enum {
 	MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1,
 };
 
+#define LOG_MAX_SUPPORTED_QPS 0xff
+
 static struct mlx5_profile profile[] = {
 	[0] = {
 		.mask           = 0,
@@ -109,7 +111,7 @@ static struct mlx5_profile profile[] = {
 	[2] = {
 		.mask		= MLX5_PROF_MASK_QP_SIZE |
 				  MLX5_PROF_MASK_MR_CACHE,
-		.log_max_qp	= 18,
+		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
 		.mr_cache[0]	= {
 			.size	= 500,
 			.limit	= 250
@@ -523,7 +525,9 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 		 to_fw_pkey_sz(dev, 128));
 
 	/* Check log_max_qp from HCA caps to set in current profile */
-	if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) {
+	if (prof->log_max_qp == LOG_MAX_SUPPORTED_QPS) {
+		prof->log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp);
+	} else if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) {
 		mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n",
 			       prof->log_max_qp,
 			       MLX5_CAP_GEN_MAX(dev, log_max_qp));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
index 8116815663a7..23cb63fa4588 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
@@ -22,12 +22,40 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
 			    int msix_vec_count);
 int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
 
+struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev);
+void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq);
 struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
 				  struct cpumask *affinity);
-void mlx5_irq_release(struct mlx5_irq *irq);
+int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
+			      struct mlx5_irq **irqs);
+void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs);
 int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
 int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
 struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq);
 int mlx5_irq_get_index(struct mlx5_irq *irq);
 
+struct mlx5_irq_pool;
+#ifdef CONFIG_MLX5_SF
+int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
+					struct mlx5_irq **irqs);
+struct mlx5_irq *mlx5_irq_affinity_request(struct mlx5_irq_pool *pool,
+					   const struct cpumask *req_mask);
+void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+				    int num_irqs);
+#else
+static inline int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs,
+						      struct mlx5_irq **irqs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline struct mlx5_irq *
+mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev,
+						  struct mlx5_irq **irqs, int num_irqs) {}
+#endif
 #endif /* __MLX5_IRQ_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 163e01fe500e..90fec0649ef5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -7,15 +7,12 @@
 #include <linux/mlx5/driver.h>
 #include "mlx5_core.h"
 #include "mlx5_irq.h"
+#include "pci_irq.h"
 #include "lib/sf.h"
 #ifdef CONFIG_RFS_ACCEL
 #include <linux/cpu_rmap.h>
 #endif
 
-#define MLX5_MAX_IRQ_NAME (32)
-/* max irq_index is 2047, so four chars */
-#define MLX5_MAX_IRQ_IDX_CHARS (4)
-
 #define MLX5_SFS_PER_CTRL_IRQ 64
 #define MLX5_IRQ_CTRL_SF_MAX 8
 /* min num of vectors for SFs to be enabled */
@@ -25,7 +22,6 @@
 #define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX)
 #define MLX5_EQ_SHARE_IRQ_MIN_COMP (1)
 #define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4)
-#define MLX5_EQ_REFS_PER_IRQ (2)
 
 struct mlx5_irq {
 	struct atomic_notifier_head nh;
@@ -37,16 +33,6 @@ struct mlx5_irq {
 	int irqn;
 };
 
-struct mlx5_irq_pool {
-	char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS];
-	struct xa_limit xa_num_irqs;
-	struct mutex lock; /* sync IRQs creations */
-	struct xarray irqs;
-	u32 max_threshold;
-	u32 min_threshold;
-	struct mlx5_core_dev *dev;
-};
-
 struct mlx5_irq_table {
 	struct mlx5_irq_pool *pf_pool;
 	struct mlx5_irq_pool *sf_ctrl_pool;
@@ -153,18 +139,28 @@ static void irq_release(struct mlx5_irq *irq)
 	kfree(irq);
 }
 
-static void irq_put(struct mlx5_irq *irq)
+int mlx5_irq_put(struct mlx5_irq *irq)
 {
 	struct mlx5_irq_pool *pool = irq->pool;
+	int ret = 0;
 
 	mutex_lock(&pool->lock);
 	irq->refcount--;
-	if (!irq->refcount)
+	if (!irq->refcount) {
 		irq_release(irq);
+		ret = 1;
+	}
 	mutex_unlock(&pool->lock);
+	return ret;
+}
+
+int mlx5_irq_read_locked(struct mlx5_irq *irq)
+{
+	lockdep_assert_held(&irq->pool->lock);
+	return irq->refcount;
 }
 
-static int irq_get_locked(struct mlx5_irq *irq)
+int mlx5_irq_get_locked(struct mlx5_irq *irq)
 {
 	lockdep_assert_held(&irq->pool->lock);
 	if (WARN_ON_ONCE(!irq->refcount))
@@ -178,7 +174,7 @@ static int irq_get(struct mlx5_irq *irq)
 	int err;
 
 	mutex_lock(&irq->pool->lock);
-	err = irq_get_locked(irq);
+	err = mlx5_irq_get_locked(irq);
 	mutex_unlock(&irq->pool->lock);
 	return err;
 }
@@ -210,12 +206,8 @@ static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
 	snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx);
 }
 
-static bool irq_pool_is_sf_pool(struct mlx5_irq_pool *pool)
-{
-	return !strncmp("mlx5_sf", pool->name, strlen("mlx5_sf"));
-}
-
-static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i)
+struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
+				const struct cpumask *affinity)
 {
 	struct mlx5_core_dev *dev = pool->dev;
 	char name[MLX5_MAX_IRQ_NAME];
@@ -226,7 +218,7 @@ static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i)
 	if (!irq)
 		return ERR_PTR(-ENOMEM);
 	irq->irqn = pci_irq_vector(dev->pdev, i);
-	if (!irq_pool_is_sf_pool(pool))
+	if (!mlx5_irq_pool_is_sf_pool(pool))
 		irq_set_name(pool, name, i);
 	else
 		irq_sf_set_name(pool, name, i);
@@ -244,6 +236,10 @@ static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i)
 		err = -ENOMEM;
 		goto err_cpumask;
 	}
+	if (affinity) {
+		cpumask_copy(irq->mask, affinity);
+		irq_set_affinity_hint(irq->irqn, irq->mask);
+	}
 	irq->pool = pool;
 	irq->refcount = 1;
 	irq->index = i;
@@ -255,6 +251,7 @@ static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i)
 	}
 	return irq;
 err_xa:
+	irq_set_affinity_hint(irq->irqn, NULL);
 	free_cpumask_var(irq->mask);
 err_cpumask:
 	free_irq(irq->irqn, &irq->nh);
@@ -275,7 +272,7 @@ int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
 		return -ENOENT;
 	ret = atomic_notifier_chain_register(&irq->nh, nb);
 	if (ret)
-		irq_put(irq);
+		mlx5_irq_put(irq);
 	return ret;
 }
 
@@ -284,7 +281,7 @@ int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
 	int err = 0;
 
 	err = atomic_notifier_chain_unregister(&irq->nh, nb);
-	irq_put(irq);
+	mlx5_irq_put(irq);
 	return err;
 }
 
@@ -300,131 +297,121 @@ int mlx5_irq_get_index(struct mlx5_irq *irq)
 
 /* irq_pool API */
 
-/* creating an irq from irq_pool */
-static struct mlx5_irq *irq_pool_create_irq(struct mlx5_irq_pool *pool,
-					    struct cpumask *affinity)
+/* requesting an irq from a given pool according to given index */
+static struct mlx5_irq *
+irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
+			struct cpumask *affinity)
 {
 	struct mlx5_irq *irq;
-	u32 irq_index;
-	int err;
 
-	err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs,
-		       GFP_KERNEL);
-	if (err)
-		return ERR_PTR(err);
-	irq = irq_request(pool, irq_index);
-	if (IS_ERR(irq))
-		return irq;
-	cpumask_copy(irq->mask, affinity);
-	irq_set_affinity_hint(irq->irqn, irq->mask);
+	mutex_lock(&pool->lock);
+	irq = xa_load(&pool->irqs, vecidx);
+	if (irq) {
+		mlx5_irq_get_locked(irq);
+		goto unlock;
+	}
+	irq = mlx5_irq_alloc(pool, vecidx, affinity);
+unlock:
+	mutex_unlock(&pool->lock);
 	return irq;
 }
 
-/* looking for the irq with the smallest refcount and the same affinity */
-static struct mlx5_irq *irq_pool_find_least_loaded(struct mlx5_irq_pool *pool,
-						   struct cpumask *affinity)
+static struct mlx5_irq_pool *sf_ctrl_irq_pool_get(struct mlx5_irq_table *irq_table)
 {
-	int start = pool->xa_num_irqs.min;
-	int end = pool->xa_num_irqs.max;
-	struct mlx5_irq *irq = NULL;
-	struct mlx5_irq *iter;
-	unsigned long index;
+	return irq_table->sf_ctrl_pool;
+}
 
-	lockdep_assert_held(&pool->lock);
-	xa_for_each_range(&pool->irqs, index, iter, start, end) {
-		if (!cpumask_equal(iter->mask, affinity))
-			continue;
-		if (iter->refcount < pool->min_threshold)
-			return iter;
-		if (!irq || iter->refcount < irq->refcount)
-			irq = iter;
-	}
-	return irq;
+static struct mlx5_irq_pool *sf_irq_pool_get(struct mlx5_irq_table *irq_table)
+{
+	return irq_table->sf_comp_pool;
 }
 
-/* requesting an irq from a given pool according to given affinity */
-static struct mlx5_irq *irq_pool_request_affinity(struct mlx5_irq_pool *pool,
-						  struct cpumask *affinity)
+struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev)
 {
-	struct mlx5_irq *least_loaded_irq, *new_irq;
+	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
+	struct mlx5_irq_pool *pool = NULL;
 
-	mutex_lock(&pool->lock);
-	least_loaded_irq = irq_pool_find_least_loaded(pool, affinity);
-	if (least_loaded_irq &&
-	    least_loaded_irq->refcount < pool->min_threshold)
-		goto out;
-	new_irq = irq_pool_create_irq(pool, affinity);
-	if (IS_ERR(new_irq)) {
-		if (!least_loaded_irq) {
-			mlx5_core_err(pool->dev, "Didn't find a matching IRQ. err = %ld\n",
-				      PTR_ERR(new_irq));
-			mutex_unlock(&pool->lock);
-			return new_irq;
-		}
-		/* We failed to create a new IRQ for the requested affinity,
-		 * sharing existing IRQ.
-		 */
-		goto out;
-	}
-	least_loaded_irq = new_irq;
-	goto unlock;
-out:
-	irq_get_locked(least_loaded_irq);
-	if (least_loaded_irq->refcount > pool->max_threshold)
-		mlx5_core_dbg(pool->dev, "IRQ %u overloaded, pool_name: %s, %u EQs on this irq\n",
-			      least_loaded_irq->irqn, pool->name,
-			      least_loaded_irq->refcount / MLX5_EQ_REFS_PER_IRQ);
-unlock:
-	mutex_unlock(&pool->lock);
-	return least_loaded_irq;
+	if (mlx5_core_is_sf(dev))
+		pool = sf_irq_pool_get(irq_table);
+
+	/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
+	 * the PF IRQs pool in case the SF pool doesn't exist.
+	 */
+	return pool ? pool : irq_table->pf_pool;
 }
 
-/* requesting an irq from a given pool according to given index */
-static struct mlx5_irq *
-irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
-			struct cpumask *affinity)
+static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
 {
-	struct mlx5_irq *irq;
+	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
+	struct mlx5_irq_pool *pool = NULL;
 
-	mutex_lock(&pool->lock);
-	irq = xa_load(&pool->irqs, vecidx);
-	if (irq) {
-		irq_get_locked(irq);
-		goto unlock;
+	if (mlx5_core_is_sf(dev))
+		pool = sf_ctrl_irq_pool_get(irq_table);
+
+	/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
+	 * the PF IRQs pool in case the SF pool doesn't exist.
+	 */
+	return pool ? pool : irq_table->pf_pool;
+}
+
+/**
+ * mlx5_irqs_release - release one or more IRQs back to the system.
+ * @irqs: IRQs to be released.
+ * @nirqs: number of IRQs to be released.
+ */
+static void mlx5_irqs_release(struct mlx5_irq **irqs, int nirqs)
+{
+	int i;
+
+	for (i = 0; i < nirqs; i++) {
+		synchronize_irq(irqs[i]->irqn);
+		mlx5_irq_put(irqs[i]);
 	}
-	irq = irq_request(pool, vecidx);
-	if (IS_ERR(irq) || !affinity)
-		goto unlock;
-	cpumask_copy(irq->mask, affinity);
-	if (!irq_pool_is_sf_pool(pool) && !pool->xa_num_irqs.max &&
-	    cpumask_empty(irq->mask))
-		cpumask_set_cpu(cpumask_first(cpu_online_mask), irq->mask);
-	irq_set_affinity_hint(irq->irqn, irq->mask);
-unlock:
-	mutex_unlock(&pool->lock);
-	return irq;
 }
 
-static struct mlx5_irq_pool *find_sf_irq_pool(struct mlx5_irq_table *irq_table,
-					      int i, struct cpumask *affinity)
+/**
+ * mlx5_ctrl_irq_release - release a ctrl IRQ back to the system.
+ * @ctrl_irq: ctrl IRQ to be released.
+ */
+void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq)
 {
-	if (cpumask_empty(affinity) && i == MLX5_IRQ_EQ_CTRL)
-		return irq_table->sf_ctrl_pool;
-	return irq_table->sf_comp_pool;
+	mlx5_irqs_release(&ctrl_irq, 1);
 }
 
 /**
- * mlx5_irq_release - release an IRQ back to the system.
- * @irq: irq to be released.
+ * mlx5_ctrl_irq_request - request a ctrl IRQ for mlx5 device.
+ * @dev: mlx5 device that requesting the IRQ.
+ *
+ * This function returns a pointer to IRQ, or ERR_PTR in case of error.
  */
-void mlx5_irq_release(struct mlx5_irq *irq)
+struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
 {
-	synchronize_irq(irq->irqn);
-	irq_put(irq);
+	struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev);
+	cpumask_var_t req_mask;
+	struct mlx5_irq *irq;
+
+	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
+		return ERR_PTR(-ENOMEM);
+	cpumask_copy(req_mask, cpu_online_mask);
+	if (!mlx5_irq_pool_is_sf_pool(pool)) {
+		/* In case we are allocating a control IRQ for PF/VF */
+		if (!pool->xa_num_irqs.max) {
+			cpumask_clear(req_mask);
+			/* In case we only have a single IRQ for PF/VF */
+			cpumask_set_cpu(cpumask_first(cpu_online_mask), req_mask);
+		}
+		/* Allocate the IRQ in the last index of the pool */
+		irq = irq_pool_request_vector(pool, pool->xa_num_irqs.max, req_mask);
+	} else {
+		irq = mlx5_irq_affinity_request(pool, req_mask);
+	}
+
+	free_cpumask_var(req_mask);
+	return irq;
 }
 
 /**
- * mlx5_irq_request - request an IRQ for mlx5 device.
+ * mlx5_irq_request - request an IRQ for mlx5 PF/VF device.
  * @dev: mlx5 device that requesting the IRQ.
  * @vecidx: vector index of the IRQ. This argument is ignore if affinity is
  * provided.
@@ -439,23 +426,8 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
 	struct mlx5_irq_pool *pool;
 	struct mlx5_irq *irq;
 
-	if (mlx5_core_is_sf(dev)) {
-		pool = find_sf_irq_pool(irq_table, vecidx, affinity);
-		if (!pool)
-			/* we don't have IRQs for SFs, using the PF IRQs */
-			goto pf_irq;
-		if (cpumask_empty(affinity) && !strcmp(pool->name, "mlx5_sf_comp"))
-			/* In case an SF user request IRQ with vecidx */
-			irq = irq_pool_request_vector(pool, vecidx, NULL);
-		else
-			irq = irq_pool_request_affinity(pool, affinity);
-		goto out;
-	}
-pf_irq:
 	pool = irq_table->pf_pool;
-	vecidx = (vecidx == MLX5_IRQ_EQ_CTRL) ? pool->xa_num_irqs.max : vecidx;
 	irq = irq_pool_request_vector(pool, vecidx, affinity);
-out:
 	if (IS_ERR(irq))
 		return irq;
 	mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
@@ -464,6 +436,51 @@ out:
 	return irq;
 }
 
+/**
+ * mlx5_irqs_release_vectors - release one or more IRQs back to the system.
+ * @irqs: IRQs to be released.
+ * @nirqs: number of IRQs to be released.
+ */
+void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs)
+{
+	mlx5_irqs_release(irqs, nirqs);
+}
+
+/**
+ * mlx5_irqs_request_vectors - request one or more IRQs for mlx5 device.
+ * @dev: mlx5 device that is requesting the IRQs.
+ * @cpus: CPUs array for binding the IRQs
+ * @nirqs: number of IRQs to request.
+ * @irqs: an output array of IRQs pointers.
+ *
+ * Each IRQ is bound to at most 1 CPU.
+ * This function is requests nirqs IRQs, starting from @vecidx.
+ *
+ * This function returns the number of IRQs requested, (which might be smaller than
+ * @nirqs), if successful, or a negative error code in case of an error.
+ */
+int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
+			      struct mlx5_irq **irqs)
+{
+	cpumask_var_t req_mask;
+	struct mlx5_irq *irq;
+	int i;
+
+	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
+		return -ENOMEM;
+	for (i = 0; i < nirqs; i++) {
+		cpumask_set_cpu(cpus[i], req_mask);
+		irq = mlx5_irq_request(dev, i, req_mask);
+		if (IS_ERR(irq))
+			break;
+		cpumask_clear(req_mask);
+		irqs[i] = irq;
+	}
+
+	free_cpumask_var(req_mask);
+	return i ? i : PTR_ERR(irq);
+}
+
 static struct mlx5_irq_pool *
 irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
 	       u32 min_threshold, u32 max_threshold)
@@ -500,6 +517,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool)
 		irq_release(irq);
 	xa_destroy(&pool->irqs);
 	mutex_destroy(&pool->lock);
+	kfree(pool->irqs_per_cpu);
 	kvfree(pool);
 }
 
@@ -547,7 +565,17 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
 		err = PTR_ERR(table->sf_comp_pool);
 		goto err_sf_ctrl;
 	}
+
+	table->sf_comp_pool->irqs_per_cpu = kcalloc(nr_cpu_ids, sizeof(u16), GFP_KERNEL);
+	if (!table->sf_comp_pool->irqs_per_cpu) {
+		err = -ENOMEM;
+		goto err_irqs_per_cpu;
+	}
+
 	return 0;
+
+err_irqs_per_cpu:
+	irq_pool_free(table->sf_comp_pool);
 err_sf_ctrl:
 	irq_pool_free(table->sf_ctrl_pool);
 err_pf:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
new file mode 100644
index 000000000000..5c7e68bee43a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#ifndef __PCI_IRQ_H__
+#define __PCI_IRQ_H__
+
+#include <linux/mlx5/driver.h>
+
+#define MLX5_MAX_IRQ_NAME (32)
+/* max irq_index is 2047, so four chars */
+#define MLX5_MAX_IRQ_IDX_CHARS (4)
+#define MLX5_EQ_REFS_PER_IRQ (2)
+
+struct mlx5_irq;
+
+struct mlx5_irq_pool {
+	char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS];
+	struct xa_limit xa_num_irqs;
+	struct mutex lock; /* sync IRQs creations */
+	struct xarray irqs;
+	u32 max_threshold;
+	u32 min_threshold;
+	u16 *irqs_per_cpu;
+	struct mlx5_core_dev *dev;
+};
+
+struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev);
+static inline bool mlx5_irq_pool_is_sf_pool(struct mlx5_irq_pool *pool)
+{
+	return !strncmp("mlx5_sf", pool->name, strlen("mlx5_sf"));
+}
+
+struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
+				const struct cpumask *affinity);
+int mlx5_irq_get_locked(struct mlx5_irq *irq);
+int mlx5_irq_read_locked(struct mlx5_irq *irq);
+int mlx5_irq_put(struct mlx5_irq *irq);
+
+#endif /* __PCI_IRQ_H__ */
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index ea3ff5a8ced3..3705a382276b 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -9,13 +9,13 @@
 #define MLX5_NUM_SPARE_EQE (0x80)
 
 struct mlx5_eq;
+struct mlx5_irq;
 struct mlx5_core_dev;
 
 struct mlx5_eq_param {
-	u8             irq_index;
 	int            nent;
 	u64            mask[4];
-	cpumask_var_t  affinity;
+	struct mlx5_irq *irq;
 };
 
 struct mlx5_eq *