aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2014-06-01 19:17:18 -0700
committerDavid S. Miller <davem@davemloft.net>2014-06-01 19:17:18 -0700
commitb07166b26e6974d5b7072eff75d773ef6f0c5ba3 (patch)
tree89dbe2c36c695f2e4c79429f4362bc07df2cc1de
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next (diff)
parentnet/mlx4_en: Use affinity hint (diff)
downloadlinux-dev-b07166b26e6974d5b7072eff75d773ef6f0c5ba3.tar.xz
linux-dev-b07166b26e6974d5b7072eff75d773ef6f0c5ba3.zip
Merge branch 'mlx4-next'
Amir Vadai says: ==================== cpumask,net: Affinity hint helper function This patchset will set affinity hint to influence IRQs to be allocated on the same NUMA node as the one where the card resides. As discussed in http://www.spinics.net/lists/netdev/msg271497.html If the number of IRQs allocated is greater than the number of local NUMA cores, all local cores will be used first, and the rest of the IRQs will be on a remote NUMA node. If no NUMA support - IRQ's and cores will be mapped 1:1 Since the utility function to calculate the mapping could be useful in other mq drivers in the kernel, it was added to cpumask.[ch] This patchset was tested and applied on top of net-next since the first consumer is a network device (mlx4_en). Over commit 506724c: "tg3: Override clock, link aware and link idle mode during NVRAM dump" I couldn't find a maintainer for cpumask.c, so only added the kernel mailing list Amir Changes from V5: - Moved the utility function from kernel/irq/manage.c to lib/cpumask.c, and renamed it's name accordingly to cpumask_set_cpu_local_first() - Added some comments as Thomas Gleixner suggested - Changed -EINVAL to -EAGAIN, that describes the error situtation better. Changes from V4: - Patch 1/2: irq: Utility function to get affinity_hint by policy Thank you Ben for the great review: - Moved the function it kernel/irq/manage.c since it could be useful for block mq devices - Fixed Typo's - Use cpumask_t * instead of cpumask_var_t in function header - Restructured the function to remove NULL assignment in a cpumask_var_t - Fix for offline local CPU's Changes from V3: - Patch 2/2: net/mlx4_en: Use affinity hint - somehow patch file was corrupted Changes from V2: - Patch 1/2: net: Utility function to get affinity_hint by policy - Fixed style issues Changes from V1: - Patch 1/2: net: Utility function to get affinity_hint by policy - Fixed error flow to return -EINVAL on error (thanks govind) - Patch 2/2: net/mlx4_en: Use affinity hint - Set ring->affinity_hint to NULL on error Changes from V0: - Fixed small style issues ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/infiniband/hw/mlx4/main.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_cq.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_netdev.c30
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/eq.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/mlx4_en.h1
-rw-r--r--include/linux/cpumask.h2
-rw-r--r--include/linux/mlx4/device.h2
-rw-r--r--lib/cpumask.c64
8 files changed, 116 insertions, 4 deletions
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 199c7896f081..58b1f239ac2b 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1897,7 +1897,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
i, j, dev->pdev->bus->name);
/* Set IRQ for specific name (per ring) */
if (mlx4_assign_eq(dev, name, NULL,
- &ibdev->eq_table[eq])) {
+ &ibdev->eq_table[eq], NULL)) {
/* Use legacy (same as mlx4_en driver) */
pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
ibdev->eq_table[eq] =
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index 636963db598a..ea2cd72e5368 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -118,11 +118,15 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
if (cq->is_tx == RX) {
if (mdev->dev->caps.comp_pool) {
if (!cq->vector) {
+ struct mlx4_en_rx_ring *ring =
+ priv->rx_ring[cq->ring];
+
sprintf(name, "%s-%d", priv->dev->name,
cq->ring);
/* Set IRQ for specific name (per ring) */
if (mlx4_assign_eq(mdev->dev, name, rmap,
- &cq->vector)) {
+ &cq->vector,
+ ring->affinity_mask)) {
cq->vector = (cq->ring + 1 + priv->port)
% mdev->dev->caps.num_comp_vectors;
mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n",
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 58209bd0c94c..05d135572abc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1526,6 +1526,32 @@ static void mlx4_en_linkstate(struct work_struct *work)
mutex_unlock(&mdev->state_lock);
}
+static void mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
+{
+ struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx];
+ int numa_node = priv->mdev->dev->numa_node;
+
+ if (numa_node == -1)
+ return;
+
+ if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) {
+ en_err(priv, "Failed to allocate core mask\n");
+ return;
+ }
+
+ if (cpumask_set_cpu_local_first(ring_idx, numa_node,
+ ring->affinity_mask)) {
+ en_err(priv, "Failed setting affinity hint\n");
+ free_cpumask_var(ring->affinity_mask);
+ ring->affinity_mask = NULL;
+ }
+}
+
+static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
+{
+ free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask);
+ priv->rx_ring[ring_idx]->affinity_mask = NULL;
+}
int mlx4_en_start_port(struct net_device *dev)
{
@@ -1567,6 +1593,8 @@ int mlx4_en_start_port(struct net_device *dev)
mlx4_en_cq_init_lock(cq);
+ mlx4_en_init_affinity_hint(priv, i);
+
err = mlx4_en_activate_cq(priv, cq, i);
if (err) {
en_err(priv, "Failed activating Rx CQ\n");
@@ -1847,6 +1875,8 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
msleep(1);
mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
mlx4_en_deactivate_cq(priv, cq);
+
+ mlx4_en_free_affinity_hint(priv, i);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index d954ec1eac17..f91659e5fa13 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -1376,7 +1376,7 @@ int mlx4_test_interrupts(struct mlx4_dev *dev)
EXPORT_SYMBOL(mlx4_test_interrupts);
int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
- int *vector)
+ int *vector, cpumask_var_t cpu_hint_mask)
{
struct mlx4_priv *priv = mlx4_priv(dev);
@@ -1411,6 +1411,15 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
}
mlx4_assign_irq_notifier(priv, dev,
priv->eq_table.eq[vec].irq);
+ if (cpu_hint_mask) {
+ err = irq_set_affinity_hint(
+ priv->eq_table.eq[vec].irq,
+ cpu_hint_mask);
+ if (err) {
+ mlx4_warn(dev, "Failed setting affinity hint\n");
+ /*we dont want to break here*/
+ }
+ }
eq_set_ci(&priv->eq_table.eq[vec], 1);
}
@@ -1441,6 +1450,8 @@ void mlx4_release_eq(struct mlx4_dev *dev, int vec)
irq_set_affinity_notifier(
priv->eq_table.eq[vec].irq,
NULL);
+ irq_set_affinity_hint(priv->eq_table.eq[vec].irq,
+ NULL);
free_irq(priv->eq_table.eq[vec].irq,
&priv->eq_table.eq[vec]);
priv->msix_ctl.pool_bm &= ~(1ULL << i);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index b5db1bf361dc..0e15295bedd6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -313,6 +313,7 @@ struct mlx4_en_rx_ring {
unsigned long csum_ok;
unsigned long csum_none;
int hwtstamp_rx_filter;
+ cpumask_var_t affinity_mask;
};
struct mlx4_en_cq {
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d08e4d2a9b92..3551d667ef9f 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -257,6 +257,8 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp);
+
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index ca38871a585c..b9b70e00e3c1 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1161,7 +1161,7 @@ int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
int mlx4_SYNC_TPT(struct mlx4_dev *dev);
int mlx4_test_interrupts(struct mlx4_dev *dev);
int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
- int *vector);
+ int *vector, cpumask_t *cpu_hint_mask);
void mlx4_release_eq(struct mlx4_dev *dev, int vec);
int mlx4_get_phys_port_id(struct mlx4_dev *dev);
diff --git a/lib/cpumask.c b/lib/cpumask.c
index b810b753c607..14049a96f04a 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -163,4 +163,68 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
memblock_free_early(__pa(mask), cpumask_size());
}
+
+/**
+ * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first
+ *
+ * @i: index number
+ * @numa_node: local numa_node
+ * @dstp: cpumask with the relevant cpu bit set according to the policy
+ *
+ * This function sets the cpumask according to a numa aware policy.
+ * cpumask could be used as an affinity hint for the IRQ related to a
+ * queue. When the policy is to spread queues across cores - local cores
+ * first.
+ *
+ * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set
+ * the cpu bit and need to re-call the function.
+ */
+int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp)
+{
+ cpumask_var_t mask;
+ int cpu;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ i %= num_online_cpus();
+
+ if (!cpumask_of_node(numa_node)) {
+ /* Use all online cpu's for non numa aware system */
+ cpumask_copy(mask, cpu_online_mask);
+ } else {
+ int n;
+
+ cpumask_and(mask,
+ cpumask_of_node(numa_node), cpu_online_mask);
+
+ n = cpumask_weight(mask);
+ if (i >= n) {
+ i -= n;
+
+ /* If index > number of local cpu's, mask out local
+ * cpu's
+ */
+ cpumask_andnot(mask, cpu_online_mask, mask);
+ }
+ }
+
+ for_each_cpu(cpu, mask) {
+ if (--i < 0)
+ goto out;
+ }
+
+ ret = -EAGAIN;
+
+out:
+ free_cpumask_var(mask);
+
+ if (!ret)
+ cpumask_set_cpu(cpu, dstp);
+
+ return ret;
+}
+EXPORT_SYMBOL(cpumask_set_cpu_local_first);
+
#endif