diff options
author | VSR Burru <veerasenareddy.burru@cavium.com> | 2017-03-06 18:45:59 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-03-09 13:07:00 -0800 |
commit | 67e303e0c7683957eb4e530453705a43a6d4f966 (patch) | |
tree | fce0649052e8f551bea61bffc7788fb2faa4c0ee /drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | |
parent | net: ipv6: Remove redundant RTA_OIF in multipath routes (diff) | |
download | linux-dev-67e303e0c7683957eb4e530453705a43a6d4f966.tar.xz linux-dev-67e303e0c7683957eb4e530453705a43a6d4f966.zip |
liquidio: improve UDP TX performance
Improve UDP TX performance by:
* reducing the ring size from 2K to 512
* replacing the numerous streaming DMA allocations for info buffers and
gather lists with one large consistent DMA allocation per ring
BQL is not effective here. We reduced the ring size because dma_map_single
periodically incurs heavy overhead. With iommu=on, dma_map_single in the PF
Tx data path was taking a long time (~700 usec) once every ~250 packets.
We debugged the intel_iommu code and found that the PF driver was using
too many static IO virtual address mapping entries (for gather list entries
and info buffers): about 100K entries for two PFs, each using 8 rings.
Also, finding an empty entry (in the rbtree of the device domain's iova
mappings in the kernel) during the Tx path periodically becomes a
bottleneck: the loop that searches for an empty entry can run for over 40K
iterations, which is too costly and was the major source of overhead.
Overhead is low when this loop exits quickly.
Netperf benchmark numbers before and after patch:
PF UDP TX
+--------+--------+------------+------------+---------+
| | | Before | After | |
| Number | | Patch | Patch | |
| of | Packet | Throughput | Throughput | Percent |
| Flows | Size | (Gbps) | (Gbps) | Change |
+--------+--------+------------+------------+---------+
| | 360 | 0.52 | 0.93 | +78.9 |
| 1 | 1024 | 1.62 | 2.84 | +75.3 |
| | 1518 | 2.44 | 4.21 | +72.5 |
+--------+--------+------------+------------+---------+
| | 360 | 0.45 | 1.59 | +253.3 |
| 4 | 1024 | 1.34 | 5.48 | +308.9 |
| | 1518 | 2.27 | 8.31 | +266.1 |
+--------+--------+------------+------------+---------+
| | 360 | 0.40 | 1.61 | +302.5 |
| 8 | 1024 | 1.64 | 4.24 | +158.5 |
| | 1518 | 2.87 | 6.52 | +127.2 |
+--------+--------+------------+------------+---------+
VF UDP TX
+--------+--------+------------+------------+---------+
| | | Before | After | |
| Number | | Patch | Patch | |
| of | Packet | Throughput | Throughput | Percent |
| Flows | Size | (Gbps) | (Gbps) | Change |
+--------+--------+------------+------------+---------+
| | 360 | 1.28 | 1.49 | +16.4 |
| 1 | 1024 | 4.44 | 4.39 | -1.1 |
| | 1518 | 6.08 | 6.51 | +7.1 |
+--------+--------+------------+------------+---------+
| | 360 | 2.35 | 2.35 | 0.0 |
| 4 | 1024 | 6.41 | 8.07 | +25.9 |
| | 1518 | 9.56 | 9.54 | -0.2 |
+--------+--------+------------+------------+---------+
| | 360 | 3.41 | 3.65 | +7.0 |
| 8 | 1024 | 9.35 | 9.34 | -0.1 |
| | 1518 | 9.56 | 9.57 | +0.1 |
+--------+--------+------------+------------+---------+
Signed-off-by: VSR Burru <veerasenareddy.burru@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/cavium/liquidio/lio_vf_main.c')
-rw-r--r-- | drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 104 |
1 files changed, 55 insertions, 49 deletions
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index 9d5e03502c76..7b83be4ce1fe 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -108,6 +108,8 @@ struct octnic_gather { * received from the IP layer. */ struct octeon_sg_entry *sg; + + dma_addr_t sg_dma_ptr; }; struct octeon_device_priv { @@ -490,6 +492,9 @@ static void delete_glists(struct lio *lio) struct octnic_gather *g; int i; + kfree(lio->glist_lock); + lio->glist_lock = NULL; + if (!lio->glist) return; @@ -497,17 +502,26 @@ static void delete_glists(struct lio *lio) do { g = (struct octnic_gather *) list_delete_head(&lio->glist[i]); - if (g) { - if (g->sg) - kfree((void *)((unsigned long)g->sg - - g->adjust)); + if (g) kfree(g); - } } while (g); + + if (lio->glists_virt_base && lio->glists_virt_base[i]) { + lio_dma_free(lio->oct_dev, + lio->glist_entry_size * lio->tx_qsize, + lio->glists_virt_base[i], + lio->glists_dma_base[i]); + } } + kfree(lio->glists_virt_base); + lio->glists_virt_base = NULL; + + kfree(lio->glists_dma_base); + lio->glists_dma_base = NULL; + kfree(lio->glist); - kfree(lio->glist_lock); + lio->glist = NULL; } /** @@ -522,13 +536,30 @@ static int setup_glists(struct lio *lio, int num_iqs) lio->glist_lock = kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL); if (!lio->glist_lock) - return 1; + return -ENOMEM; lio->glist = kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL); if (!lio->glist) { kfree(lio->glist_lock); - return 1; + lio->glist_lock = NULL; + return -ENOMEM; + } + + lio->glist_entry_size = + ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE); + + /* allocate memory to store virtual and dma base address of + * per glist consistent memory + */ + lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base), + GFP_KERNEL); + lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base), + GFP_KERNEL); + + if 
(!lio->glists_virt_base || !lio->glists_dma_base) { + delete_glists(lio); + return -ENOMEM; } for (i = 0; i < num_iqs; i++) { @@ -536,34 +567,33 @@ static int setup_glists(struct lio *lio, int num_iqs) INIT_LIST_HEAD(&lio->glist[i]); + lio->glists_virt_base[i] = + lio_dma_alloc(lio->oct_dev, + lio->glist_entry_size * lio->tx_qsize, + &lio->glists_dma_base[i]); + + if (!lio->glists_virt_base[i]) { + delete_glists(lio); + return -ENOMEM; + } + for (j = 0; j < lio->tx_qsize; j++) { g = kzalloc(sizeof(*g), GFP_KERNEL); if (!g) break; - g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * - OCT_SG_ENTRY_SIZE); + g->sg = lio->glists_virt_base[i] + + (j * lio->glist_entry_size); - g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL); - if (!g->sg) { - kfree(g); - break; - } + g->sg_dma_ptr = lio->glists_dma_base[i] + + (j * lio->glist_entry_size); - /* The gather component should be aligned on 64-bit - * boundary - */ - if (((unsigned long)g->sg) & 7) { - g->adjust = 8 - (((unsigned long)g->sg) & 7); - g->sg = (struct octeon_sg_entry *) - ((unsigned long)g->sg + g->adjust); - } list_add_tail(&g->list, &lio->glist[i]); } if (j != lio->tx_qsize) { delete_glists(lio); - return 1; + return -ENOMEM; } } @@ -1324,10 +1354,6 @@ static void free_netsgbuf(void *buf) i++; } - dma_unmap_single(&lio->oct_dev->pci_dev->dev, - finfo->dptr, g->sg_size, - DMA_TO_DEVICE); - iq = skb_iq(lio, skb); spin_lock(&lio->glist_lock[iq]); @@ -1374,10 +1400,6 @@ static void free_netsgbuf_with_resp(void *buf) i++; } - dma_unmap_single(&lio->oct_dev->pci_dev->dev, - finfo->dptr, g->sg_size, - DMA_TO_DEVICE); - iq = skb_iq(lio, skb); spin_lock(&lio->glist_lock[iq]); @@ -2382,23 +2404,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) i++; } - dptr = dma_map_single(&oct->pci_dev->dev, - g->sg, g->sg_size, - DMA_TO_DEVICE); - if (dma_mapping_error(&oct->pci_dev->dev, dptr)) { - dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n", - __func__); - dma_unmap_single(&oct->pci_dev->dev, 
g->sg[0].ptr[0], - skb->len - skb->data_len, - DMA_TO_DEVICE); - for (j = 1; j <= frags; j++) { - frag = &skb_shinfo(skb)->frags[j - 1]; - dma_unmap_page(&oct->pci_dev->dev, - g->sg[j >> 2].ptr[j & 3], - frag->size, DMA_TO_DEVICE); - } - return NETDEV_TX_BUSY; - } + dptr = g->sg_dma_ptr; ndata.cmd.cmd3.dptr = dptr; finfo->dptr = dptr; |