diff options
Diffstat (limited to 'drivers/net/ethernet/google/gve/gve_tx.c')
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx.c | 358 |
1 files changed, 256 insertions, 102 deletions
diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index d0244feb0301..4888bf05fbed 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include "gve.h" #include "gve_adminq.h" +#include "gve_utils.h" #include <linux/ip.h> #include <linux/tcp.h> #include <linux/vmalloc.h> @@ -131,14 +132,6 @@ static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) atomic_add(bytes, &fifo->available); } -static void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) -{ - struct gve_notify_block *block = - &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; - - block->tx = NULL; -} - static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, u32 to_do, bool try_to_wake); @@ -151,16 +144,18 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx) gve_tx_remove_from_block(priv, idx); slots = tx->mask + 1; - gve_clean_tx_done(priv, tx, tx->req, false); + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); netdev_tx_reset_queue(tx->netdev_txq); dma_free_coherent(hdev, sizeof(*tx->q_resources), tx->q_resources, tx->q_resources_bus); tx->q_resources = NULL; - gve_tx_fifo_release(priv, &tx->tx_fifo); - gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); - tx->tx_fifo.qpl = NULL; + if (!tx->raw_addressing) { + gve_tx_fifo_release(priv, &tx->tx_fifo); + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); + tx->tx_fifo.qpl = NULL; + } bytes = sizeof(*tx->desc) * slots; dma_free_coherent(hdev, bytes, tx->desc, tx->bus); @@ -172,16 +167,6 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx) netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); } -static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) -{ - int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); - struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - struct gve_tx_ring *tx = &priv->tx[queue_idx]; - - block->tx = tx; - tx->ntfy_id = ntfy_idx; -} - static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) { struct gve_tx_ring *tx = &priv->tx[idx]; @@ -191,6 +176,7 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) /* Make sure everything is zeroed to start */ memset(tx, 0, sizeof(*tx)); + spin_lock_init(&tx->clean_lock); tx->q_num = idx; tx->mask = slots - 1; @@ -206,11 +192,16 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) if (!tx->desc) goto abort_with_info; - tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); - - /* map Tx FIFO */ - if (gve_tx_fifo_init(priv, &tx->tx_fifo)) - goto abort_with_desc; + tx->raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT; + tx->dev = &priv->pdev->dev; + if (!tx->raw_addressing) { + tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); + if (!tx->tx_fifo.qpl) + goto abort_with_desc; + /* map Tx FIFO */ + if (gve_tx_fifo_init(priv, &tx->tx_fifo)) + goto abort_with_qpl; + } tx->q_resources = dma_alloc_coherent(hdev, @@ -228,7 +219,11 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) return 0; abort_with_fifo: - gve_tx_fifo_release(priv, &tx->tx_fifo); + if (!tx->raw_addressing) + gve_tx_fifo_release(priv, &tx->tx_fifo); +abort_with_qpl: + if (!tx->raw_addressing) + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); abort_with_desc: dma_free_coherent(hdev, bytes, tx->desc, tx->bus); tx->desc = NULL; @@ -262,7 +257,7 @@ int gve_tx_alloc_rings(struct gve_priv *priv) return err; } -void gve_tx_free_rings(struct gve_priv *priv) +void gve_tx_free_rings_gqi(struct gve_priv *priv) { int i; @@ -301,53 +296,81 @@ static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx, return bytes; } -/* The most descriptors we could need are 3 - 1 for the headers, 1 for - * the beginning of the payload at the end of the FIFO, and 1 if the - * payload wraps to the beginning of the FIFO. +/* The most descriptors we could need is MAX_SKB_FRAGS + 4 : + * 1 for each skb frag + * 1 for the skb linear portion + * 1 for when tcp hdr needs to be in separate descriptor + * 1 if the payload wraps to the beginning of the FIFO + * 1 for metadata descriptor */ -#define MAX_TX_DESC_NEEDED 3 +#define MAX_TX_DESC_NEEDED (MAX_SKB_FRAGS + 4) +static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info) +{ + if (info->skb) { + dma_unmap_single(dev, dma_unmap_addr(info, dma), + dma_unmap_len(info, len), + DMA_TO_DEVICE); + dma_unmap_len_set(info, len, 0); + } else { + dma_unmap_page(dev, dma_unmap_addr(info, dma), + dma_unmap_len(info, len), + DMA_TO_DEVICE); + dma_unmap_len_set(info, len, 0); + } +} /* Check if sufficient resources (descriptor ring space, FIFO space) are * available to transmit the given number of bytes. */ static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) { - return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && - gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required)); + bool can_alloc = true; + + if (!tx->raw_addressing) + can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required); + + return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc); } +static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED); + /* Stops the queue if the skb cannot be transmitted. */ -static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb) +static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) { - int bytes_required; + int bytes_required = 0; + u32 nic_done; + u32 to_do; + int ret; + + if (!tx->raw_addressing) + bytes_required = gve_skb_fifo_bytes_required(tx, skb); - bytes_required = gve_skb_fifo_bytes_required(tx, skb); if (likely(gve_can_tx(tx, bytes_required))) return 0; - /* No space, so stop the queue */ - tx->stop_queue++; - netif_tx_stop_queue(tx->netdev_txq); - smp_mb(); /* sync with restarting queue in gve_clean_tx_done() */ - - /* Now check for resources again, in case gve_clean_tx_done() freed - * resources after we checked and we stopped the queue after - * gve_clean_tx_done() checked. - * - * gve_maybe_stop_tx() gve_clean_tx_done() - * nsegs/can_alloc test failed - * gve_tx_free_fifo() - * if (tx queue stopped) - * netif_tx_queue_wake() - * netif_tx_stop_queue() - * Need to check again for space here! - */ - if (likely(!gve_can_tx(tx, bytes_required))) - return -EBUSY; + ret = -EBUSY; + spin_lock(&tx->clean_lock); + nic_done = gve_tx_load_event_counter(priv, tx); + to_do = nic_done - tx->done; - netif_tx_start_queue(tx->netdev_txq); - tx->wake_queue++; - return 0; + /* Only try to clean if there is hope for TX */ + if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) { + if (to_do > 0) { + to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT); + gve_clean_tx_done(priv, tx, to_do, false); + } + if (likely(gve_can_tx(tx, bytes_required))) + ret = 0; + } + if (ret) { + /* No space, so stop the queue */ + tx->stop_queue++; + netif_tx_stop_queue(tx->netdev_txq); + } + spin_unlock(&tx->clean_lock); + + return ret; } static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc, @@ -375,6 +398,19 @@ static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc, pkt_desc->pkt.seg_addr = cpu_to_be64(addr); } +static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc, + struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt)); + + mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; + mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT | + GVE_MTD_PATH_HASH_L4; + mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash); + mtd_desc->mtd.reserved0 = 0; + mtd_desc->mtd.reserved1 = 0; +} + static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc, struct sk_buff *skb, bool is_gso, u16 len, u64 addr) @@ -395,21 +431,18 @@ static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses, { u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; u64 first_page = iov_offset / PAGE_SIZE; - dma_addr_t dma; u64 page; - for (page = first_page; page <= last_page; page++) { - dma = page_buses[page]; - dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE); - } + for (page = first_page; page <= last_page; page++) + dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE); } -static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, - struct device *dev) +static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb) { int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset; union gve_tx_desc *pkt_desc, *seg_desc; struct gve_tx_buffer_state *info; + int mtd_desc_nr = !!skb->l4_hash; bool is_gso = skb_is_gso(skb); u32 idx = tx->req & tx->mask; int payload_iov = 2; @@ -441,19 +474,24 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, &info->iov[payload_iov]); gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, - 1 + payload_nfrags, hlen, + 1 + mtd_desc_nr + payload_nfrags, hlen, info->iov[hdr_nfrags - 1].iov_offset); skb_copy_bits(skb, 0, tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset, hlen); - gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, + gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses, info->iov[hdr_nfrags - 1].iov_offset, info->iov[hdr_nfrags - 1].iov_len); copy_offset = hlen; + if (mtd_desc_nr) { + next_idx = (tx->req + 1) & tx->mask; + gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb); + } + for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { - next_idx = (tx->req + 1 + i - payload_iov) & tx->mask; + next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; seg_desc = &tx->desc[next_idx]; gve_tx_fill_seg_desc(seg_desc, skb, is_gso, @@ -463,13 +501,109 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, skb_copy_bits(skb, copy_offset, tx->tx_fifo.base + info->iov[i].iov_offset, info->iov[i].iov_len); - gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, + gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses, info->iov[i].iov_offset, info->iov[i].iov_len); copy_offset += info->iov[i].iov_len; } - return 1 + payload_nfrags; + return 1 + mtd_desc_nr + payload_nfrags; +} + +static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + int hlen, num_descriptors, l4_hdr_offset; + union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc; + struct gve_tx_buffer_state *info; + int mtd_desc_nr = !!skb->l4_hash; + bool is_gso = skb_is_gso(skb); + u32 idx = tx->req & tx->mask; + u64 addr; + u32 len; + int i; + + info = &tx->info[idx]; + pkt_desc = &tx->desc[idx]; + + l4_hdr_offset = skb_checksum_start_offset(skb); + /* If the skb is gso, then we want only up to the tcp header in the first segment + * to efficiently replicate on each segment otherwise we want the linear portion + * of the skb (which will contain the checksum because skb->csum_start and + * skb->csum_offset are given relative to skb->head) in the first segment. + */ + hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb); + len = skb_headlen(skb); + + info->skb = skb; + + addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) { + tx->dma_mapping_error++; + goto drop; + } + dma_unmap_len_set(info, len, len); + dma_unmap_addr_set(info, dma, addr); + + num_descriptors = 1 + shinfo->nr_frags; + if (hlen < len) + num_descriptors++; + if (mtd_desc_nr) + num_descriptors++; + + gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, + num_descriptors, hlen, addr); + + if (mtd_desc_nr) { + idx = (idx + 1) & tx->mask; + mtd_desc = &tx->desc[idx]; + gve_tx_fill_mtd_desc(mtd_desc, skb); + } + + if (hlen < len) { + /* For gso the rest of the linear portion of the skb needs to + * be in its own descriptor. + */ + len -= hlen; + addr += hlen; + idx = (idx + 1) & tx->mask; + seg_desc = &tx->desc[idx]; + gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + const skb_frag_t *frag = &shinfo->frags[i]; + + idx = (idx + 1) & tx->mask; + seg_desc = &tx->desc[idx]; + len = skb_frag_size(frag); + addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) { + tx->dma_mapping_error++; + goto unmap_drop; + } + tx->info[idx].skb = NULL; + dma_unmap_len_set(&tx->info[idx], len, len); + dma_unmap_addr_set(&tx->info[idx], dma, addr); + + gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr); + } + + return num_descriptors; + +unmap_drop: + i += num_descriptors - shinfo->nr_frags; + while (i--) { + /* Skip metadata descriptor, if set */ + if (i == 1 && mtd_desc_nr == 1) + continue; + idx--; + gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]); + } +drop: + tx->dropped_pkt++; + return 0; } netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) @@ -478,10 +612,10 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) struct gve_tx_ring *tx; int nsegs; - WARN(skb_get_queue_mapping(skb) > priv->tx_cfg.num_queues, + WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues, "skb queue index out of range"); tx = &priv->tx[skb_get_queue_mapping(skb)]; - if (unlikely(gve_maybe_stop_tx(tx, skb))) { + if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) { /* We need to ring the txq doorbell -- we have stopped the Tx * queue for want of resources, but prior calls to gve_tx() * may have added descriptors without ringing the doorbell. @@ -490,17 +624,26 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) gve_tx_put_doorbell(priv, tx->q_resources, tx->req); return NETDEV_TX_BUSY; } - nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev); - - netdev_tx_sent_queue(tx->netdev_txq, skb->len); - skb_tx_timestamp(skb); - - /* give packets to NIC */ - tx->req += nsegs; + if (tx->raw_addressing) + nsegs = gve_tx_add_skb_no_copy(priv, tx, skb); + else + nsegs = gve_tx_add_skb_copy(priv, tx, skb); + + /* If the packet is getting sent, we need to update the skb */ + if (nsegs) { + netdev_tx_sent_queue(tx->netdev_txq, skb->len); + skb_tx_timestamp(skb); + tx->req += nsegs; + } else { + dev_kfree_skb_any(skb); + } if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) return NETDEV_TX_OK; + /* Give packets to NIC. Even if this packet failed to send the doorbell + * might need to be rung because of xmit_more. + */ gve_tx_put_doorbell(priv, tx->q_resources, tx->req); return NETDEV_TX_OK; } @@ -525,24 +668,29 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, info = &tx->info[idx]; skb = info->skb; + /* Unmap the buffer */ + if (tx->raw_addressing) + gve_tx_unmap_buf(tx->dev, info); + tx->done++; /* Mark as free */ if (skb) { info->skb = NULL; bytes += skb->len; pkts++; dev_consume_skb_any(skb); + if (tx->raw_addressing) + continue; /* FIFO free */ for (i = 0; i < ARRAY_SIZE(info->iov); i++) { - space_freed += info->iov[i].iov_len + - info->iov[i].iov_padding; + space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } } - tx->done++; } - gve_tx_free_fifo(&tx->tx_fifo, space_freed); + if (!tx->raw_addressing) + gve_tx_free_fifo(&tx->tx_fifo, space_freed); u64_stats_update_begin(&tx->statss); tx->bytes_done += bytes; tx->pkt_done += pkts; @@ -563,19 +711,19 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, return pkts; } -__be32 gve_tx_load_event_counter(struct gve_priv *priv, - struct gve_tx_ring *tx) +u32 gve_tx_load_event_counter(struct gve_priv *priv, + struct gve_tx_ring *tx) { - u32 counter_index = be32_to_cpu((tx->q_resources->counter_index)); + u32 counter_index = be32_to_cpu(tx->q_resources->counter_index); + __be32 counter = READ_ONCE(priv->counter_array[counter_index]); - return READ_ONCE(priv->counter_array[counter_index]); + return be32_to_cpu(counter); } bool gve_tx_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; - bool repoll = false; u32 nic_done; u32 to_do; @@ -583,17 +731,23 @@ bool gve_tx_poll(struct gve_notify_block *block, int budget) if (budget == 0) budget = INT_MAX; + /* In TX path, it may try to clean completed pkts in order to xmit, + * to avoid cleaning conflict, use spin_lock(), it yields better + * concurrency between xmit/clean than netif's lock. + */ + spin_lock(&tx->clean_lock); /* Find out how much work there is to be done */ - tx->last_nic_done = gve_tx_load_event_counter(priv, tx); - nic_done = be32_to_cpu(tx->last_nic_done); - if (budget > 0) { - /* Do as much work as we have that the budget will - * allow - */ - to_do = min_t(u32, (nic_done - tx->done), budget); - gve_clean_tx_done(priv, tx, to_do, true); - } + nic_done = gve_tx_load_event_counter(priv, tx); + to_do = min_t(u32, (nic_done - tx->done), budget); + gve_clean_tx_done(priv, tx, to_do, true); + spin_unlock(&tx->clean_lock); /* If we still have work we want to repoll */ - repoll |= (nic_done != tx->done); - return repoll; + return nic_done != tx->done; +} + +bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + u32 nic_done = gve_tx_load_event_counter(priv, tx); + + return nic_done != tx->done; } |