diff options
Diffstat (limited to 'drivers/net/ethernet/google')
-rw-r--r-- | drivers/net/ethernet/google/Kconfig | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/Makefile | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve.h | 506 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.c | 679 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.h | 194 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_desc.h | 52 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_desc_dqo.h | 256 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_dqo.h | 93 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_ethtool.c | 480 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_main.c | 836 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_register.h | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_rx.c | 645 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_rx_dqo.c | 756 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx.c | 358 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx_dqo.c | 1022 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_utils.c | 99 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_utils.h | 28 |
17 files changed, 5414 insertions, 595 deletions
diff --git a/drivers/net/ethernet/google/Kconfig b/drivers/net/ethernet/google/Kconfig index b8f04d052fda..8641a00f8e63 100644 --- a/drivers/net/ethernet/google/Kconfig +++ b/drivers/net/ethernet/google/Kconfig @@ -17,7 +17,7 @@ if NET_VENDOR_GOOGLE config GVE tristate "Google Virtual NIC (gVNIC) support" - depends on PCI_MSI + depends on (PCI_MSI && (X86 || CPU_LITTLE_ENDIAN)) help This driver supports Google Virtual NIC (gVNIC)" diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index 3354ce40eb97..b9a6be76531b 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,4 @@ # Makefile for the Google virtual Ethernet (gve) driver obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_tx.o gve_rx.o gve_ethtool.o gve_adminq.o +gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index ebc37e256922..160735484465 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0 OR MIT) * Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #ifndef _GVE_H_ @@ -11,7 +11,9 @@ #include <linux/netdevice.h> #include <linux/pci.h> #include <linux/u64_stats_sync.h> + #include "gve_desc.h" +#include "gve_desc_dqo.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 @@ -27,6 +29,24 @@ /* 1 for management, 1 for rx, 1 for tx */ #define GVE_MIN_MSIX 3 +/* Numbers of gve tx/rx stats in stats report. */ +#define GVE_TX_STATS_REPORT_NUM 6 +#define GVE_RX_STATS_REPORT_NUM 2 + +/* Interval to schedule a stats report update, 20000ms. */ +#define GVE_STATS_REPORT_TIMER_PERIOD 20000 + +/* Numbers of NIC tx/rx stats in stats report. */ +#define NIC_TX_STATS_REPORT_NUM 0 +#define NIC_RX_STATS_REPORT_NUM 4 + +#define GVE_DATA_SLOT_ADDR_PAGE_MASK (~(PAGE_SIZE - 1)) + +/* PTYPEs are always 10 bits. */ +#define GVE_NUM_PTYPES 1024 + +#define GVE_RX_BUFFER_SIZE_DQO 2048 + /* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */ struct gve_rx_desc_queue { struct gve_rx_desc *desc_ring; /* the descriptor ring */ @@ -39,6 +59,8 @@ struct gve_rx_slot_page_info { struct page *page; void *page_address; u32 page_offset; /* offset to write to in page */ + int pagecnt_bias; /* expected pagecnt if only the driver has a ref */ + u8 can_flip; }; /* A list of pages registered with the device during setup and used by a queue @@ -53,34 +75,161 @@ struct gve_queue_page_list { /* Each slot in the data ring has a 1:1 mapping to a slot in the desc ring */ struct gve_rx_data_queue { - struct gve_rx_data_slot *data_ring; /* read by NIC */ + union gve_rx_data_slot *data_ring; /* read by NIC */ dma_addr_t data_bus; /* dma mapping of the slots */ struct gve_rx_slot_page_info *page_info; /* page info of the buffers */ struct gve_queue_page_list *qpl; /* qpl assigned to this queue */ + u8 raw_addressing; /* use raw_addressing? */ }; struct gve_priv; -/* An RX ring that contains a power-of-two sized desc and data ring. */ +/* RX buffer queue for posting buffers to HW. + * Each RX (completion) queue has a corresponding buffer queue. + */ +struct gve_rx_buf_queue_dqo { + struct gve_rx_desc_dqo *desc_ring; + dma_addr_t bus; + u32 head; /* Pointer to start cleaning buffers at. */ + u32 tail; /* Last posted buffer index + 1 */ + u32 mask; /* Mask for indices to the size of the ring */ +}; + +/* RX completion queue to receive packets from HW. */ +struct gve_rx_compl_queue_dqo { + struct gve_rx_compl_desc_dqo *desc_ring; + dma_addr_t bus; + + /* Number of slots which did not have a buffer posted yet. We should not + * post more buffers than the queue size to avoid HW overrunning the + * queue. + */ + int num_free_slots; + + /* HW uses a "generation bit" to notify SW of new descriptors. When a + * descriptor's generation bit is different from the current generation, + * that descriptor is ready to be consumed by SW. + */ + u8 cur_gen_bit; + + /* Pointer into desc_ring where the next completion descriptor will be + * received. + */ + u32 head; + u32 mask; /* Mask for indices to the size of the ring */ +}; + +/* Stores state for tracking buffers posted to HW */ +struct gve_rx_buf_state_dqo { + /* The page posted to HW. */ + struct gve_rx_slot_page_info page_info; + + /* The DMA address corresponding to `page_info`. */ + dma_addr_t addr; + + /* Last offset into the page when it only had a single reference, at + * which point every other offset is free to be reused. + */ + u32 last_single_ref_offset; + + /* Linked list index to next element in the list, or -1 if none */ + s16 next; +}; + +/* `head` and `tail` are indices into an array, or -1 if empty. */ +struct gve_index_list { + s16 head; + s16 tail; +}; + +/* A single received packet split across multiple buffers may be + * reconstructed using the information in this structure. + */ +struct gve_rx_ctx { + /* head and tail of skb chain for the current packet or NULL if none */ + struct sk_buff *skb_head; + struct sk_buff *skb_tail; + u16 total_expected_size; + u8 expected_frag_cnt; + u8 curr_frag_cnt; + u8 reuse_frags; +}; + +/* Contains datapath state used to represent an RX queue. */ struct gve_rx_ring { struct gve_priv *gve; - struct gve_rx_desc_queue desc; - struct gve_rx_data_queue data; + union { + /* GQI fields */ + struct { + struct gve_rx_desc_queue desc; + struct gve_rx_data_queue data; + + /* threshold for posting new buffs and descs */ + u32 db_threshold; + u16 packet_buffer_size; + }; + + /* DQO fields. */ + struct { + struct gve_rx_buf_queue_dqo bufq; + struct gve_rx_compl_queue_dqo complq; + + struct gve_rx_buf_state_dqo *buf_states; + u16 num_buf_states; + + /* Linked list of gve_rx_buf_state_dqo. Index into + * buf_states, or -1 if empty. + */ + s16 free_buf_states; + + /* Linked list of gve_rx_buf_state_dqo. Indexes into + * buf_states, or -1 if empty. + * + * This list contains buf_states which are pointing to + * valid buffers. + * + * We use a FIFO here in order to increase the + * probability that buffers can be reused by increasing + * the time between usages. + */ + struct gve_index_list recycled_buf_states; + + /* Linked list of gve_rx_buf_state_dqo. Indexes into + * buf_states, or -1 if empty. + * + * This list contains buf_states which have buffers + * which cannot be reused yet. + */ + struct gve_index_list used_buf_states; + } dqo; + }; + u64 rbytes; /* free-running bytes received */ u64 rpackets; /* free-running packets received */ u32 cnt; /* free-running total number of completed packets */ u32 fill_cnt; /* free-running total number of descs and buffs posted */ u32 mask; /* masks the cnt and fill_cnt to the size of the ring */ + u64 rx_copybreak_pkt; /* free-running count of copybreak packets */ + u64 rx_copied_pkt; /* free-running total number of copied packets */ + u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ + u64 rx_buf_alloc_fail; /* free-running count of buffer alloc fails */ + u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */ + u64 rx_cont_packet_cnt; /* free-running multi-fragment packets received */ + u64 rx_frag_flip_cnt; /* free-running count of rx segments where page_flip was used */ + u64 rx_frag_copy_cnt; /* free-running count of rx segments copied into skb linear portion */ u32 q_num; /* queue index */ u32 ntfy_id; /* notification block index */ struct gve_queue_resources *q_resources; /* head and tail pointer idx */ dma_addr_t q_resources_bus; /* dma address for the queue resources */ struct u64_stats_sync statss; /* sync stats for 32bit archs */ + + struct gve_rx_ctx ctx; /* Info for packet currently being processed in this ring. */ }; /* A TX desc ring entry */ union gve_tx_desc { struct gve_tx_pkt_desc pkt; /* first desc for a packet */ + struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ }; @@ -96,7 +245,13 @@ struct gve_tx_iovec { */ struct gve_tx_buffer_state { struct sk_buff *skb; /* skb for this pkt */ - struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */ + union { + struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */ + struct { + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + }; + }; }; /* A TX buffer - each queue has one */ @@ -108,32 +263,178 @@ struct gve_tx_fifo { struct gve_queue_page_list *qpl; /* QPL mapped into this FIFO */ }; -/* A TX ring that contains a power-of-two sized desc ring and a FIFO buffer */ +/* TX descriptor for DQO format */ +union gve_tx_desc_dqo { + struct gve_tx_pkt_desc_dqo pkt; + struct gve_tx_tso_context_desc_dqo tso_ctx; + struct gve_tx_general_context_desc_dqo general_ctx; +}; + +enum gve_packet_state { + /* Packet is in free list, available to be allocated. + * This should always be zero since state is not explicitly initialized. + */ + GVE_PACKET_STATE_UNALLOCATED, + /* Packet is expecting a regular data completion or miss completion */ + GVE_PACKET_STATE_PENDING_DATA_COMPL, + /* Packet has received a miss completion and is expecting a + * re-injection completion. + */ + GVE_PACKET_STATE_PENDING_REINJECT_COMPL, + /* No valid completion received within the specified timeout. */ + GVE_PACKET_STATE_TIMED_OUT_COMPL, +}; + +struct gve_tx_pending_packet_dqo { + struct sk_buff *skb; /* skb for this packet */ + + /* 0th element corresponds to the linear portion of `skb`, should be + * unmapped with `dma_unmap_single`. + * + * All others correspond to `skb`'s frags and should be unmapped with + * `dma_unmap_page`. + */ + DEFINE_DMA_UNMAP_ADDR(dma[MAX_SKB_FRAGS + 1]); + DEFINE_DMA_UNMAP_LEN(len[MAX_SKB_FRAGS + 1]); + u16 num_bufs; + + /* Linked list index to next element in the list, or -1 if none */ + s16 next; + + /* Linked list index to prev element in the list, or -1 if none. + * Used for tracking either outstanding miss completions or prematurely + * freed packets. + */ + s16 prev; + + /* Identifies the current state of the packet as defined in + * `enum gve_packet_state`. + */ + u8 state; + + /* If packet is an outstanding miss completion, then the packet is + * freed if the corresponding re-injection completion is not received + * before kernel jiffies exceeds timeout_jiffies. + */ + unsigned long timeout_jiffies; +}; + +/* Contains datapath state used to represent a TX queue. */ struct gve_tx_ring { /* Cacheline 0 -- Accessed & dirtied during transmit */ - struct gve_tx_fifo tx_fifo; - u32 req; /* driver tracked head pointer */ - u32 done; /* driver tracked tail pointer */ + union { + /* GQI fields */ + struct { + struct gve_tx_fifo tx_fifo; + u32 req; /* driver tracked head pointer */ + u32 done; /* driver tracked tail pointer */ + }; + + /* DQO fields. */ + struct { + /* Linked list of gve_tx_pending_packet_dqo. Index into + * pending_packets, or -1 if empty. + * + * This is a consumer list owned by the TX path. When it + * runs out, the producer list is stolen from the + * completion handling path + * (dqo_compl.free_pending_packets). + */ + s16 free_pending_packets; + + /* Cached value of `dqo_compl.hw_tx_head` */ + u32 head; + u32 tail; /* Last posted buffer index + 1 */ + + /* Index of the last descriptor with "report event" bit + * set. + */ + u32 last_re_idx; + } dqo_tx; + }; /* Cacheline 1 -- Accessed & dirtied during gve_clean_tx_done */ - __be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */ + union { + /* GQI fields */ + struct { + /* Spinlock for when cleanup in progress */ + spinlock_t clean_lock; + }; + + /* DQO fields. */ + struct { + u32 head; /* Last read on compl_desc */ + + /* Tracks the current gen bit of compl_q */ + u8 cur_gen_bit; + + /* Linked list of gve_tx_pending_packet_dqo. Index into + * pending_packets, or -1 if empty. + * + * This is the producer list, owned by the completion + * handling path. When the consumer list + * (dqo_tx.free_pending_packets) is runs out, this list + * will be stolen. + */ + atomic_t free_pending_packets; + + /* Last TX ring index fetched by HW */ + atomic_t hw_tx_head; + + /* List to track pending packets which received a miss + * completion but not a corresponding reinjection. + */ + struct gve_index_list miss_completions; + + /* List to track pending packets that were completed + * before receiving a valid completion because they + * reached a specified timeout. + */ + struct gve_index_list timed_out_completions; + } dqo_compl; + } ____cacheline_aligned; u64 pkt_done; /* free-running - total packets completed */ u64 bytes_done; /* free-running - total bytes completed */ + u64 dropped_pkt; /* free-running - total packets dropped */ + u64 dma_mapping_error; /* count of dma mapping errors */ /* Cacheline 2 -- Read-mostly fields */ - union gve_tx_desc *desc ____cacheline_aligned; - struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */ + union { + /* GQI fields */ + struct { + union gve_tx_desc *desc; + + /* Maps 1:1 to a desc */ + struct gve_tx_buffer_state *info; + }; + + /* DQO fields. */ + struct { + union gve_tx_desc_dqo *tx_ring; + struct gve_tx_compl_desc *compl_ring; + + struct gve_tx_pending_packet_dqo *pending_packets; + s16 num_pending_packets; + + u32 complq_mask; /* complq size is complq_mask + 1 */ + } dqo; + } ____cacheline_aligned; struct netdev_queue *netdev_txq; struct gve_queue_resources *q_resources; /* head and tail pointer idx */ + struct device *dev; u32 mask; /* masks req and done down to queue size */ + u8 raw_addressing; /* use raw_addressing? */ /* Slow-path fields */ u32 q_num ____cacheline_aligned; /* queue idx */ u32 stop_queue; /* count of queue stops */ u32 wake_queue; /* count of queue wakes */ + u32 queue_timeout; /* count of queue timeouts */ u32 ntfy_id; /* notification block index */ + u32 last_kick_msec; /* Last time the queue was kicked */ dma_addr_t bus; /* dma address of the descr ring */ dma_addr_t q_resources_bus; /* dma address of the queue resources */ + dma_addr_t complq_bus_dqo; /* dma address of the dqo.compl_ring */ struct u64_stats_sync statss; /* sync stats for 32bit archs */ } ____cacheline_aligned; @@ -141,13 +442,13 @@ struct gve_tx_ring { * associated with that irq. */ struct gve_notify_block { - __be32 irq_db_index; /* idx into Bar2 - set by device, must be 1st */ + __be32 *irq_db_index; /* pointer to idx into Bar2 */ char name[IFNAMSIZ + 16]; /* name registered with the kernel */ struct napi_struct napi; /* kernel napi struct for this block */ struct gve_priv *priv; struct gve_tx_ring *tx; /* tx rings on this block */ struct gve_rx_ring *rx; /* rx rings on this block */ -} ____cacheline_aligned; +}; /* Tracks allowed and current queue settings */ struct gve_queue_config { @@ -161,13 +462,43 @@ struct gve_qpl_config { unsigned long *qpl_id_map; /* bitmap of used qpl ids */ }; +struct gve_options_dqo_rda { + u16 tx_comp_ring_entries; /* number of tx_comp descriptors */ + u16 rx_buff_ring_entries; /* number of rx_buff descriptors */ +}; + +struct gve_irq_db { + __be32 index; +} ____cacheline_aligned; + +struct gve_ptype { + u8 l3_type; /* `gve_l3_type` in gve_adminq.h */ + u8 l4_type; /* `gve_l4_type` in gve_adminq.h */ +}; + +struct gve_ptype_lut { + struct gve_ptype ptypes[GVE_NUM_PTYPES]; +}; + +/* GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value + * when the entire configure_device_resources command is zeroed out and the + * queue_format is not specified. + */ +enum gve_queue_format { + GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, + GVE_GQI_RDA_FORMAT = 0x1, + GVE_GQI_QPL_FORMAT = 0x2, + GVE_DQO_RDA_FORMAT = 0x3, +}; + struct gve_priv { struct net_device *dev; struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */ struct gve_rx_ring *rx; /* array of rx_cfg.num_queues */ struct gve_queue_page_list *qpls; /* array of num qpls */ struct gve_notify_block *ntfy_blocks; /* array of num_ntfy_blks */ - dma_addr_t ntfy_block_bus; + struct gve_irq_db *irq_db_indices; /* array of num_ntfy_blks */ + dma_addr_t irq_db_indices_bus; struct msix_entry *msix_vectors; /* array of num_ntfy_blks + 1 */ char mgmt_msix_name[IFNAMSIZ + 16]; u32 mgmt_msix_idx; @@ -178,7 +509,7 @@ struct gve_priv { u16 tx_desc_cnt; /* num desc per ring */ u16 rx_desc_cnt; /* num desc per ring */ u16 tx_pages_per_qpl; /* tx buffer length */ - u16 rx_pages_per_qpl; /* rx buffer length */ + u16 rx_data_slot_cnt; /* rx buffer length */ u64 max_registered_pages; u64 num_registered_pages; /* num pages registered with NIC */ u32 rx_copybreak; /* copy packets smaller than this */ @@ -202,24 +533,79 @@ struct gve_priv { dma_addr_t adminq_bus_addr; u32 adminq_mask; /* masks prod_cnt to adminq size */ u32 adminq_prod_cnt; /* free-running count of AQ cmds executed */ - + u32 adminq_cmd_fail; /* free-running count of AQ cmds failed */ + u32 adminq_timeouts; /* free-running count of AQ cmds timeouts */ + /* free-running count of per AQ cmd executed */ + u32 adminq_describe_device_cnt; + u32 adminq_cfg_device_resources_cnt; + u32 adminq_register_page_list_cnt; + u32 adminq_unregister_page_list_cnt; + u32 adminq_create_tx_queue_cnt; + u32 adminq_create_rx_queue_cnt; + u32 adminq_destroy_tx_queue_cnt; + u32 adminq_destroy_rx_queue_cnt; + u32 adminq_dcfg_device_resources_cnt; + u32 adminq_set_driver_parameter_cnt; + u32 adminq_report_stats_cnt; + u32 adminq_report_link_speed_cnt; + u32 adminq_get_ptype_map_cnt; + + /* Global stats */ + u32 interface_up_cnt; /* count of times interface turned up since last reset */ + u32 interface_down_cnt; /* count of times interface turned down since last reset */ + u32 reset_cnt; /* count of reset */ + u32 page_alloc_fail; /* count of page alloc fails */ + u32 dma_mapping_error; /* count of dma mapping errors */ + u32 stats_report_trigger_cnt; /* count of device-requested stats-reports since last reset */ + u32 suspend_cnt; /* count of times suspended */ + u32 resume_cnt; /* count of times resumed */ struct workqueue_struct *gve_wq; struct work_struct service_task; + struct work_struct stats_report_task; unsigned long service_task_flags; unsigned long state_flags; + + struct gve_stats_report *stats_report; + u64 stats_report_len; + dma_addr_t stats_report_bus; /* dma address for the stats report */ + unsigned long ethtool_flags; + + unsigned long stats_report_timer_period; + struct timer_list stats_report_timer; + + /* Gvnic device link speed from hypervisor. */ + u64 link_speed; + bool up_before_suspend; /* True if dev was up before suspend */ + + struct gve_options_dqo_rda options_dqo_rda; + struct gve_ptype_lut *ptype_lut_dqo; + + /* Must be a power of two. */ + int data_buffer_size_dqo; + + enum gve_queue_format queue_format; + + /* Interrupt coalescing settings */ + u32 tx_coalesce_usecs; + u32 rx_coalesce_usecs; }; -enum gve_service_task_flags { - GVE_PRIV_FLAGS_DO_RESET = BIT(1), - GVE_PRIV_FLAGS_RESET_IN_PROGRESS = BIT(2), - GVE_PRIV_FLAGS_PROBE_IN_PROGRESS = BIT(3), +enum gve_service_task_flags_bit { + GVE_PRIV_FLAGS_DO_RESET = 1, + GVE_PRIV_FLAGS_RESET_IN_PROGRESS = 2, + GVE_PRIV_FLAGS_PROBE_IN_PROGRESS = 3, + GVE_PRIV_FLAGS_DO_REPORT_STATS = 4, }; -enum gve_state_flags { - GVE_PRIV_FLAGS_ADMIN_QUEUE_OK = BIT(1), - GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK = BIT(2), - GVE_PRIV_FLAGS_DEVICE_RINGS_OK = BIT(3), - GVE_PRIV_FLAGS_NAPI_ENABLED = BIT(4), +enum gve_state_flags_bit { + GVE_PRIV_FLAGS_ADMIN_QUEUE_OK = 1, + GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK = 2, + GVE_PRIV_FLAGS_DEVICE_RINGS_OK = 3, + GVE_PRIV_FLAGS_NAPI_ENABLED = 4, +}; + +enum gve_ethtool_flags_bit { + GVE_PRIV_FLAGS_REPORT_STATS = 0, }; static inline bool gve_get_do_reset(struct gve_priv *priv) @@ -269,6 +655,22 @@ static inline void gve_clear_probe_in_progress(struct gve_priv *priv) clear_bit(GVE_PRIV_FLAGS_PROBE_IN_PROGRESS, &priv->service_task_flags); } +static inline bool gve_get_do_report_stats(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, + &priv->service_task_flags); +} + +static inline void gve_set_do_report_stats(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags); +} + +static inline void gve_clear_do_report_stats(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags); +} + static inline bool gve_get_admin_queue_ok(struct gve_priv *priv) { return test_bit(GVE_PRIV_FLAGS_ADMIN_QUEUE_OK, &priv->state_flags); @@ -329,12 +731,22 @@ static inline void gve_clear_napi_enabled(struct gve_priv *priv) clear_bit(GVE_PRIV_FLAGS_NAPI_ENABLED, &priv->state_flags); } +static inline bool gve_get_report_stats(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags); +} + +static inline void gve_clear_report_stats(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags); +} + /* Returns the address of the ntfy_blocks irq doorbell */ static inline __be32 __iomem *gve_irq_doorbell(struct gve_priv *priv, struct gve_notify_block *block) { - return &priv->db_bar2[be32_to_cpu(block->irq_db_index)]; + return &priv->db_bar2[be32_to_cpu(*block->irq_db_index)]; } /* Returns the index into ntfy_blocks of the given tx ring's block @@ -355,6 +767,9 @@ static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx) */ static inline u32 gve_num_tx_qpls(struct gve_priv *priv) { + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return 0; + return priv->tx_cfg.num_queues; } @@ -362,6 +777,9 @@ static inline u32 gve_num_tx_qpls(struct gve_priv *priv) */ static inline u32 gve_num_rx_qpls(struct gve_priv *priv) { + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return 0; + return priv->rx_cfg.num_queues; } @@ -391,7 +809,7 @@ struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv) gve_num_tx_qpls(priv)); /* we are out of rx qpls */ - if (id == priv->qpl_cfg.qpl_map_size) + if (id == gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv)) return NULL; set_bit(id, priv->qpl_cfg.qpl_id_map); @@ -416,40 +834,40 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv, return DMA_FROM_DEVICE; } -/* Returns true if the max mtu allows page recycling */ -static inline bool gve_can_recycle_pages(struct net_device *dev) +static inline bool gve_is_gqi(struct gve_priv *priv) { - /* We can't recycle the pages if we can't fit a packet into half a - * page. - */ - return dev->max_mtu <= PAGE_SIZE / 2; + return priv->queue_format == GVE_GQI_RDA_FORMAT || + priv->queue_format == GVE_GQI_QPL_FORMAT; } /* buffers */ -int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma, - enum dma_data_direction); +int gve_alloc_page(struct gve_priv *priv, struct device *dev, + struct page **page, dma_addr_t *dma, + enum dma_data_direction, gfp_t gfp_flags); void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, enum dma_data_direction); /* tx handling */ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev); bool gve_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings(struct gve_priv *priv); -void gve_tx_free_rings(struct gve_priv *priv); -__be32 gve_tx_load_event_counter(struct gve_priv *priv, - struct gve_tx_ring *tx); +void gve_tx_free_rings_gqi(struct gve_priv *priv); +u32 gve_tx_load_event_counter(struct gve_priv *priv, + struct gve_tx_ring *tx); +bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx); /* rx handling */ void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx); -bool gve_rx_poll(struct gve_notify_block *block, int budget); +int gve_rx_poll(struct gve_notify_block *block, int budget); +bool gve_rx_work_pending(struct gve_rx_ring *rx); int gve_rx_alloc_rings(struct gve_priv *priv); -void gve_rx_free_rings(struct gve_priv *priv); -bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, - netdev_features_t feat); +void gve_rx_free_rings_gqi(struct gve_priv *priv); /* Reset */ void gve_schedule_reset(struct gve_priv *priv); int gve_reset(struct gve_priv *priv, bool attempt_teardown); int gve_adjust_queues(struct gve_priv *priv, struct gve_queue_config new_rx_config, struct gve_queue_config new_tx_config); +/* report stats handling */ +void gve_handle_report_stats(struct gve_priv *priv); /* exported by ethtool.c */ extern const struct ethtool_ops gve_ethtool_ops; /* needed by ethtool */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index c3ba7baf0107..f7621ab672b9 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include <linux/etherdevice.h> @@ -14,6 +14,165 @@ #define GVE_ADMINQ_SLEEP_LEN 20 #define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 100 +#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \ +"Expected: length=%d, feature_mask=%x.\n" \ +"Actual: length=%d, feature_mask=%x.\n" + +#define GVE_DEVICE_OPTION_TOO_BIG_FMT "Length of %s option larger than expected. Possible older version of guest driver.\n" + +static +struct gve_device_option *gve_get_next_option(struct gve_device_descriptor *descriptor, + struct gve_device_option *option) +{ + void *option_end, *descriptor_end; + + option_end = (void *)(option + 1) + be16_to_cpu(option->option_length); + descriptor_end = (void *)descriptor + be16_to_cpu(descriptor->total_length); + + return option_end > descriptor_end ? NULL : (struct gve_device_option *)option_end; +} + +static +void gve_parse_device_option(struct gve_priv *priv, + struct gve_device_descriptor *device_descriptor, + struct gve_device_option *option, + struct gve_device_option_gqi_rda **dev_op_gqi_rda, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ + u32 req_feat_mask = be32_to_cpu(option->required_features_mask); + u16 option_length = be16_to_cpu(option->option_length); + u16 option_id = be16_to_cpu(option->option_id); + + /* If the length or feature mask doesn't match, continue without + * enabling the feature. + */ + switch (option_id) { + case GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING: + if (option_length != GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "Raw Addressing", + GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING, + option_length, req_feat_mask); + break; + } + + dev_info(&priv->pdev->dev, + "Gqi raw addressing device option enabled.\n"); + priv->queue_format = GVE_GQI_RDA_FORMAT; + break; + case GVE_DEV_OPT_ID_GQI_RDA: + if (option_length < sizeof(**dev_op_gqi_rda) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "GQI RDA", (int)sizeof(**dev_op_gqi_rda), + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_gqi_rda)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "GQI RDA"); + } + *dev_op_gqi_rda = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_GQI_QPL: + if (option_length < sizeof(**dev_op_gqi_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "GQI QPL", (int)sizeof(**dev_op_gqi_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_gqi_qpl)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "GQI QPL"); + } + *dev_op_gqi_qpl = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_DQO_RDA: + if (option_length < sizeof(**dev_op_dqo_rda) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "DQO RDA", (int)sizeof(**dev_op_dqo_rda), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_rda)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "DQO RDA"); + } + *dev_op_dqo_rda = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_JUMBO_FRAMES: + if (option_length < sizeof(**dev_op_jumbo_frames) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "Jumbo Frames", + (int)sizeof(**dev_op_jumbo_frames), + GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_jumbo_frames)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, + "Jumbo Frames"); + } + *dev_op_jumbo_frames = (void *)(option + 1); + break; + default: + /* If we don't recognize the option just continue + * without doing anything. + */ + dev_dbg(&priv->pdev->dev, "Unrecognized device option 0x%hx not enabled.\n", + option_id); + } +} + +/* Process all device options for a given describe device call. */ +static int +gve_process_device_options(struct gve_priv *priv, + struct gve_device_descriptor *descriptor, + struct gve_device_option_gqi_rda **dev_op_gqi_rda, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ + const int num_options = be16_to_cpu(descriptor->num_device_options); + struct gve_device_option *dev_opt; + int i; + + /* The options struct directly follows the device descriptor. */ + dev_opt = (void *)(descriptor + 1); + for (i = 0; i < num_options; i++) { + struct gve_device_option *next_opt; + + next_opt = gve_get_next_option(descriptor, dev_opt); + if (!next_opt) { + dev_err(&priv->dev->dev, + "options exceed device_descriptor's total length.\n"); + return -EINVAL; + } + + gve_parse_device_option(priv, descriptor, dev_opt, + dev_op_gqi_rda, dev_op_gqi_qpl, + dev_op_dqo_rda, dev_op_jumbo_frames); + dev_opt = next_opt; + } + + return 0; +} + int gve_adminq_alloc(struct device *dev, struct gve_priv *priv) { priv->adminq = dma_alloc_coherent(dev, PAGE_SIZE, @@ -23,6 +182,21 @@ int gve_adminq_alloc(struct device *dev, struct gve_priv *priv) priv->adminq_mask = (PAGE_SIZE / sizeof(union gve_adminq_command)) - 1; priv->adminq_prod_cnt = 0; + priv->adminq_cmd_fail = 0; + priv->adminq_timeouts = 0; + priv->adminq_describe_device_cnt = 0; + priv->adminq_cfg_device_resources_cnt = 0; + priv->adminq_register_page_list_cnt = 0; + priv->adminq_unregister_page_list_cnt = 0; + priv->adminq_create_tx_queue_cnt = 0; + priv->adminq_create_rx_queue_cnt = 0; + priv->adminq_destroy_tx_queue_cnt = 0; + priv->adminq_destroy_rx_queue_cnt = 0; + priv->adminq_dcfg_device_resources_cnt = 0; + priv->adminq_set_driver_parameter_cnt = 0; + priv->adminq_report_stats_cnt = 0; + priv->adminq_report_link_speed_cnt = 0; + priv->adminq_get_ptype_map_cnt = 0; /* Setup Admin queue with the device */ iowrite32be(priv->adminq_bus_addr / PAGE_SIZE, @@ -81,17 +255,18 @@ static bool gve_adminq_wait_for_cmd(struct gve_priv *priv, u32 prod_cnt) return false; } -static int gve_adminq_parse_err(struct device *dev, u32 status) +static int gve_adminq_parse_err(struct gve_priv *priv, u32 status) { if (status != GVE_ADMINQ_COMMAND_PASSED && - status != GVE_ADMINQ_COMMAND_UNSET) - dev_err(dev, "AQ command failed with status %d\n", status); - + status != GVE_ADMINQ_COMMAND_UNSET) { + dev_err(&priv->pdev->dev, "AQ command failed with status %d\n", status); + priv->adminq_cmd_fail++; + } switch (status) { case GVE_ADMINQ_COMMAND_PASSED: return 0; case GVE_ADMINQ_COMMAND_UNSET: - dev_err(dev, "parse_aq_err: err and status both unset, this should not be possible.\n"); + dev_err(&priv->pdev->dev, "parse_aq_err: err and status both unset, this should not be possible.\n"); return -EINVAL; case GVE_ADMINQ_COMMAND_ERROR_ABORTED: case GVE_ADMINQ_COMMAND_ERROR_CANCELLED: @@ -116,36 +291,151 @@ static int gve_adminq_parse_err(struct device *dev, u32 status) case GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED: return -ENOTSUPP; default: - dev_err(dev, "parse_aq_err: unknown status code %d\n", status); + dev_err(&priv->pdev->dev, "parse_aq_err: unknown status code %d\n", status); return -EINVAL; } } +/* Flushes all AQ commands currently queued and waits for them to complete. + * If there are failures, it will return the first error. + */ +static int gve_adminq_kick_and_wait(struct gve_priv *priv) +{ + int tail, head; + int i; + + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + head = priv->adminq_prod_cnt; + + gve_adminq_kick_cmd(priv, head); + if (!gve_adminq_wait_for_cmd(priv, head)) { + dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n"); + priv->adminq_timeouts++; + return -ENOTRECOVERABLE; + } + + for (i = tail; i < head; i++) { + union gve_adminq_command *cmd; + u32 status, err; + + cmd = &priv->adminq[i & priv->adminq_mask]; + status = be32_to_cpu(READ_ONCE(cmd->status)); + err = gve_adminq_parse_err(priv, status); + if (err) + // Return the first error if we failed. + return err; + } + + return 0; +} + /* This function is not threadsafe - the caller is responsible for any * necessary locks. */ -int gve_adminq_execute_cmd(struct gve_priv *priv, - union gve_adminq_command *cmd_orig) +static int gve_adminq_issue_cmd(struct gve_priv *priv, + union gve_adminq_command *cmd_orig) { union gve_adminq_command *cmd; - u32 status = 0; - u32 prod_cnt; + u32 opcode; + u32 tail; + + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + + // Check if next command will overflow the buffer. + if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == + (tail & priv->adminq_mask)) { + int err; + + // Flush existing commands to make room. + err = gve_adminq_kick_and_wait(priv); + if (err) + return err; + + // Retry. + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == + (tail & priv->adminq_mask)) { + // This should never happen. We just flushed the + // command queue so there should be enough space. + return -ENOMEM; + } + } cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask]; priv->adminq_prod_cnt++; - prod_cnt = priv->adminq_prod_cnt; memcpy(cmd, cmd_orig, sizeof(*cmd_orig)); - - gve_adminq_kick_cmd(priv, prod_cnt); - if (!gve_adminq_wait_for_cmd(priv, prod_cnt)) { - dev_err(&priv->pdev->dev, "AQ command timed out, need to reset AQ\n"); - return -ENOTRECOVERABLE; + opcode = be32_to_cpu(READ_ONCE(cmd->opcode)); + + switch (opcode) { + case GVE_ADMINQ_DESCRIBE_DEVICE: + priv->adminq_describe_device_cnt++; + break; + case GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES: + priv->adminq_cfg_device_resources_cnt++; + break; + case GVE_ADMINQ_REGISTER_PAGE_LIST: + priv->adminq_register_page_list_cnt++; + break; + case GVE_ADMINQ_UNREGISTER_PAGE_LIST: + priv->adminq_unregister_page_list_cnt++; + break; + case GVE_ADMINQ_CREATE_TX_QUEUE: + priv->adminq_create_tx_queue_cnt++; + break; + case GVE_ADMINQ_CREATE_RX_QUEUE: + priv->adminq_create_rx_queue_cnt++; + break; + case GVE_ADMINQ_DESTROY_TX_QUEUE: + priv->adminq_destroy_tx_queue_cnt++; + break; + case GVE_ADMINQ_DESTROY_RX_QUEUE: + priv->adminq_destroy_rx_queue_cnt++; + break; + case GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES: + priv->adminq_dcfg_device_resources_cnt++; + break; + case GVE_ADMINQ_SET_DRIVER_PARAMETER: + priv->adminq_set_driver_parameter_cnt++; + break; + case GVE_ADMINQ_REPORT_STATS: + priv->adminq_report_stats_cnt++; + break; + case GVE_ADMINQ_REPORT_LINK_SPEED: + priv->adminq_report_link_speed_cnt++; + break; + case GVE_ADMINQ_GET_PTYPE_MAP: + priv->adminq_get_ptype_map_cnt++; + break; + default: + dev_err(&priv->pdev->dev, "unknown AQ command opcode %d\n", opcode); } - memcpy(cmd_orig, cmd, sizeof(*cmd)); - status = be32_to_cpu(READ_ONCE(cmd->status)); - return gve_adminq_parse_err(&priv->pdev->dev, status); + return 0; +} + +/* This function is not threadsafe - the caller is responsible for any + * necessary locks. + * The caller is also responsible for making sure there are no commands + * waiting to be executed. + */ +static int gve_adminq_execute_cmd(struct gve_priv *priv, + union gve_adminq_command *cmd_orig) +{ + u32 tail, head; + int err; + + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + head = priv->adminq_prod_cnt; + if (tail != head) + // This is not a valid path + return -EINVAL; + + err = gve_adminq_issue_cmd(priv, cmd_orig); + if (err) + return err; + + return gve_adminq_kick_and_wait(priv); } /* The device specifies that the management vector can either be the first irq @@ -172,9 +462,10 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv, .num_counters = cpu_to_be32(num_counters), .irq_db_addr = cpu_to_be64(db_array_bus_addr), .num_irq_dbs = cpu_to_be32(num_ntfy_blks), - .irq_db_stride = cpu_to_be32(sizeof(priv->ntfy_blocks[0])), + .irq_db_stride = cpu_to_be32(sizeof(*priv->irq_db_indices)), .ntfy_blk_msix_base_idx = cpu_to_be32(GVE_NTFY_BLK_BASE_MSIX_IDX), + .queue_format = priv->queue_format, }; return gve_adminq_execute_cmd(priv, &cmd); @@ -190,7 +481,7 @@ int gve_adminq_deconfigure_device_resources(struct gve_priv *priv) return gve_adminq_execute_cmd(priv, &cmd); } -int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) +static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) { struct gve_tx_ring *tx = &priv->tx[queue_index]; union gve_adminq_command cmd; @@ -199,17 +490,44 @@ int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE); cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) { .queue_id = cpu_to_be32(queue_index), - .reserved = 0, - .queue_resources_addr = cpu_to_be64(tx->q_resources_bus), + .queue_resources_addr = + cpu_to_be64(tx->q_resources_bus), .tx_ring_addr = cpu_to_be64(tx->bus), - .queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id), .ntfy_id = cpu_to_be32(tx->ntfy_id), }; - return gve_adminq_execute_cmd(priv, &cmd); + if (gve_is_gqi(priv)) { + u32 qpl_id = priv->queue_format == GVE_GQI_RDA_FORMAT ? + GVE_RAW_ADDRESSING_QPL_ID : tx->tx_fifo.qpl->id; + + cmd.create_tx_queue.queue_page_list_id = cpu_to_be32(qpl_id); + } else { + cmd.create_tx_queue.tx_ring_size = + cpu_to_be16(priv->tx_desc_cnt); + cmd.create_tx_queue.tx_comp_ring_addr = + cpu_to_be64(tx->complq_bus_dqo); + cmd.create_tx_queue.tx_comp_ring_size = + cpu_to_be16(priv->options_dqo_rda.tx_comp_ring_entries); + } + + return gve_adminq_issue_cmd(priv, &cmd); } -int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) +int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_tx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) { struct gve_rx_ring *rx = &priv->rx[queue_index]; union gve_adminq_command cmd; @@ -218,21 +536,57 @@ int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE); cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { .queue_id = cpu_to_be32(queue_index), - .index = cpu_to_be32(queue_index), - .reserved = 0, .ntfy_id = cpu_to_be32(rx->ntfy_id), .queue_resources_addr = cpu_to_be64(rx->q_resources_bus), - .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus), - .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus), - .queue_page_list_id = cpu_to_be32(rx->data.qpl->id), }; - return gve_adminq_execute_cmd(priv, &cmd); + if (gve_is_gqi(priv)) { + u32 qpl_id = priv->queue_format == GVE_GQI_RDA_FORMAT ? + GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id; + + cmd.create_rx_queue.rx_desc_ring_addr = + cpu_to_be64(rx->desc.bus), + cmd.create_rx_queue.rx_data_ring_addr = + cpu_to_be64(rx->data.data_bus), + cmd.create_rx_queue.index = cpu_to_be32(queue_index); + cmd.create_rx_queue.queue_page_list_id = cpu_to_be32(qpl_id); + cmd.create_rx_queue.packet_buffer_size = cpu_to_be16(rx->packet_buffer_size); + } else { + cmd.create_rx_queue.rx_ring_size = + cpu_to_be16(priv->rx_desc_cnt); + cmd.create_rx_queue.rx_desc_ring_addr = + cpu_to_be64(rx->dqo.complq.bus); + cmd.create_rx_queue.rx_data_ring_addr = + cpu_to_be64(rx->dqo.bufq.bus); + cmd.create_rx_queue.packet_buffer_size = + cpu_to_be16(priv->data_buffer_size_dqo); + cmd.create_rx_queue.rx_buff_ring_size = + cpu_to_be16(priv->options_dqo_rda.rx_buff_ring_entries); + cmd.create_rx_queue.enable_rsc = + !!(priv->dev->features & NETIF_F_LRO); + } + + return gve_adminq_issue_cmd(priv, &cmd); } -int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) +int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_rx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) { union gve_adminq_command cmd; + int err; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_TX_QUEUE); @@ -240,12 +594,31 @@ int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) .queue_id = cpu_to_be32(queue_index), }; - return gve_adminq_execute_cmd(priv, &cmd); + err = gve_adminq_issue_cmd(priv, &cmd); + if (err) + return err; + + return 0; +} + +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_tx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); } -int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index) +static int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index) { union gve_adminq_command cmd; + int err; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_RX_QUEUE); @@ -253,12 +626,86 @@ int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index) .queue_id = cpu_to_be32(queue_index), }; - return gve_adminq_execute_cmd(priv, &cmd); + err = gve_adminq_issue_cmd(priv, &cmd); + if (err) + return err; + + return 0; +} + +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_rx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +static int gve_set_desc_cnt(struct gve_priv *priv, + struct gve_device_descriptor *descriptor) +{ + priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); + if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { + dev_err(&priv->pdev->dev, "Tx desc count %d too low\n", + priv->tx_desc_cnt); + return -EINVAL; + } + priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); + if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) + < PAGE_SIZE) { + dev_err(&priv->pdev->dev, "Rx desc count %d too low\n", + priv->rx_desc_cnt); + return -EINVAL; + } + return 0; +} + +static int +gve_set_desc_cnt_dqo(struct gve_priv *priv, + const struct gve_device_descriptor *descriptor, + const struct gve_device_option_dqo_rda *dev_op_dqo_rda) +{ + priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); + priv->options_dqo_rda.tx_comp_ring_entries = + be16_to_cpu(dev_op_dqo_rda->tx_comp_ring_entries); + priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); + priv->options_dqo_rda.rx_buff_ring_entries = + be16_to_cpu(dev_op_dqo_rda->rx_buff_ring_entries); + + return 0; +} + +static void gve_enable_supported_features(struct gve_priv *priv, + u32 supported_features_mask, + const struct gve_device_option_jumbo_frames + *dev_op_jumbo_frames) +{ + /* Before control reaches this point, the page-size-capped max MTU from + * the gve_device_descriptor field has already been stored in + * priv->dev->max_mtu. We overwrite it with the true max MTU below. + */ + if (dev_op_jumbo_frames && + (supported_features_mask & GVE_SUP_JUMBO_FRAMES_MASK)) { + dev_info(&priv->pdev->dev, + "JUMBO FRAMES device option enabled.\n"); + priv->dev->max_mtu = be16_to_cpu(dev_op_jumbo_frames->max_mtu); + } } int gve_adminq_describe_device(struct gve_priv *priv) { + struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; + struct gve_device_option_gqi_rda *dev_op_gqi_rda = NULL; + struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; + struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL; struct gve_device_descriptor *descriptor; + u32 supported_features_mask = 0; union gve_adminq_command cmd; dma_addr_t descriptor_bus; int err = 0; @@ -281,48 +728,77 @@ int gve_adminq_describe_device(struct gve_priv *priv) if (err) goto free_device_descriptor; - priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); - if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { - netif_err(priv, drv, priv->dev, "Tx desc count %d too low\n", - priv->tx_desc_cnt); - err = -EINVAL; + err = gve_process_device_options(priv, descriptor, &dev_op_gqi_rda, + &dev_op_gqi_qpl, &dev_op_dqo_rda, + &dev_op_jumbo_frames); + if (err) goto free_device_descriptor; + + /* If the GQI_RAW_ADDRESSING option is not enabled and the queue format + * is not set to GqiRda, choose the queue format in a priority order: + * DqoRda, GqiRda, GqiQpl. Use GqiQpl as default. + */ + if (dev_op_dqo_rda) { + priv->queue_format = GVE_DQO_RDA_FORMAT; + dev_info(&priv->pdev->dev, + "Driver is running with DQO RDA queue format.\n"); + supported_features_mask = + be32_to_cpu(dev_op_dqo_rda->supported_features_mask); + } else if (dev_op_gqi_rda) { + priv->queue_format = GVE_GQI_RDA_FORMAT; + dev_info(&priv->pdev->dev, + "Driver is running with GQI RDA queue format.\n"); + supported_features_mask = + be32_to_cpu(dev_op_gqi_rda->supported_features_mask); + } else if (priv->queue_format == GVE_GQI_RDA_FORMAT) { + dev_info(&priv->pdev->dev, + "Driver is running with GQI RDA queue format.\n"); + } else { + priv->queue_format = GVE_GQI_QPL_FORMAT; + if (dev_op_gqi_qpl) + supported_features_mask = + be32_to_cpu(dev_op_gqi_qpl->supported_features_mask); + dev_info(&priv->pdev->dev, + "Driver is running with GQI QPL queue format.\n"); } - priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); - if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) - < PAGE_SIZE || - priv->rx_desc_cnt * sizeof(priv->rx->data.data_ring[0]) - < PAGE_SIZE) { - netif_err(priv, drv, priv->dev, "Rx desc count %d too low\n", - priv->rx_desc_cnt); - err = -EINVAL; - goto free_device_descriptor; + if (gve_is_gqi(priv)) { + err = gve_set_desc_cnt(priv, descriptor); + } else { + /* DQO supports LRO. */ + priv->dev->hw_features |= NETIF_F_LRO; + err = gve_set_desc_cnt_dqo(priv, descriptor, dev_op_dqo_rda); } + if (err) + goto free_device_descriptor; + priv->max_registered_pages = be64_to_cpu(descriptor->max_registered_pages); mtu = be16_to_cpu(descriptor->mtu); if (mtu < ETH_MIN_MTU) { - netif_err(priv, drv, priv->dev, "MTU %d below minimum MTU\n", - mtu); + dev_err(&priv->pdev->dev, "MTU %d below minimum MTU\n", mtu); err = -EINVAL; goto free_device_descriptor; } priv->dev->max_mtu = mtu; priv->num_event_counters = be16_to_cpu(descriptor->counters); - ether_addr_copy(priv->dev->dev_addr, descriptor->mac); + eth_hw_addr_set(priv->dev, descriptor->mac); mac = descriptor->mac; - netif_info(priv, drv, priv->dev, "MAC addr: %pM\n", mac); + dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac); priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl); - priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl); - if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) { - netif_err(priv, drv, priv->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", - priv->rx_pages_per_qpl); - priv->rx_desc_cnt = priv->rx_pages_per_qpl; + priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl); + + if (gve_is_gqi(priv) && priv->rx_data_slot_cnt < priv->rx_desc_cnt) { + dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", + priv->rx_data_slot_cnt); + priv->rx_desc_cnt = priv->rx_data_slot_cnt; } priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues); + gve_enable_supported_features(priv, supported_features_mask, + dev_op_jumbo_frames); + free_device_descriptor: - dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor, + dma_free_coherent(&priv->pdev->dev, PAGE_SIZE, descriptor, descriptor_bus); return err; } @@ -385,3 +861,84 @@ int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu) return gve_adminq_execute_cmd(priv, &cmd); } + +int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, + dma_addr_t stats_report_addr, u64 interval) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_STATS); + cmd.report_stats = (struct gve_adminq_report_stats) { + .stats_report_len = cpu_to_be64(stats_report_len), + .stats_report_addr = cpu_to_be64(stats_report_addr), + .interval = cpu_to_be64(interval), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_report_link_speed(struct gve_priv *priv) +{ + union gve_adminq_command gvnic_cmd; + dma_addr_t link_speed_region_bus; + __be64 *link_speed_region; + int err; + + link_speed_region = + dma_alloc_coherent(&priv->pdev->dev, sizeof(*link_speed_region), + &link_speed_region_bus, GFP_KERNEL); + + if (!link_speed_region) + return -ENOMEM; + + memset(&gvnic_cmd, 0, sizeof(gvnic_cmd)); + gvnic_cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_LINK_SPEED); + gvnic_cmd.report_link_speed.link_speed_address = + cpu_to_be64(link_speed_region_bus); + + err = gve_adminq_execute_cmd(priv, &gvnic_cmd); + + priv->link_speed = be64_to_cpu(*link_speed_region); + dma_free_coherent(&priv->pdev->dev, sizeof(*link_speed_region), link_speed_region, + link_speed_region_bus); + return err; +} + +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut) +{ + struct gve_ptype_map *ptype_map; + union gve_adminq_command cmd; + dma_addr_t ptype_map_bus; + int err = 0; + int i; + + memset(&cmd, 0, sizeof(cmd)); + ptype_map = dma_alloc_coherent(&priv->pdev->dev, sizeof(*ptype_map), + &ptype_map_bus, GFP_KERNEL); + if (!ptype_map) + return -ENOMEM; + + cmd.opcode = cpu_to_be32(GVE_ADMINQ_GET_PTYPE_MAP); + cmd.get_ptype_map = (struct gve_adminq_get_ptype_map) { + .ptype_map_len = cpu_to_be64(sizeof(*ptype_map)), + .ptype_map_addr = cpu_to_be64(ptype_map_bus), + }; + + err = gve_adminq_execute_cmd(priv, &cmd); + if (err) + goto err; + + /* Populate ptype_lut. */ + for (i = 0; i < GVE_NUM_PTYPES; i++) { + ptype_lut->ptypes[i].l3_type = + ptype_map->ptypes[i].l3_type; + ptype_lut->ptypes[i].l4_type = + ptype_map->ptypes[i].l4_type; + } +err: + dma_free_coherent(&priv->pdev->dev, sizeof(*ptype_map), ptype_map, + ptype_map_bus); + return err; +} diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index 4dfa06edc0f8..83c0b40cd2d9 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0 OR MIT) * Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #ifndef _GVE_ADMINQ_H @@ -21,6 +21,9 @@ enum gve_adminq_opcodes { GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8, GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, + GVE_ADMINQ_REPORT_STATS = 0xC, + GVE_ADMINQ_REPORT_LINK_SPEED = 0xD, + GVE_ADMINQ_GET_PTYPE_MAP = 0xE, }; /* Admin queue status codes */ @@ -77,12 +80,71 @@ struct gve_device_descriptor { static_assert(sizeof(struct gve_device_descriptor) == 40); -struct device_option { - __be32 option_id; - __be32 option_length; +struct gve_device_option { + __be16 option_id; + __be16 option_length; + __be32 required_features_mask; }; -static_assert(sizeof(struct device_option) == 8); +static_assert(sizeof(struct gve_device_option) == 8); + +struct gve_device_option_gqi_rda { + __be32 supported_features_mask; +}; + +static_assert(sizeof(struct gve_device_option_gqi_rda) == 4); + +struct gve_device_option_gqi_qpl { + __be32 supported_features_mask; +}; + +static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4); + +struct gve_device_option_dqo_rda { + __be32 supported_features_mask; + __be16 tx_comp_ring_entries; + __be16 rx_buff_ring_entries; +}; + +static_assert(sizeof(struct gve_device_option_dqo_rda) == 8); + +struct gve_device_option_jumbo_frames { + __be32 supported_features_mask; + __be16 max_mtu; + u8 padding[2]; +}; + +static_assert(sizeof(struct gve_device_option_jumbo_frames) == 8); + +/* Terminology: + * + * RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA + * mapped and read/updated by the device. + * + * QPL - Queue Page Lists - Driver uses bounce buffers which are DMA mapped with + * the device for read/write and data is copied from/to SKBs. + */ +enum gve_dev_opt_id { + GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING = 0x1, + GVE_DEV_OPT_ID_GQI_RDA = 0x2, + GVE_DEV_OPT_ID_GQI_QPL = 0x3, + GVE_DEV_OPT_ID_DQO_RDA = 0x4, + GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, +}; + +enum gve_dev_opt_req_feat_mask { + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, +}; + +enum gve_sup_feature_mask { + GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2, +}; + +#define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0 struct gve_adminq_configure_device_resources { __be64 counter_array; @@ -91,9 +153,11 @@ struct gve_adminq_configure_device_resources { __be32 num_irq_dbs; __be32 irq_db_stride; __be32 ntfy_blk_msix_base_idx; + u8 queue_format; + u8 padding[7]; }; -static_assert(sizeof(struct gve_adminq_configure_device_resources) == 32); +static_assert(sizeof(struct gve_adminq_configure_device_resources) == 40); struct gve_adminq_register_page_list { __be32 page_list_id; @@ -109,6 +173,8 @@ struct gve_adminq_unregister_page_list { static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4); +#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF + struct gve_adminq_create_tx_queue { __be32 queue_id; __be32 reserved; @@ -116,9 +182,13 @@ struct gve_adminq_create_tx_queue { __be64 tx_ring_addr; __be32 queue_page_list_id; __be32 ntfy_id; + __be64 tx_comp_ring_addr; + __be16 tx_ring_size; + __be16 tx_comp_ring_size; + u8 padding[4]; }; -static_assert(sizeof(struct gve_adminq_create_tx_queue) == 32); +static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48); struct gve_adminq_create_rx_queue { __be32 queue_id; @@ -129,10 +199,14 @@ struct gve_adminq_create_rx_queue { __be64 rx_desc_ring_addr; __be64 rx_data_ring_addr; __be32 queue_page_list_id; - u8 padding[4]; + __be16 rx_ring_size; + __be16 packet_buffer_size; + __be16 rx_buff_ring_size; + u8 enable_rsc; + u8 padding[5]; }; -static_assert(sizeof(struct gve_adminq_create_rx_queue) == 48); +static_assert(sizeof(struct gve_adminq_create_rx_queue) == 56); /* Queue resources that are shared with the device */ struct gve_queue_resources { @@ -172,6 +246,87 @@ struct gve_adminq_set_driver_parameter { static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16); +struct gve_adminq_report_stats { + __be64 stats_report_len; + __be64 stats_report_addr; + __be64 interval; +}; + +static_assert(sizeof(struct gve_adminq_report_stats) == 24); + +struct gve_adminq_report_link_speed { + __be64 link_speed_address; +}; + +static_assert(sizeof(struct gve_adminq_report_link_speed) == 8); + +struct stats { + __be32 stat_name; + __be32 queue_id; + __be64 value; +}; + +static_assert(sizeof(struct stats) == 16); + +struct gve_stats_report { + __be64 written_count; + struct stats stats[]; +}; + +static_assert(sizeof(struct gve_stats_report) == 8); + +enum gve_stat_names { + // stats from gve + TX_WAKE_CNT = 1, + TX_STOP_CNT = 2, + TX_FRAMES_SENT = 3, + TX_BYTES_SENT = 4, + TX_LAST_COMPLETION_PROCESSED = 5, + RX_NEXT_EXPECTED_SEQUENCE = 6, + RX_BUFFERS_POSTED = 7, + TX_TIMEOUT_CNT = 8, + // stats from NIC + RX_QUEUE_DROP_CNT = 65, + RX_NO_BUFFERS_POSTED = 66, + RX_DROPS_PACKET_OVER_MRU = 67, + RX_DROPS_INVALID_CHECKSUM = 68, +}; + +enum gve_l3_type { + /* Must be zero so zero initialized LUT is unknown. */ + GVE_L3_TYPE_UNKNOWN = 0, + GVE_L3_TYPE_OTHER, + GVE_L3_TYPE_IPV4, + GVE_L3_TYPE_IPV6, +}; + +enum gve_l4_type { + /* Must be zero so zero initialized LUT is unknown. */ + GVE_L4_TYPE_UNKNOWN = 0, + GVE_L4_TYPE_OTHER, + GVE_L4_TYPE_TCP, + GVE_L4_TYPE_UDP, + GVE_L4_TYPE_ICMP, + GVE_L4_TYPE_SCTP, +}; + +/* These are control path types for PTYPE which are the same as the data path + * types. + */ +struct gve_ptype_entry { + u8 l3_type; + u8 l4_type; +}; + +struct gve_ptype_map { + struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */ +}; + +struct gve_adminq_get_ptype_map { + __be64 ptype_map_len; + __be64 ptype_map_addr; +}; + union gve_adminq_command { struct { __be32 opcode; @@ -187,6 +342,9 @@ union gve_adminq_command { struct gve_adminq_register_page_list reg_page_list; struct gve_adminq_unregister_page_list unreg_page_list; struct gve_adminq_set_driver_parameter set_driver_param; + struct gve_adminq_report_stats report_stats; + struct gve_adminq_report_link_speed report_link_speed; + struct gve_adminq_get_ptype_map get_ptype_map; }; }; u8 reserved[64]; @@ -197,8 +355,6 @@ static_assert(sizeof(union gve_adminq_command) == 64); int gve_adminq_alloc(struct device *dev, struct gve_priv *priv); void gve_adminq_free(struct device *dev, struct gve_priv *priv); void gve_adminq_release(struct gve_priv *priv); -int gve_adminq_execute_cmd(struct gve_priv *priv, - union gve_adminq_command *cmd_orig); int gve_adminq_describe_device(struct gve_priv *priv); int gve_adminq_configure_device_resources(struct gve_priv *priv, dma_addr_t counter_array_bus_addr, @@ -206,12 +362,20 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv, dma_addr_t db_array_bus_addr, u32 num_ntfy_blks); int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); -int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_id); -int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_id); -int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_id); -int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_id); +int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues); +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 queue_id); +int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues); +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 queue_id); int gve_adminq_register_page_list(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id); int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); +int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, + dma_addr_t stats_report_addr, u64 interval); +int gve_adminq_report_link_speed(struct gve_priv *priv); + +struct gve_ptype_lut; +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut); + #endif /* _GVE_ADMINQ_H */ diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h index 54779871d52e..f4ae9e19b844 100644 --- a/drivers/net/ethernet/google/gve/gve_desc.h +++ b/drivers/net/ethernet/google/gve/gve_desc.h @@ -16,9 +16,11 @@ * Base addresses encoded in seg_addr are not assumed to be physical * addresses. The ring format assumes these come from some linear address * space. This could be physical memory, kernel virtual memory, user virtual - * memory. gVNIC uses lists of registered pages. Each queue is assumed - * to be associated with a single such linear address space to ensure a - * consistent meaning for seg_addrs posted to its rings. + * memory. + * If raw dma addressing is not supported then gVNIC uses lists of registered + * pages. Each queue is assumed to be associated with a single such linear + * address space to ensure a consistent meaning for seg_addrs posted to its + * rings. */ struct gve_tx_pkt_desc { @@ -31,6 +33,14 @@ struct gve_tx_pkt_desc { __be64 seg_addr; /* Base address (see note) of this segment */ } __packed; +struct gve_tx_mtd_desc { + u8 type_flags; /* type is lower 4 bits, subtype upper */ + u8 path_state; /* state is lower 4 bits, hash type upper */ + __be16 reserved0; + __be32 path_hash; + __be64 reserved1; +} __packed; + struct gve_tx_seg_desc { u8 type_flags; /* type is lower 4 bits, flags upper */ u8 l3_offset; /* TSO: 2 byte units to start of IPH */ @@ -44,6 +54,7 @@ struct gve_tx_seg_desc { #define GVE_TXD_STD (0x0 << 4) /* Std with Host Address */ #define GVE_TXD_TSO (0x1 << 4) /* TSO with Host Address */ #define GVE_TXD_SEG (0x2 << 4) /* Seg with Host Address */ +#define GVE_TXD_MTD (0x3 << 4) /* Metadata */ /* GVE Transmit Descriptor Flags for Std Pkts */ #define GVE_TXF_L4CSUM BIT(0) /* Need csum offload */ @@ -52,6 +63,17 @@ struct gve_tx_seg_desc { /* GVE Transmit Descriptor Flags for TSO Segs */ #define GVE_TXSF_IPV6 BIT(1) /* IPv6 TSO */ +/* GVE Transmit Descriptor Options for MTD Segs */ +#define GVE_MTD_SUBTYPE_PATH 0 + +#define GVE_MTD_PATH_STATE_DEFAULT 0 +#define GVE_MTD_PATH_STATE_TIMEOUT 1 +#define GVE_MTD_PATH_STATE_CONGESTION 2 +#define GVE_MTD_PATH_STATE_RETRANSMIT 3 + +#define GVE_MTD_PATH_HASH_NONE (0x0 << 4) +#define GVE_MTD_PATH_HASH_L4 (0x1 << 4) + /* GVE Receive Packet Descriptor */ /* The start of an ethernet packet comes 2 bytes into the rx buffer. * gVNIC adds this padding so that both the DMA and the L3/4 protocol header @@ -72,12 +94,15 @@ struct gve_rx_desc { } __packed; static_assert(sizeof(struct gve_rx_desc) == 64); -/* As with the Tx ring format, the qpl_offset entries below are offsets into an - * ordered list of registered pages. +/* If the device supports raw dma addressing then the addr in data slot is + * the dma address of the buffer. + * If the device only supports registered segments then the addr is a byte + * offset into the registered segment (an ordered list of pages) where the + * buffer is. */ -struct gve_rx_data_slot { - /* byte offset into the rx registered segment of this slot */ +union gve_rx_data_slot { __be64 qpl_offset; + __be64 addr; }; /* GVE Recive Packet Descriptor Seq No */ @@ -85,12 +110,13 @@ struct gve_rx_data_slot { /* GVE Recive Packet Descriptor Flags */ #define GVE_RXFLG(x) cpu_to_be16(1 << (3 + (x))) -#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */ -#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */ -#define GVE_RXF_IPV6 GVE_RXFLG(5) /* IPv6 */ -#define GVE_RXF_TCP GVE_RXFLG(6) /* TCP Packet */ -#define GVE_RXF_UDP GVE_RXFLG(7) /* UDP Packet */ -#define GVE_RXF_ERR GVE_RXFLG(8) /* Packet Error Detected */ +#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */ +#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */ +#define GVE_RXF_IPV6 GVE_RXFLG(5) /* IPv6 */ +#define GVE_RXF_TCP GVE_RXFLG(6) /* TCP Packet */ +#define GVE_RXF_UDP GVE_RXFLG(7) /* UDP Packet */ +#define GVE_RXF_ERR GVE_RXFLG(8) /* Packet Error Detected */ +#define GVE_RXF_PKT_CONT GVE_RXFLG(10) /* Multi Fragment RX packet */ /* GVE IRQ */ #define GVE_IRQ_ACK BIT(31) diff --git a/drivers/net/ethernet/google/gve/gve_desc_dqo.h b/drivers/net/ethernet/google/gve/gve_desc_dqo.h new file mode 100644 index 000000000000..e8fe9adef7f2 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_desc_dqo.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +/* GVE DQO Descriptor formats */ + +#ifndef _GVE_DESC_DQO_H_ +#define _GVE_DESC_DQO_H_ + +#include <linux/build_bug.h> + +#define GVE_TX_MAX_HDR_SIZE_DQO 255 +#define GVE_TX_MIN_TSO_MSS_DQO 88 + +#ifndef __LITTLE_ENDIAN_BITFIELD +#error "Only little endian supported" +#endif + +/* Basic TX descriptor (DTYPE 0x0C) */ +struct gve_tx_pkt_desc_dqo { + __le64 buf_addr; + + /* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */ + u8 dtype: 5; + + /* Denotes the last descriptor of a packet. */ + u8 end_of_packet: 1; + u8 checksum_offload_enable: 1; + + /* If set, will generate a descriptor completion for this descriptor. */ + u8 report_event: 1; + u8 reserved0; + __le16 reserved1; + + /* The TX completion associated with this packet will contain this tag. + */ + __le16 compl_tag; + u16 buf_size: 14; + u16 reserved2: 2; +} __packed; +static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16); + +#define GVE_TX_PKT_DESC_DTYPE_DQO 0xc +#define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1) + +/* Maximum number of data descriptors allowed per packet, or per-TSO segment. */ +#define GVE_TX_MAX_DATA_DESCS 10 + +/* Min gap between tail and head to avoid cacheline overlap */ +#define GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP 4 + +/* "report_event" on TX packet descriptors may only be reported on the last + * descriptor of a TX packet, and they must be spaced apart with at least this + * value. + */ +#define GVE_TX_MIN_RE_INTERVAL 32 + +struct gve_tx_context_cmd_dtype { + u8 dtype: 5; + u8 tso: 1; + u8 reserved1: 2; + + u8 reserved2; +}; + +static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2); + +/* TX Native TSO Context DTYPE (0x05) + * + * "flex" fields allow the driver to send additional packet context to HW. + */ +struct gve_tx_tso_context_desc_dqo { + /* The L4 payload bytes that should be segmented. */ + u32 tso_total_len: 24; + u32 flex10: 8; + + /* Max segment size in TSO excluding headers. */ + u16 mss: 14; + u16 reserved: 2; + + u8 header_len; /* Header length to use for TSO offload */ + u8 flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + u8 flex0; + u8 flex5; + u8 flex6; + u8 flex7; + u8 flex8; + u8 flex9; +} __packed; +static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16); + +#define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5 + +/* General context descriptor for sending metadata. */ +struct gve_tx_general_context_desc_dqo { + u8 flex4; + u8 flex5; + u8 flex6; + u8 flex7; + u8 flex8; + u8 flex9; + u8 flex10; + u8 flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + u16 reserved; + u8 flex0; + u8 flex1; + u8 flex2; + u8 flex3; +} __packed; +static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16); + +#define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4 + +/* Logical structure of metadata which is packed into context descriptor flex + * fields. + */ +struct gve_tx_metadata_dqo { + union { + struct { + u8 version; + + /* If `skb->l4_hash` is set, this value should be + * derived from `skb->hash`. + * + * A zero value means no l4_hash was associated with the + * skb. + */ + u16 path_hash: 15; + + /* Should be set to 1 if the flow associated with the + * skb had a rehash from the TCP stack. + */ + u16 rehash_event: 1; + } __packed; + u8 bytes[12]; + }; +} __packed; +static_assert(sizeof(struct gve_tx_metadata_dqo) == 12); + +#define GVE_TX_METADATA_VERSION_DQO 0 + +/* TX completion descriptor */ +struct gve_tx_compl_desc { + /* For types 0-4 this is the TX queue ID associated with this + * completion. + */ + u16 id: 11; + + /* See: GVE_COMPL_TYPE_DQO* */ + u16 type: 3; + u16 reserved0: 1; + + /* Flipped by HW to notify the descriptor is populated. */ + u16 generation: 1; + union { + /* For descriptor completions, this is the last index fetched + * by HW + 1. + */ + __le16 tx_head; + + /* For packet completions, this is the completion tag set on the + * TX packet descriptors. + */ + __le16 completion_tag; + }; + __le32 reserved1; +} __packed; +static_assert(sizeof(struct gve_tx_compl_desc) == 8); + +#define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */ +#define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */ +#define GVE_COMPL_TYPE_DQO_MISS 0x1 /* Miss path completion */ +#define GVE_COMPL_TYPE_DQO_REINJECTION 0x3 /* Re-injection completion */ + +/* Descriptor to post buffers to HW on buffer queue. */ +struct gve_rx_desc_dqo { + __le16 buf_id; /* ID returned in Rx completion descriptor */ + __le16 reserved0; + __le32 reserved1; + __le64 buf_addr; /* DMA address of the buffer */ + __le64 header_buf_addr; + __le64 reserved2; +} __packed; +static_assert(sizeof(struct gve_rx_desc_dqo) == 32); + +/* Descriptor for HW to notify SW of new packets received on RX queue. */ +struct gve_rx_compl_desc_dqo { + /* Must be 1 */ + u8 rxdid: 4; + u8 reserved0: 4; + + /* Packet originated from this system rather than the network. */ + u8 loopback: 1; + /* Set when IPv6 packet contains a destination options header or routing + * header. + */ + u8 ipv6_ex_add: 1; + /* Invalid packet was received. */ + u8 rx_error: 1; + u8 reserved1: 5; + + u16 packet_type: 10; + u16 ip_hdr_err: 1; + u16 udp_len_err: 1; + u16 raw_cs_invalid: 1; + u16 reserved2: 3; + + u16 packet_len: 14; + /* Flipped by HW to notify the descriptor is populated. */ + u16 generation: 1; + /* Should be zero. */ + u16 buffer_queue_id: 1; + + u16 header_len: 10; + u16 rsc: 1; + u16 split_header: 1; + u16 reserved3: 4; + + u8 descriptor_done: 1; + u8 end_of_packet: 1; + u8 header_buffer_overflow: 1; + u8 l3_l4_processed: 1; + u8 csum_ip_err: 1; + u8 csum_l4_err: 1; + u8 csum_external_ip_err: 1; + u8 csum_external_udp_err: 1; + + u8 status_error1; + + __le16 reserved5; + __le16 buf_id; /* Buffer ID which was sent on the buffer queue. */ + + union { + /* Packet checksum. */ + __le16 raw_cs; + /* Segment length for RSC packets. */ + __le16 rsc_seg_len; + }; + __le32 hash; + __le32 reserved6; + __le64 reserved7; +} __packed; + +static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32); + +/* Ringing the doorbell too often can hurt performance. + * + * HW requires this value to be at least 8. + */ +#define GVE_RX_BUF_THRESH_DQO 32 + +#endif /* _GVE_DESC_DQO_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h new file mode 100644 index 000000000000..1eb4d5fd8561 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_dqo.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#ifndef _GVE_DQO_H_ +#define _GVE_DQO_H_ + +#include "gve_adminq.h" + +#define GVE_ITR_ENABLE_BIT_DQO BIT(0) +#define GVE_ITR_CLEAR_PBA_BIT_DQO BIT(1) +#define GVE_ITR_NO_UPDATE_DQO (3 << 3) + +#define GVE_ITR_INTERVAL_DQO_SHIFT 5 +#define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1) + +#define GVE_TX_IRQ_RATELIMIT_US_DQO 50 +#define GVE_RX_IRQ_RATELIMIT_US_DQO 20 +#define GVE_MAX_ITR_INTERVAL_DQO (GVE_ITR_INTERVAL_DQO_MASK * 2) + +/* Timeout in seconds to wait for a reinjection completion after receiving + * its corresponding miss completion. + */ +#define GVE_REINJECT_COMPL_TIMEOUT 1 + +/* Timeout in seconds to deallocate the completion tag for a packet that was + * prematurely freed for not receiving a valid completion. This should be large + * enough to rule out the possibility of receiving the corresponding valid + * completion after this interval. + */ +#define GVE_DEALLOCATE_COMPL_TIMEOUT 60 + +netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev); +bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean); +int gve_rx_poll_dqo(struct gve_notify_block *block, int budget); +int gve_tx_alloc_rings_dqo(struct gve_priv *priv); +void gve_tx_free_rings_dqo(struct gve_priv *priv); +int gve_rx_alloc_rings_dqo(struct gve_priv *priv); +void gve_rx_free_rings_dqo(struct gve_priv *priv); +int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, + struct napi_struct *napi); +void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx); +void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx); + +static inline void +gve_tx_put_doorbell_dqo(const struct gve_priv *priv, + const struct gve_queue_resources *q_resources, u32 val) +{ + u64 index; + + index = be32_to_cpu(q_resources->db_index); + iowrite32(val, &priv->db_bar2[index]); +} + +/* Builds register value to write to DQO IRQ doorbell to enable with specified + * ITR interval. + */ +static inline u32 gve_setup_itr_interval_dqo(u32 interval_us) +{ + u32 result = GVE_ITR_ENABLE_BIT_DQO; + + /* Interval has 2us granularity. */ + interval_us >>= 1; + + interval_us &= GVE_ITR_INTERVAL_DQO_MASK; + result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT); + + return result; +} + +static inline void +gve_write_irq_doorbell_dqo(const struct gve_priv *priv, + const struct gve_notify_block *block, u32 val) +{ + u32 index = be32_to_cpu(*block->irq_db_index); + + iowrite32(val, &priv->db_bar2[index]); +} + +/* Sets interrupt throttling interval and enables interrupt + * by writing to IRQ doorbell. + */ +static inline void +gve_set_itr_coalesce_usecs_dqo(struct gve_priv *priv, + struct gve_notify_block *block, + u32 usecs) +{ + gve_write_irq_doorbell_dqo(priv, block, + gve_setup_itr_interval_dqo(usecs)); +} +#endif /* _GVE_DQO_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index d8fa816f4473..7b9a2d9d9624 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -1,20 +1,23 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ +#include <linux/ethtool.h> #include <linux/rtnetlink.h> #include "gve.h" +#include "gve_adminq.h" +#include "gve_dqo.h" static void gve_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { struct gve_priv *priv = netdev_priv(netdev); - strlcpy(info->driver, "gve", sizeof(info->driver)); - strlcpy(info->version, gve_version_str, sizeof(info->version)); - strlcpy(info->bus_info, pci_name(priv->pdev), sizeof(info->bus_info)); + strscpy(info->driver, "gve", sizeof(info->driver)); + strscpy(info->version, gve_version_str, sizeof(info->version)); + strscpy(info->bus_info, pci_name(priv->pdev), sizeof(info->bus_info)); } static void gve_set_msglevel(struct net_device *netdev, u32 value) @@ -34,41 +37,86 @@ static u32 gve_get_msglevel(struct net_device *netdev) static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = { "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_dropped", "tx_dropped", "tx_timeouts", + "rx_skb_alloc_fail", "rx_buf_alloc_fail", "rx_desc_err_dropped_pkt", + "interface_up_cnt", "interface_down_cnt", "reset_cnt", + "page_alloc_fail", "dma_mapping_error", "stats_report_trigger_cnt", +}; + +static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = { + "rx_posted_desc[%u]", "rx_completed_desc[%u]", "rx_consumed_desc[%u]", "rx_bytes[%u]", + "rx_cont_packet_cnt[%u]", "rx_frag_flip_cnt[%u]", "rx_frag_copy_cnt[%u]", + "rx_dropped_pkt[%u]", "rx_copybreak_pkt[%u]", "rx_copied_pkt[%u]", + "rx_queue_drop_cnt[%u]", "rx_no_buffers_posted[%u]", + "rx_drops_packet_over_mru[%u]", "rx_drops_invalid_checksum[%u]", +}; + +static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { + "tx_posted_desc[%u]", "tx_completed_desc[%u]", "tx_consumed_desc[%u]", "tx_bytes[%u]", + "tx_wake[%u]", "tx_stop[%u]", "tx_event_counter[%u]", + "tx_dma_mapping_error[%u]", +}; + +static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { + "adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts", + "adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt", + "adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt", + "adminq_create_tx_queue_cnt", "adminq_create_rx_queue_cnt", + "adminq_destroy_tx_queue_cnt", "adminq_destroy_rx_queue_cnt", + "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", + "adminq_report_stats_cnt", "adminq_report_link_speed_cnt" +}; + +static const char gve_gstrings_priv_flags[][ETH_GSTRING_LEN] = { + "report-stats", }; #define GVE_MAIN_STATS_LEN ARRAY_SIZE(gve_gstrings_main_stats) -#define NUM_GVE_TX_CNTS 5 -#define NUM_GVE_RX_CNTS 2 +#define GVE_ADMINQ_STATS_LEN ARRAY_SIZE(gve_gstrings_adminq_stats) +#define NUM_GVE_TX_CNTS ARRAY_SIZE(gve_gstrings_tx_stats) +#define NUM_GVE_RX_CNTS ARRAY_SIZE(gve_gstrings_rx_stats) +#define GVE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(gve_gstrings_priv_flags) static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data) { struct gve_priv *priv = netdev_priv(netdev); char *s = (char *)data; - int i; + int i, j; - if (stringset != ETH_SS_STATS) - return; + switch (stringset) { + case ETH_SS_STATS: + memcpy(s, *gve_gstrings_main_stats, + sizeof(gve_gstrings_main_stats)); + s += sizeof(gve_gstrings_main_stats); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + for (j = 0; j < NUM_GVE_RX_CNTS; j++) { + snprintf(s, ETH_GSTRING_LEN, + gve_gstrings_rx_stats[j], i); + s += ETH_GSTRING_LEN; + } + } + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + for (j = 0; j < NUM_GVE_TX_CNTS; j++) { + snprintf(s, ETH_GSTRING_LEN, + gve_gstrings_tx_stats[j], i); + s += ETH_GSTRING_LEN; + } + } + + memcpy(s, *gve_gstrings_adminq_stats, + sizeof(gve_gstrings_adminq_stats)); + s += sizeof(gve_gstrings_adminq_stats); + break; + + case ETH_SS_PRIV_FLAGS: + memcpy(s, *gve_gstrings_priv_flags, + sizeof(gve_gstrings_priv_flags)); + s += sizeof(gve_gstrings_priv_flags); + break; - memcpy(s, *gve_gstrings_main_stats, - sizeof(gve_gstrings_main_stats)); - s += sizeof(gve_gstrings_main_stats); - for (i = 0; i < priv->rx_cfg.num_queues; i++) { - snprintf(s, ETH_GSTRING_LEN, "rx_desc_cnt[%u]", i); - s += ETH_GSTRING_LEN; - snprintf(s, ETH_GSTRING_LEN, "rx_desc_fill_cnt[%u]", i); - s += ETH_GSTRING_LEN; - } - for (i = 0; i < priv->tx_cfg.num_queues; i++) { - snprintf(s, ETH_GSTRING_LEN, "tx_req[%u]", i); - s += ETH_GSTRING_LEN; - snprintf(s, ETH_GSTRING_LEN, "tx_done[%u]", i); - s += ETH_GSTRING_LEN; - snprintf(s, ETH_GSTRING_LEN, "tx_wake[%u]", i); - s += ETH_GSTRING_LEN; - snprintf(s, ETH_GSTRING_LEN, "tx_stop[%u]", i); - s += ETH_GSTRING_LEN; - snprintf(s, ETH_GSTRING_LEN, "tx_event_counter[%u]", i); - s += ETH_GSTRING_LEN; + default: + break; } } @@ -78,9 +126,11 @@ static int gve_get_sset_count(struct net_device *netdev, int sset) switch (sset) { case ETH_SS_STATS: - return GVE_MAIN_STATS_LEN + + return GVE_MAIN_STATS_LEN + GVE_ADMINQ_STATS_LEN + (priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS) + (priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS); + case ETH_SS_PRIV_FLAGS: + return GVE_PRIV_FLAGS_STR_LEN; default: return -EOPNOTSUPP; } @@ -90,36 +140,72 @@ static void gve_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats *stats, u64 *data) { - struct gve_priv *priv = netdev_priv(netdev); - u64 rx_pkts, rx_bytes, tx_pkts, tx_bytes; + u64 tmp_rx_pkts, tmp_rx_bytes, tmp_rx_skb_alloc_fail, + tmp_rx_buf_alloc_fail, tmp_rx_desc_err_dropped_pkt, + tmp_tx_pkts, tmp_tx_bytes; + u64 rx_buf_alloc_fail, rx_desc_err_dropped_pkt, rx_pkts, + rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, tx_dropped; + int stats_idx, base_stats_idx, max_stats_idx; + struct stats *report_stats; + int *rx_qid_to_stats_idx; + int *tx_qid_to_stats_idx; + struct gve_priv *priv; + bool skip_nic_stats; unsigned int start; int ring; - int i; + int i, j; ASSERT_RTNL(); - for (rx_pkts = 0, rx_bytes = 0, ring = 0; + priv = netdev_priv(netdev); + report_stats = priv->stats_report->stats; + rx_qid_to_stats_idx = kmalloc_array(priv->rx_cfg.num_queues, + sizeof(int), GFP_KERNEL); + if (!rx_qid_to_stats_idx) + return; + tx_qid_to_stats_idx = kmalloc_array(priv->tx_cfg.num_queues, + sizeof(int), GFP_KERNEL); + if (!tx_qid_to_stats_idx) { + kfree(rx_qid_to_stats_idx); + return; + } + for (rx_pkts = 0, rx_bytes = 0, rx_skb_alloc_fail = 0, + rx_buf_alloc_fail = 0, rx_desc_err_dropped_pkt = 0, ring = 0; ring < priv->rx_cfg.num_queues; ring++) { if (priv->rx) { do { + struct gve_rx_ring *rx = &priv->rx[ring]; + start = - u64_stats_fetch_begin(&priv->rx[ring].statss); - rx_pkts += priv->rx[ring].rpackets; - rx_bytes += priv->rx[ring].rbytes; - } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); + tmp_rx_pkts = rx->rpackets; + tmp_rx_bytes = rx->rbytes; + tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; + tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; + tmp_rx_desc_err_dropped_pkt = + rx->rx_desc_err_dropped_pkt; + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, start)); + rx_pkts += tmp_rx_pkts; + rx_bytes += tmp_rx_bytes; + rx_skb_alloc_fail += tmp_rx_skb_alloc_fail; + rx_buf_alloc_fail += tmp_rx_buf_alloc_fail; + rx_desc_err_dropped_pkt += tmp_rx_desc_err_dropped_pkt; } } - for (tx_pkts = 0, tx_bytes = 0, ring = 0; + for (tx_pkts = 0, tx_bytes = 0, tx_dropped = 0, ring = 0; ring < priv->tx_cfg.num_queues; ring++) { if (priv->tx) { do { start = - u64_stats_fetch_begin(&priv->tx[ring].statss); - tx_pkts += priv->tx[ring].pkt_done; - tx_bytes += priv->tx[ring].bytes_done; - } while (u64_stats_fetch_retry(&priv->tx[ring].statss, + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); + tmp_tx_pkts = priv->tx[ring].pkt_done; + tmp_tx_bytes = priv->tx[ring].bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, start)); + tx_pkts += tmp_tx_pkts; + tx_bytes += tmp_tx_bytes; + tx_dropped += priv->tx[ring].dropped_pkt; } } @@ -128,37 +214,166 @@ gve_get_ethtool_stats(struct net_device *netdev, data[i++] = tx_pkts; data[i++] = rx_bytes; data[i++] = tx_bytes; - /* Skip rx_dropped and tx_dropped */ - i += 2; + /* total rx dropped packets */ + data[i++] = rx_skb_alloc_fail + rx_buf_alloc_fail + + rx_desc_err_dropped_pkt; + data[i++] = tx_dropped; data[i++] = priv->tx_timeo_cnt; + data[i++] = rx_skb_alloc_fail; + data[i++] = rx_buf_alloc_fail; + data[i++] = rx_desc_err_dropped_pkt; + data[i++] = priv->interface_up_cnt; + data[i++] = priv->interface_down_cnt; + data[i++] = priv->reset_cnt; + data[i++] = priv->page_alloc_fail; + data[i++] = priv->dma_mapping_error; + data[i++] = priv->stats_report_trigger_cnt; i = GVE_MAIN_STATS_LEN; + /* For rx cross-reporting stats, start from nic rx stats in report */ + base_stats_idx = GVE_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues + + GVE_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues; + max_stats_idx = NIC_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues + + base_stats_idx; + /* Preprocess the stats report for rx, map queue id to start index */ + skip_nic_stats = false; + for (stats_idx = base_stats_idx; stats_idx < max_stats_idx; + stats_idx += NIC_RX_STATS_REPORT_NUM) { + u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name); + u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id); + + if (stat_name == 0) { + /* no stats written by NIC yet */ + skip_nic_stats = true; + break; + } + rx_qid_to_stats_idx[queue_id] = stats_idx; + } /* walk RX rings */ if (priv->rx) { for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { struct gve_rx_ring *rx = &priv->rx[ring]; - data[i++] = rx->cnt; data[i++] = rx->fill_cnt; + data[i++] = rx->cnt; + data[i++] = rx->fill_cnt - rx->cnt; + do { + start = + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); + tmp_rx_bytes = rx->rbytes; + tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; + tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; + tmp_rx_desc_err_dropped_pkt = + rx->rx_desc_err_dropped_pkt; + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, + start)); + data[i++] = tmp_rx_bytes; + data[i++] = rx->rx_cont_packet_cnt; + data[i++] = rx->rx_frag_flip_cnt; + data[i++] = rx->rx_frag_copy_cnt; + /* rx dropped packets */ + data[i++] = tmp_rx_skb_alloc_fail + + tmp_rx_buf_alloc_fail + + tmp_rx_desc_err_dropped_pkt; + data[i++] = rx->rx_copybreak_pkt; + data[i++] = rx->rx_copied_pkt; + /* stats from NIC */ + if (skip_nic_stats) { + /* skip NIC rx stats */ + i += NIC_RX_STATS_REPORT_NUM; + continue; + } + for (j = 0; j < NIC_RX_STATS_REPORT_NUM; j++) { + u64 value = + be64_to_cpu(report_stats[rx_qid_to_stats_idx[ring] + j].value); + + data[i++] = value; + } } } else { i += priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS; } + + /* For tx cross-reporting stats, start from nic tx stats in report */ + base_stats_idx = max_stats_idx; + max_stats_idx = NIC_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues + + max_stats_idx; + /* Preprocess the stats report for tx, map queue id to start index */ + skip_nic_stats = false; + for (stats_idx = base_stats_idx; stats_idx < max_stats_idx; + stats_idx += NIC_TX_STATS_REPORT_NUM) { + u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name); + u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id); + + if (stat_name == 0) { + /* no stats written by NIC yet */ + skip_nic_stats = true; + break; + } + tx_qid_to_stats_idx[queue_id] = stats_idx; + } /* walk TX rings */ if (priv->tx) { for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { struct gve_tx_ring *tx = &priv->tx[ring]; - data[i++] = tx->req; - data[i++] = tx->done; + if (gve_is_gqi(priv)) { + data[i++] = tx->req; + data[i++] = tx->done; + data[i++] = tx->req - tx->done; + } else { + /* DQO doesn't currently support + * posted/completed descriptor counts; + */ + data[i++] = 0; + data[i++] = 0; + data[i++] = tx->dqo_tx.tail - tx->dqo_tx.head; + } + do { + start = + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); + tmp_tx_bytes = tx->bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, + start)); + data[i++] = tmp_tx_bytes; data[i++] = tx->wake_queue; data[i++] = tx->stop_queue; - data[i++] = be32_to_cpu(gve_tx_load_event_counter(priv, - tx)); + data[i++] = gve_tx_load_event_counter(priv, tx); + data[i++] = tx->dma_mapping_error; + /* stats from NIC */ + if (skip_nic_stats) { + /* skip NIC tx stats */ + i += NIC_TX_STATS_REPORT_NUM; + continue; + } + for (j = 0; j < NIC_TX_STATS_REPORT_NUM; j++) { + u64 value = + be64_to_cpu(report_stats[tx_qid_to_stats_idx[ring] + j].value); + data[i++] = value; + } } } else { i += priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS; } + + kfree(rx_qid_to_stats_idx); + kfree(tx_qid_to_stats_idx); + /* AQ Stats */ + data[i++] = priv->adminq_prod_cnt; + data[i++] = priv->adminq_cmd_fail; + data[i++] = priv->adminq_timeouts; + data[i++] = priv->adminq_describe_device_cnt; + data[i++] = priv->adminq_cfg_device_resources_cnt; + data[i++] = priv->adminq_register_page_list_cnt; + data[i++] = priv->adminq_unregister_page_list_cnt; + data[i++] = priv->adminq_create_tx_queue_cnt; + data[i++] = priv->adminq_create_rx_queue_cnt; + data[i++] = priv->adminq_destroy_tx_queue_cnt; + data[i++] = priv->adminq_destroy_rx_queue_cnt; + data[i++] = priv->adminq_dcfg_device_resources_cnt; + data[i++] = priv->adminq_set_driver_parameter_cnt; + data[i++] = priv->adminq_report_stats_cnt; + data[i++] = priv->adminq_report_link_speed_cnt; } static void gve_get_channels(struct net_device *netdev, @@ -188,7 +403,7 @@ static int gve_set_channels(struct net_device *netdev, gve_get_channels(netdev, &old_settings); - /* Changing combined is not allowed allowed */ + /* Changing combined is not allowed */ if (cmd->combined_count != old_settings.combined_count) return -EINVAL; @@ -208,7 +423,9 @@ static int gve_set_channels(struct net_device *netdev, } static void gve_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *cmd) + struct ethtool_ringparam *cmd, + struct kernel_ethtool_ringparam *kernel_cmd, + struct netlink_ext_ack *extack) { struct gve_priv *priv = netdev_priv(netdev); @@ -230,7 +447,159 @@ static int gve_user_reset(struct net_device *netdev, u32 *flags) return -EOPNOTSUPP; } +static int gve_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *etuna, void *value) +{ + struct gve_priv *priv = netdev_priv(netdev); + + switch (etuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)value = priv->rx_copybreak; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int gve_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *etuna, + const void *value) +{ + struct gve_priv *priv = netdev_priv(netdev); + u32 len; + + switch (etuna->id) { + case ETHTOOL_RX_COPYBREAK: + { + u32 max_copybreak = gve_is_gqi(priv) ? + (PAGE_SIZE / 2) : priv->data_buffer_size_dqo; + + len = *(u32 *)value; + if (len > max_copybreak) + return -EINVAL; + priv->rx_copybreak = len; + return 0; + } + default: + return -EOPNOTSUPP; + } +} + +static u32 gve_get_priv_flags(struct net_device *netdev) +{ + struct gve_priv *priv = netdev_priv(netdev); + u32 ret_flags = 0; + + /* Only 1 flag exists currently: report-stats (BIT(O)), so set that flag. */ + if (priv->ethtool_flags & BIT(0)) + ret_flags |= BIT(0); + return ret_flags; +} + +static int gve_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct gve_priv *priv = netdev_priv(netdev); + u64 ori_flags, new_flags; + + ori_flags = READ_ONCE(priv->ethtool_flags); + new_flags = ori_flags; + + /* Only one priv flag exists: report-stats (BIT(0))*/ + if (flags & BIT(0)) + new_flags |= BIT(0); + else + new_flags &= ~(BIT(0)); + priv->ethtool_flags = new_flags; + /* start report-stats timer when user turns report stats on. */ + if (flags & BIT(0)) { + mod_timer(&priv->stats_report_timer, + round_jiffies(jiffies + + msecs_to_jiffies(priv->stats_report_timer_period))); + } + /* Zero off gve stats when report-stats turned off and */ + /* delete report stats timer. */ + if (!(flags & BIT(0)) && (ori_flags & BIT(0))) { + int tx_stats_num = GVE_TX_STATS_REPORT_NUM * + priv->tx_cfg.num_queues; + int rx_stats_num = GVE_RX_STATS_REPORT_NUM * + priv->rx_cfg.num_queues; + + memset(priv->stats_report->stats, 0, (tx_stats_num + rx_stats_num) * + sizeof(struct stats)); + del_timer_sync(&priv->stats_report_timer); + } + return 0; +} + +static int gve_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct gve_priv *priv = netdev_priv(netdev); + int err = gve_adminq_report_link_speed(priv); + + cmd->base.speed = priv->link_speed; + return err; +} + +static int gve_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_ec, + struct netlink_ext_ack *extack) +{ + struct gve_priv *priv = netdev_priv(netdev); + + if (gve_is_gqi(priv)) + return -EOPNOTSUPP; + ec->tx_coalesce_usecs = priv->tx_coalesce_usecs; + ec->rx_coalesce_usecs = priv->rx_coalesce_usecs; + + return 0; +} + +static int gve_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_ec, + struct netlink_ext_ack *extack) +{ + struct gve_priv *priv = netdev_priv(netdev); + u32 tx_usecs_orig = priv->tx_coalesce_usecs; + u32 rx_usecs_orig = priv->rx_coalesce_usecs; + int idx; + + if (gve_is_gqi(priv)) + return -EOPNOTSUPP; + + if (ec->tx_coalesce_usecs > GVE_MAX_ITR_INTERVAL_DQO || + ec->rx_coalesce_usecs > GVE_MAX_ITR_INTERVAL_DQO) + return -EINVAL; + priv->tx_coalesce_usecs = ec->tx_coalesce_usecs; + priv->rx_coalesce_usecs = ec->rx_coalesce_usecs; + + if (tx_usecs_orig != priv->tx_coalesce_usecs) { + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + gve_set_itr_coalesce_usecs_dqo(priv, block, + priv->tx_coalesce_usecs); + } + } + + if (rx_usecs_orig != priv->rx_coalesce_usecs) { + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + gve_set_itr_coalesce_usecs_dqo(priv, block, + priv->rx_coalesce_usecs); + } + } + + return 0; +} + const struct ethtool_ops gve_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS, .get_drvinfo = gve_get_drvinfo, .get_strings = gve_get_strings, .get_sset_count = gve_get_sset_count, @@ -240,6 +609,13 @@ const struct ethtool_ops gve_ethtool_ops = { .set_channels = gve_set_channels, .get_channels = gve_get_channels, .get_link = ethtool_op_get_link, + .get_coalesce = gve_get_coalesce, + .set_coalesce = gve_set_coalesce, .get_ringparam = gve_get_ringparam, .reset = gve_user_reset, + .get_tunable = gve_get_tunable, + .set_tunable = gve_set_tunable, + .get_priv_flags = gve_get_priv_flags, + .set_priv_flags = gve_set_priv_flags, + .get_link_ksettings = gve_get_link_ksettings }; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e032563ceefd..d3e3ac242bfc 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include <linux/cpumask.h> @@ -14,6 +14,7 @@ #include <linux/workqueue.h> #include <net/sch_generic.h> #include "gve.h" +#include "gve_dqo.h" #include "gve_adminq.h" #include "gve_register.h" @@ -23,35 +24,53 @@ #define GVE_VERSION "1.0.0" #define GVE_VERSION_PREFIX "GVE-" +// Minimum amount of time between queue kicks in msec (10 seconds) +#define MIN_TX_TIMEOUT_GAP (1000 * 10) + const char gve_version_str[] = GVE_VERSION; static const char gve_version_prefix[] = GVE_VERSION_PREFIX; +static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + + if (gve_is_gqi(priv)) + return gve_tx(skb, dev); + else + return gve_tx_dqo(skb, dev); +} + static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) { struct gve_priv *priv = netdev_priv(dev); unsigned int start; + u64 packets, bytes; int ring; if (priv->rx) { for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { do { start = - u64_stats_fetch_begin(&priv->rx[ring].statss); - s->rx_packets += priv->rx[ring].rpackets; - s->rx_bytes += priv->rx[ring].rbytes; - } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); + packets = priv->rx[ring].rpackets; + bytes = priv->rx[ring].rbytes; + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, start)); + s->rx_packets += packets; + s->rx_bytes += bytes; } } if (priv->tx) { for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { do { start = - u64_stats_fetch_begin(&priv->tx[ring].statss); - s->tx_packets += priv->tx[ring].pkt_done; - s->tx_bytes += priv->tx[ring].bytes_done; - } while (u64_stats_fetch_retry(&priv->tx[ring].statss, + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); + packets = priv->tx[ring].pkt_done; + bytes = priv->tx[ring].bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, start)); + s->tx_packets += packets; + s->tx_bytes += bytes; } } } @@ -71,6 +90,9 @@ static int gve_alloc_counter_array(struct gve_priv *priv) static void gve_free_counter_array(struct gve_priv *priv) { + if (!priv->counter_array) + return; + dma_free_coherent(&priv->pdev->dev, priv->num_event_counters * sizeof(*priv->counter_array), @@ -78,6 +100,68 @@ static void gve_free_counter_array(struct gve_priv *priv) priv->counter_array = NULL; } +/* NIC requests to report stats */ +static void gve_stats_report_task(struct work_struct *work) +{ + struct gve_priv *priv = container_of(work, struct gve_priv, + stats_report_task); + if (gve_get_do_report_stats(priv)) { + gve_handle_report_stats(priv); + gve_clear_do_report_stats(priv); + } +} + +static void gve_stats_report_schedule(struct gve_priv *priv) +{ + if (!gve_get_probe_in_progress(priv) && + !gve_get_reset_in_progress(priv)) { + gve_set_do_report_stats(priv); + queue_work(priv->gve_wq, &priv->stats_report_task); + } +} + +static void gve_stats_report_timer(struct timer_list *t) +{ + struct gve_priv *priv = from_timer(priv, t, stats_report_timer); + + mod_timer(&priv->stats_report_timer, + round_jiffies(jiffies + + msecs_to_jiffies(priv->stats_report_timer_period))); + gve_stats_report_schedule(priv); +} + +static int gve_alloc_stats_report(struct gve_priv *priv) +{ + int tx_stats_num, rx_stats_num; + + tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) * + priv->tx_cfg.num_queues; + rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) * + priv->rx_cfg.num_queues; + priv->stats_report_len = struct_size(priv->stats_report, stats, + tx_stats_num + rx_stats_num); + priv->stats_report = + dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len, + &priv->stats_report_bus, GFP_KERNEL); + if (!priv->stats_report) + return -ENOMEM; + /* Set up timer for the report-stats task */ + timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0); + priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD; + return 0; +} + +static void gve_free_stats_report(struct gve_priv *priv) +{ + if (!priv->stats_report) + return; + + del_timer_sync(&priv->stats_report_timer); + dma_free_coherent(&priv->pdev->dev, priv->stats_report_len, + priv->stats_report, priv->stats_report_bus); + priv->stats_report = NULL; +} + static irqreturn_t gve_mgmnt_intr(int irq, void *arg) { struct gve_priv *priv = arg; @@ -96,40 +180,103 @@ static irqreturn_t gve_intr(int irq, void *arg) return IRQ_HANDLED; } +static irqreturn_t gve_intr_dqo(int irq, void *arg) +{ + struct gve_notify_block *block = arg; + + /* Interrupts are automatically masked */ + napi_schedule_irqoff(&block->napi); + return IRQ_HANDLED; +} + static int gve_napi_poll(struct napi_struct *napi, int budget) { struct gve_notify_block *block; __be32 __iomem *irq_doorbell; bool reschedule = false; struct gve_priv *priv; + int work_done = 0; block = container_of(napi, struct gve_notify_block, napi); priv = block->priv; if (block->tx) reschedule |= gve_tx_poll(block, budget); - if (block->rx) - reschedule |= gve_rx_poll(block, budget); + if (block->rx) { + work_done = gve_rx_poll(block, budget); + reschedule |= work_done == budget; + } if (reschedule) return budget; - napi_complete(napi); - irq_doorbell = gve_irq_doorbell(priv, block); - iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell); + /* Complete processing - don't unmask irq if busy polling is enabled */ + if (likely(napi_complete_done(napi, work_done))) { + irq_doorbell = gve_irq_doorbell(priv, block); + iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell); + + /* Ensure IRQ ACK is visible before we check pending work. + * If queue had issued updates, it would be truly visible. + */ + mb(); + + if (block->tx) + reschedule |= gve_tx_clean_pending(priv, block->tx); + if (block->rx) + reschedule |= gve_rx_work_pending(block->rx); + + if (reschedule && napi_reschedule(napi)) + iowrite32be(GVE_IRQ_MASK, irq_doorbell); + } + return work_done; +} - /* Double check we have no extra work. - * Ensure unmask synchronizes with checking for work. +static int gve_napi_poll_dqo(struct napi_struct *napi, int budget) +{ + struct gve_notify_block *block = + container_of(napi, struct gve_notify_block, napi); + struct gve_priv *priv = block->priv; + bool reschedule = false; + int work_done = 0; + + /* Clear PCI MSI-X Pending Bit Array (PBA) + * + * This bit is set if an interrupt event occurs while the vector is + * masked. If this bit is set and we reenable the interrupt, it will + * fire again. Since we're just about to poll the queue state, we don't + * need it to fire again. + * + * Under high softirq load, it's possible that the interrupt condition + * is triggered twice before we got the chance to process it. */ - dma_rmb(); + gve_write_irq_doorbell_dqo(priv, block, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_CLEAR_PBA_BIT_DQO); + if (block->tx) - reschedule |= gve_tx_poll(block, -1); - if (block->rx) - reschedule |= gve_rx_poll(block, -1); - if (reschedule && napi_reschedule(napi)) - iowrite32be(GVE_IRQ_MASK, irq_doorbell); + reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true); - return 0; + if (block->rx) { + work_done = gve_rx_poll_dqo(block, budget); + reschedule |= work_done == budget; + } + + if (reschedule) + return budget; + + if (likely(napi_complete_done(napi, work_done))) { + /* Enable interrupts again. + * + * We don't need to repoll afterwards because HW supports the + * PCI MSI-X PBA feature. + * + * Another interrupt would be triggered if a new event came in + * since the last one. + */ + gve_write_irq_doorbell_dqo(priv, block, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); + } + + return work_done; } static int gve_alloc_notify_blocks(struct gve_priv *priv) @@ -141,7 +288,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) int i, j; int err; - priv->msix_vectors = kvzalloc(num_vecs_requested * + priv->msix_vectors = kvcalloc(num_vecs_requested, sizeof(*priv->msix_vectors), GFP_KERNEL); if (!priv->msix_vectors) return -ENOMEM; @@ -161,6 +308,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) int vecs_left = new_num_ntfy_blks % 2; priv->num_ntfy_blks = new_num_ntfy_blks; + priv->mgmt_msix_idx = priv->num_ntfy_blks; priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues, vecs_per_type); priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues, @@ -186,15 +334,23 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) dev_err(&priv->pdev->dev, "Did not receive management vector.\n"); goto abort_with_msix_enabled; } - priv->ntfy_blocks = + priv->irq_db_indices = dma_alloc_coherent(&priv->pdev->dev, priv->num_ntfy_blks * - sizeof(*priv->ntfy_blocks), - &priv->ntfy_block_bus, GFP_KERNEL); - if (!priv->ntfy_blocks) { + sizeof(*priv->irq_db_indices), + &priv->irq_db_indices_bus, GFP_KERNEL); + if (!priv->irq_db_indices) { err = -ENOMEM; goto abort_with_mgmt_vector; } + + priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks * + sizeof(*priv->ntfy_blocks), GFP_KERNEL); + if (!priv->ntfy_blocks) { + err = -ENOMEM; + goto abort_with_irq_db_indices; + } + /* Setup the other blocks - the first n-1 vectors */ for (i = 0; i < priv->num_ntfy_blks; i++) { struct gve_notify_block *block = &priv->ntfy_blocks[i]; @@ -204,7 +360,8 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) name, i); block->priv = priv; err = request_irq(priv->msix_vectors[msix_idx].vector, - gve_intr, 0, block->name, block); + gve_is_gqi(priv) ? gve_intr : gve_intr_dqo, + 0, block->name, block); if (err) { dev_err(&priv->pdev->dev, "Failed to receive msix vector %d\n", i); @@ -212,6 +369,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) } irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, get_cpu_mask(i % active_cpus)); + block->irq_db_index = &priv->irq_db_indices[i].index; } return 0; abort_with_some_ntfy_blocks: @@ -223,10 +381,13 @@ abort_with_some_ntfy_blocks: NULL); free_irq(priv->msix_vectors[msix_idx].vector, block); } - dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * - sizeof(*priv->ntfy_blocks), - priv->ntfy_blocks, priv->ntfy_block_bus); + kvfree(priv->ntfy_blocks); priv->ntfy_blocks = NULL; +abort_with_irq_db_indices: + dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * + sizeof(*priv->irq_db_indices), + priv->irq_db_indices, priv->irq_db_indices_bus); + priv->irq_db_indices = NULL; abort_with_mgmt_vector: free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); abort_with_msix_enabled: @@ -241,6 +402,9 @@ static void gve_free_notify_blocks(struct gve_priv *priv) { int i; + if (!priv->msix_vectors) + return; + /* Free the irqs */ for (i = 0; i < priv->num_ntfy_blks; i++) { struct gve_notify_block *block = &priv->ntfy_blocks[i]; @@ -250,11 +414,13 @@ static void gve_free_notify_blocks(struct gve_priv *priv) NULL); free_irq(priv->msix_vectors[msix_idx].vector, block); } - dma_free_coherent(&priv->pdev->dev, - priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks), - priv->ntfy_blocks, priv->ntfy_block_bus); - priv->ntfy_blocks = NULL; free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); + kvfree(priv->ntfy_blocks); + priv->ntfy_blocks = NULL; + dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * + sizeof(*priv->irq_db_indices), + priv->irq_db_indices, priv->irq_db_indices_bus); + priv->irq_db_indices = NULL; pci_disable_msix(priv->pdev); kvfree(priv->msix_vectors); priv->msix_vectors = NULL; @@ -270,23 +436,55 @@ static int gve_setup_device_resources(struct gve_priv *priv) err = gve_alloc_notify_blocks(priv); if (err) goto abort_with_counter; + err = gve_alloc_stats_report(priv); + if (err) + goto abort_with_ntfy_blocks; err = gve_adminq_configure_device_resources(priv, priv->counter_array_bus, priv->num_event_counters, - priv->ntfy_block_bus, + priv->irq_db_indices_bus, priv->num_ntfy_blks); if (unlikely(err)) { dev_err(&priv->pdev->dev, "could not setup device_resources: err=%d\n", err); err = -ENXIO; - goto abort_with_ntfy_blocks; + goto abort_with_stats_report; } + + if (priv->queue_format == GVE_DQO_RDA_FORMAT) { + priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo), + GFP_KERNEL); + if (!priv->ptype_lut_dqo) { + err = -ENOMEM; + goto abort_with_stats_report; + } + err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); + if (err) { + dev_err(&priv->pdev->dev, + "Failed to get ptype map: err=%d\n", err); + goto abort_with_ptype_lut; + } + } + + err = gve_adminq_report_stats(priv, priv->stats_report_len, + priv->stats_report_bus, + GVE_STATS_REPORT_TIMER_PERIOD); + if (err) + dev_err(&priv->pdev->dev, + "Failed to report stats: err=%d\n", err); gve_set_device_resources_ok(priv); return 0; + +abort_with_ptype_lut: + kvfree(priv->ptype_lut_dqo); + priv->ptype_lut_dqo = NULL; +abort_with_stats_report: + gve_free_stats_report(priv); abort_with_ntfy_blocks: gve_free_notify_blocks(priv); abort_with_counter: gve_free_counter_array(priv); + return err; } @@ -298,6 +496,13 @@ static void gve_teardown_device_resources(struct gve_priv *priv) /* Tell device its resources are being freed */ if (gve_get_device_resources_ok(priv)) { + /* detach the stats report */ + err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD); + if (err) { + dev_err(&priv->pdev->dev, + "Failed to detach stats report: err=%d\n", err); + gve_trigger_reset(priv); + } err = gve_adminq_deconfigure_device_resources(priv); if (err) { dev_err(&priv->pdev->dev, @@ -306,17 +511,22 @@ static void gve_teardown_device_resources(struct gve_priv *priv) gve_trigger_reset(priv); } } + + kvfree(priv->ptype_lut_dqo); + priv->ptype_lut_dqo = NULL; + gve_free_counter_array(priv); gve_free_notify_blocks(priv); + gve_free_stats_report(priv); gve_clear_device_resources_ok(priv); } -static void gve_add_napi(struct gve_priv *priv, int ntfy_idx) +static void gve_add_napi(struct gve_priv *priv, int ntfy_idx, + int (*gve_poll)(struct napi_struct *, int)) { struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - netif_napi_add(priv->dev, &block->napi, gve_napi_poll, - NAPI_POLL_WEIGHT); + netif_napi_add(priv->dev, &block->napi, gve_poll); } static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx) @@ -371,76 +581,118 @@ static int gve_create_rings(struct gve_priv *priv) int err; int i; - for (i = 0; i < priv->tx_cfg.num_queues; i++) { - err = gve_adminq_create_tx_queue(priv, i); - if (err) { - netif_err(priv, drv, priv->dev, "failed to create tx queue %d\n", - i); - /* This failure will trigger a reset - no need to clean - * up - */ - return err; - } - netif_dbg(priv, drv, priv->dev, "created tx queue %d\n", i); + err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n", + priv->tx_cfg.num_queues); + /* This failure will trigger a reset - no need to clean + * up + */ + return err; } - for (i = 0; i < priv->rx_cfg.num_queues; i++) { - err = gve_adminq_create_rx_queue(priv, i); - if (err) { - netif_err(priv, drv, priv->dev, "failed to create rx queue %d\n", - i); - /* This failure will trigger a reset - no need to clean - * up - */ - return err; - } - /* Rx data ring has been prefilled with packet buffers at - * queue allocation time. + netif_dbg(priv, drv, priv->dev, "created %d tx queues\n", + priv->tx_cfg.num_queues); + + err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n", + priv->rx_cfg.num_queues); + /* This failure will trigger a reset - no need to clean + * up + */ + return err; + } + netif_dbg(priv, drv, priv->dev, "created %d rx queues\n", + priv->rx_cfg.num_queues); + + if (gve_is_gqi(priv)) { + /* Rx data ring has been prefilled with packet buffers at queue + * allocation time. + * * Write the doorbell to provide descriptor slots and packet * buffers to the NIC. */ - gve_rx_write_doorbell(priv, &priv->rx[i]); - netif_dbg(priv, drv, priv->dev, "created rx queue %d\n", i); + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_write_doorbell(priv, &priv->rx[i]); + } else { + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + /* Post buffers and ring doorbell. */ + gve_rx_post_buffers_dqo(&priv->rx[i]); + } } return 0; } +static void add_napi_init_sync_stats(struct gve_priv *priv, + int (*napi_poll)(struct napi_struct *napi, + int budget)) +{ + int i; + + /* Add tx napi & init sync stats*/ + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + int ntfy_idx = gve_tx_idx_to_ntfy(priv, i); + + u64_stats_init(&priv->tx[i].statss); + priv->tx[i].ntfy_id = ntfy_idx; + gve_add_napi(priv, ntfy_idx, napi_poll); + } + /* Add rx napi & init sync stats*/ + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + int ntfy_idx = gve_rx_idx_to_ntfy(priv, i); + + u64_stats_init(&priv->rx[i].statss); + priv->rx[i].ntfy_id = ntfy_idx; + gve_add_napi(priv, ntfy_idx, napi_poll); + } +} + +static void gve_tx_free_rings(struct gve_priv *priv) +{ + if (gve_is_gqi(priv)) { + gve_tx_free_rings_gqi(priv); + } else { + gve_tx_free_rings_dqo(priv); + } +} + static int gve_alloc_rings(struct gve_priv *priv) { - int ntfy_idx; int err; - int i; /* Setup tx rings */ - priv->tx = kvzalloc(priv->tx_cfg.num_queues * sizeof(*priv->tx), + priv->tx = kvcalloc(priv->tx_cfg.num_queues, sizeof(*priv->tx), GFP_KERNEL); if (!priv->tx) return -ENOMEM; - err = gve_tx_alloc_rings(priv); + + if (gve_is_gqi(priv)) + err = gve_tx_alloc_rings(priv); + else + err = gve_tx_alloc_rings_dqo(priv); if (err) goto free_tx; + /* Setup rx rings */ - priv->rx = kvzalloc(priv->rx_cfg.num_queues * sizeof(*priv->rx), + priv->rx = kvcalloc(priv->rx_cfg.num_queues, sizeof(*priv->rx), GFP_KERNEL); if (!priv->rx) { err = -ENOMEM; goto free_tx_queue; } - err = gve_rx_alloc_rings(priv); + + if (gve_is_gqi(priv)) + err = gve_rx_alloc_rings(priv); + else + err = gve_rx_alloc_rings_dqo(priv); if (err) goto free_rx; - /* Add tx napi & init sync stats*/ - for (i = 0; i < priv->tx_cfg.num_queues; i++) { - u64_stats_init(&priv->tx[i].statss); - ntfy_idx = gve_tx_idx_to_ntfy(priv, i); - gve_add_napi(priv, ntfy_idx); - } - /* Add rx napi & init sync stats*/ - for (i = 0; i < priv->rx_cfg.num_queues; i++) { - u64_stats_init(&priv->rx[i].statss); - ntfy_idx = gve_rx_idx_to_ntfy(priv, i); - gve_add_napi(priv, ntfy_idx); - } + + if (gve_is_gqi(priv)) + add_napi_init_sync_stats(priv, gve_napi_poll); + else + add_napi_init_sync_stats(priv, gve_napi_poll_dqo); return 0; @@ -458,37 +710,34 @@ free_tx: static int gve_destroy_rings(struct gve_priv *priv) { int err; - int i; - for (i = 0; i < priv->tx_cfg.num_queues; i++) { - err = gve_adminq_destroy_tx_queue(priv, i); - if (err) { - netif_err(priv, drv, priv->dev, - "failed to destroy tx queue %d\n", - i); - /* This failure will trigger a reset - no need to clean - * up - */ - return err; - } - netif_dbg(priv, drv, priv->dev, "destroyed tx queue %d\n", i); + err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, + "failed to destroy tx queues\n"); + /* This failure will trigger a reset - no need to clean up */ + return err; } - for (i = 0; i < priv->rx_cfg.num_queues; i++) { - err = gve_adminq_destroy_rx_queue(priv, i); - if (err) { - netif_err(priv, drv, priv->dev, - "failed to destroy rx queue %d\n", - i); - /* This failure will trigger a reset - no need to clean - * up - */ - return err; - } - netif_dbg(priv, drv, priv->dev, "destroyed rx queue %d\n", i); + netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n"); + err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, + "failed to destroy rx queues\n"); + /* This failure will trigger a reset - no need to clean up */ + return err; } + netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n"); return 0; } +static void gve_rx_free_rings(struct gve_priv *priv) +{ + if (gve_is_gqi(priv)) + gve_rx_free_rings_gqi(priv); + else + gve_rx_free_rings_dqo(priv); +} + static void gve_free_rings(struct gve_priv *priv) { int ntfy_idx; @@ -514,14 +763,18 @@ static void gve_free_rings(struct gve_priv *priv) } } -int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma, - enum dma_data_direction dir) +int gve_alloc_page(struct gve_priv *priv, struct device *dev, + struct page **page, dma_addr_t *dma, + enum dma_data_direction dir, gfp_t gfp_flags) { - *page = alloc_page(GFP_KERNEL); - if (!*page) + *page = alloc_page(gfp_flags); + if (!*page) { + priv->page_alloc_fail++; return -ENOMEM; + } *dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir); if (dma_mapping_error(dev, *dma)) { + priv->dma_mapping_error++; put_page(*page); return -ENOMEM; } @@ -545,20 +798,19 @@ static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id, qpl->id = id; qpl->num_entries = 0; - qpl->pages = kvzalloc(pages * sizeof(*qpl->pages), GFP_KERNEL); + qpl->pages = kvcalloc(pages, sizeof(*qpl->pages), GFP_KERNEL); /* caller handles clean up */ if (!qpl->pages) return -ENOMEM; - qpl->page_buses = kvzalloc(pages * sizeof(*qpl->page_buses), - GFP_KERNEL); + qpl->page_buses = kvcalloc(pages, sizeof(*qpl->page_buses), GFP_KERNEL); /* caller handles clean up */ if (!qpl->page_buses) return -ENOMEM; for (i = 0; i < pages; i++) { - err = gve_alloc_page(&priv->pdev->dev, &qpl->pages[i], + err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i], &qpl->page_buses[i], - gve_qpl_dma_dir(priv, id)); + gve_qpl_dma_dir(priv, id), GFP_KERNEL); /* caller handles clean up */ if (err) return -ENOMEM; @@ -578,8 +830,7 @@ void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, put_page(page); } -static void gve_free_queue_page_list(struct gve_priv *priv, - int id) +static void gve_free_queue_page_list(struct gve_priv *priv, u32 id) { struct gve_queue_page_list *qpl = &priv->qpls[id]; int i; @@ -605,7 +856,10 @@ static int gve_alloc_qpls(struct gve_priv *priv) int i, j; int err; - priv->qpls = kvzalloc(num_qpls * sizeof(*priv->qpls), GFP_KERNEL); + if (num_qpls == 0) + return 0; + + priv->qpls = kvcalloc(num_qpls, sizeof(*priv->qpls), GFP_KERNEL); if (!priv->qpls) return -ENOMEM; @@ -617,14 +871,14 @@ static int gve_alloc_qpls(struct gve_priv *priv) } for (; i < num_qpls; i++) { err = gve_alloc_queue_page_list(priv, i, - priv->rx_pages_per_qpl); + priv->rx_data_slot_cnt); if (err) goto free_qpls; } priv->qpl_cfg.qpl_map_size = BITS_TO_LONGS(num_qpls) * sizeof(unsigned long) * BITS_PER_BYTE; - priv->qpl_cfg.qpl_id_map = kvzalloc(BITS_TO_LONGS(num_qpls) * + priv->qpl_cfg.qpl_id_map = kvcalloc(BITS_TO_LONGS(num_qpls), sizeof(unsigned long), GFP_KERNEL); if (!priv->qpl_cfg.qpl_id_map) { err = -ENOMEM; @@ -645,6 +899,9 @@ static void gve_free_qpls(struct gve_priv *priv) int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); int i; + if (num_qpls == 0) + return; + kvfree(priv->qpl_cfg.qpl_id_map); for (i = 0; i < num_qpls; i++) @@ -676,6 +933,7 @@ static int gve_open(struct net_device *dev) err = gve_alloc_qpls(priv); if (err) return err; + err = gve_alloc_rings(priv); if (err) goto free_qpls; @@ -690,13 +948,27 @@ static int gve_open(struct net_device *dev) err = gve_register_qpls(priv); if (err) goto reset; + + if (!gve_is_gqi(priv)) { + /* Hard code this for now. This may be tuned in the future for + * performance. + */ + priv->data_buffer_size_dqo = GVE_RX_BUFFER_SIZE_DQO; + } err = gve_create_rings(priv); if (err) goto reset; + gve_set_device_rings_ok(priv); + if (gve_get_report_stats(priv)) + mod_timer(&priv->stats_report_timer, + round_jiffies(jiffies + + msecs_to_jiffies(priv->stats_report_timer_period))); + gve_turnup(priv); - netif_carrier_on(dev); + queue_work(priv->gve_wq, &priv->service_task); + priv->interface_up_cnt++; return 0; free_rings: @@ -735,9 +1007,11 @@ static int gve_close(struct net_device *dev) goto err; gve_clear_device_rings_ok(priv); } + del_timer_sync(&priv->stats_report_timer); gve_free_rings(priv); gve_free_qpls(priv); + priv->interface_down_cnt++; return 0; err: @@ -817,6 +1091,7 @@ static void gve_turndown(struct gve_priv *priv) netif_tx_disable(priv->dev); gve_clear_napi_enabled(priv); + gve_clear_report_stats(priv); } static void gve_turnup(struct gve_priv *priv) @@ -832,14 +1107,24 @@ static void gve_turnup(struct gve_priv *priv) struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; napi_enable(&block->napi); - iowrite32be(0, gve_irq_doorbell(priv, block)); + if (gve_is_gqi(priv)) { + iowrite32be(0, gve_irq_doorbell(priv, block)); + } else { + gve_set_itr_coalesce_usecs_dqo(priv, block, + priv->tx_coalesce_usecs); + } } for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; napi_enable(&block->napi); - iowrite32be(0, gve_irq_doorbell(priv, block)); + if (gve_is_gqi(priv)) { + iowrite32be(0, gve_irq_doorbell(priv, block)); + } else { + gve_set_itr_coalesce_usecs_dqo(priv, block, + priv->rx_coalesce_usecs); + } } gve_set_napi_enabled(priv); @@ -847,18 +1132,93 @@ static void gve_turnup(struct gve_priv *priv) static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) { - struct gve_priv *priv = netdev_priv(dev); + struct gve_notify_block *block; + struct gve_tx_ring *tx = NULL; + struct gve_priv *priv; + u32 last_nic_done; + u32 current_time; + u32 ntfy_idx; + + netdev_info(dev, "Timeout on tx queue, %d", txqueue); + priv = netdev_priv(dev); + if (txqueue > priv->tx_cfg.num_queues) + goto reset; + + ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue); + if (ntfy_idx >= priv->num_ntfy_blks) + goto reset; + + block = &priv->ntfy_blocks[ntfy_idx]; + tx = block->tx; + + current_time = jiffies_to_msecs(jiffies); + if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time) + goto reset; + + /* Check to see if there are missed completions, which will allow us to + * kick the queue. + */ + last_nic_done = gve_tx_load_event_counter(priv, tx); + if (last_nic_done - tx->done) { + netdev_info(dev, "Kicking queue %d", txqueue); + iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block)); + napi_schedule(&block->napi); + tx->last_kick_msec = current_time; + goto out; + } // Else reset. +reset: gve_schedule_reset(priv); + +out: + if (tx) + tx->queue_timeout++; priv->tx_timeo_cnt++; } +static int gve_set_features(struct net_device *netdev, + netdev_features_t features) +{ + const netdev_features_t orig_features = netdev->features; + struct gve_priv *priv = netdev_priv(netdev); + int err; + + if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) { + netdev->features ^= NETIF_F_LRO; + if (netif_carrier_ok(netdev)) { + /* To make this process as simple as possible we + * teardown the device, set the new configuration, + * and then bring the device up again. + */ + err = gve_close(netdev); + /* We have already tried to reset in close, just fail + * at this point. + */ + if (err) + goto err; + + err = gve_open(netdev); + if (err) + goto err; + } + } + + return 0; +err: + /* Reverts the change on error. */ + netdev->features = orig_features; + netif_err(priv, drv, netdev, + "Set features failed! !!! DISABLING ALL QUEUES !!!\n"); + return err; +} + static const struct net_device_ops gve_netdev_ops = { - .ndo_start_xmit = gve_tx, + .ndo_start_xmit = gve_start_xmit, .ndo_open = gve_open, .ndo_stop = gve_close, .ndo_get_stats64 = gve_get_stats, .ndo_tx_timeout = gve_tx_timeout, + .ndo_set_features = gve_set_features, }; static void gve_handle_status(struct gve_priv *priv, u32 status) @@ -867,6 +1227,10 @@ static void gve_handle_status(struct gve_priv *priv, u32 status) dev_info(&priv->pdev->dev, "Device requested reset.\n"); gve_set_do_reset(priv); } + if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) { + priv->stats_report_trigger_cnt++; + gve_set_do_report_stats(priv); + } } static void gve_handle_reset(struct gve_priv *priv) @@ -885,16 +1249,110 @@ static void gve_handle_reset(struct gve_priv *priv) } } -/* Handle NIC status register changes and reset requests */ +void gve_handle_report_stats(struct gve_priv *priv) +{ + struct stats *stats = priv->stats_report->stats; + int idx, stats_idx = 0; + unsigned int start = 0; + u64 tx_bytes; + + if (!gve_get_report_stats(priv)) + return; + + be64_add_cpu(&priv->stats_report->written_count, 1); + /* tx stats */ + if (priv->tx) { + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + u32 last_completion = 0; + u32 tx_frames = 0; + + /* DQO doesn't currently support these metrics. */ + if (gve_is_gqi(priv)) { + last_completion = priv->tx[idx].done; + tx_frames = priv->tx[idx].req; + } + + do { + start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss); + tx_bytes = priv->tx[idx].bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start)); + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_WAKE_CNT), + .value = cpu_to_be64(priv->tx[idx].wake_queue), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_STOP_CNT), + .value = cpu_to_be64(priv->tx[idx].stop_queue), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_FRAMES_SENT), + .value = cpu_to_be64(tx_frames), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_BYTES_SENT), + .value = cpu_to_be64(tx_bytes), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED), + .value = cpu_to_be64(last_completion), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_TIMEOUT_CNT), + .value = cpu_to_be64(priv->tx[idx].queue_timeout), + .queue_id = cpu_to_be32(idx), + }; + } + } + /* rx stats */ + if (priv->rx) { + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE), + .value = cpu_to_be64(priv->rx[idx].desc.seqno), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(RX_BUFFERS_POSTED), + .value = cpu_to_be64(priv->rx[0].fill_cnt), + .queue_id = cpu_to_be32(idx), + }; + } + } +} + +static void gve_handle_link_status(struct gve_priv *priv, bool link_status) +{ + if (!gve_get_napi_enabled(priv)) + return; + + if (link_status == netif_carrier_ok(priv->dev)) + return; + + if (link_status) { + netdev_info(priv->dev, "Device link is up.\n"); + netif_carrier_on(priv->dev); + } else { + netdev_info(priv->dev, "Device link is down.\n"); + netif_carrier_off(priv->dev); + } +} + +/* Handle NIC status register changes, reset requests and report stats */ static void gve_service_task(struct work_struct *work) { struct gve_priv *priv = container_of(work, struct gve_priv, service_task); + u32 status = ioread32be(&priv->reg_bar0->device_status); - gve_handle_status(priv, - ioread32be(&priv->reg_bar0->device_status)); + gve_handle_status(priv, status); gve_handle_reset(priv); + gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) @@ -913,6 +1371,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) if (skip_describe_device) goto setup_device; + priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED; /* Get the initial information we need from the device */ err = gve_adminq_describe_device(priv); if (err) { @@ -920,14 +1379,6 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) "Could not get device information: err=%d\n", err); goto err; } - if (priv->dev->max_mtu > PAGE_SIZE) { - priv->dev->max_mtu = PAGE_SIZE; - err = gve_adminq_set_mtu(priv, priv->dev->mtu); - if (err) { - netif_err(priv, drv, priv->dev, "Could not set mtu"); - goto err; - } - } priv->dev->mtu = priv->dev->max_mtu; num_ntfy = pci_msix_vec_count(priv->pdev); if (num_ntfy <= 0) { @@ -964,10 +1415,15 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) priv->rx_cfg.num_queues); } - netif_info(priv, drv, priv->dev, "TX queues %d, RX queues %d\n", - priv->tx_cfg.num_queues, priv->rx_cfg.num_queues); - netif_info(priv, drv, priv->dev, "Max TX queues %d, Max RX queues %d\n", - priv->tx_cfg.max_queues, priv->rx_cfg.max_queues); + dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n", + priv->tx_cfg.num_queues, priv->rx_cfg.num_queues); + dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n", + priv->tx_cfg.max_queues, priv->rx_cfg.max_queues); + + if (!gve_is_gqi(priv)) { + priv->tx_coalesce_usecs = GVE_TX_IRQ_RATELIMIT_US_DQO; + priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO; + } setup_device: err = gve_setup_device_resources(priv); @@ -1047,6 +1503,10 @@ int gve_reset(struct gve_priv *priv, bool attempt_teardown) /* Set it all back up */ err = gve_reset_recovery(priv, was_up); gve_clear_reset_in_progress(priv); + priv->reset_cnt++; + priv->interface_up_cnt = 0; + priv->interface_down_cnt = 0; + priv->stats_report_trigger_cnt = 0; return err; } @@ -1078,7 +1538,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = pci_enable_device(pdev); if (err) - return -ENXIO; + return err; err = pci_request_regions(pdev, "gvnic-cfg"); if (err) @@ -1086,19 +1546,12 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (err) { dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err); goto abort_with_pci_region; } - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_err(&pdev->dev, - "Failed to set consistent dma mask: err=%d\n", err); - goto abort_with_pci_region; - } - reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0); if (!reg_bar) { dev_err(&pdev->dev, "Failed to map pci bar!\n"); @@ -1115,19 +1568,25 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gve_write_version(®_bar->driver_version); /* Get max queues to alloc etherdev */ - max_rx_queues = ioread32be(®_bar->max_tx_queues); - max_tx_queues = ioread32be(®_bar->max_rx_queues); + max_tx_queues = ioread32be(®_bar->max_tx_queues); + max_rx_queues = ioread32be(®_bar->max_rx_queues); /* Alloc and setup the netdev and priv */ dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues); if (!dev) { dev_err(&pdev->dev, "could not allocate netdev\n"); + err = -ENOMEM; goto abort_with_db_bar; } SET_NETDEV_DEV(dev, &pdev->dev); pci_set_drvdata(pdev, dev); dev->ethtool_ops = &gve_ethtool_ops; dev->netdev_ops = &gve_netdev_ops; - /* advertise features */ + + /* Set default and supported features. + * + * Features might be set in other locations as well (such as + * `gve_adminq_describe_device`). + */ dev->hw_features = NETIF_F_HIGHDMA; dev->hw_features |= NETIF_F_SG; dev->hw_features |= NETIF_F_HW_CSUM; @@ -1149,6 +1608,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) priv->db_bar2 = db_bar; priv->service_task_flags = 0x0; priv->state_flags = 0x0; + priv->ethtool_flags = 0x0; gve_set_probe_in_progress(priv); priv->gve_wq = alloc_ordered_workqueue("gve", 0); @@ -1158,6 +1618,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto abort_with_netdev; } INIT_WORK(&priv->service_task, gve_service_task); + INIT_WORK(&priv->stats_report_task, gve_stats_report_task); priv->tx_cfg.max_queues = max_tx_queues; priv->rx_cfg.max_queues = max_rx_queues; @@ -1167,13 +1628,17 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = register_netdev(dev); if (err) - goto abort_with_wq; + goto abort_with_gve_init; dev_info(&pdev->dev, "GVE version %s\n", gve_version_str); + dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format); gve_clear_probe_in_progress(priv); queue_work(priv->gve_wq, &priv->service_task); return 0; +abort_with_gve_init: + gve_teardown_priv_resources(priv); + abort_with_wq: destroy_workqueue(priv->gve_wq); @@ -1191,7 +1656,7 @@ abort_with_pci_region: abort_with_enabled: pci_disable_device(pdev); - return -ENXIO; + return err; } static void gve_remove(struct pci_dev *pdev) @@ -1211,6 +1676,58 @@ static void gve_remove(struct pci_dev *pdev) pci_disable_device(pdev); } +static void gve_shutdown(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct gve_priv *priv = netdev_priv(netdev); + bool was_up = netif_carrier_ok(priv->dev); + + rtnl_lock(); + if (was_up && gve_close(priv->dev)) { + /* If the dev was up, attempt to close, if close fails, reset */ + gve_reset_and_teardown(priv, was_up); + } else { + /* If the dev wasn't up or close worked, finish tearing down */ + gve_teardown_priv_resources(priv); + } + rtnl_unlock(); +} + +#ifdef CONFIG_PM +static int gve_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct gve_priv *priv = netdev_priv(netdev); + bool was_up = netif_carrier_ok(priv->dev); + + priv->suspend_cnt++; + rtnl_lock(); + if (was_up && gve_close(priv->dev)) { + /* If the dev was up, attempt to close, if close fails, reset */ + gve_reset_and_teardown(priv, was_up); + } else { + /* If the dev wasn't up or close worked, finish tearing down */ + gve_teardown_priv_resources(priv); + } + priv->up_before_suspend = was_up; + rtnl_unlock(); + return 0; +} + +static int gve_resume(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct gve_priv *priv = netdev_priv(netdev); + int err; + + priv->resume_cnt++; + rtnl_lock(); + err = gve_reset_recovery(priv, priv->up_before_suspend); + rtnl_unlock(); + return err; +} +#endif /* CONFIG_PM */ + static const struct pci_device_id gve_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) }, { } @@ -1221,6 +1738,11 @@ static struct pci_driver gvnic_driver = { .id_table = gve_id_table, .probe = gve_probe, .remove = gve_remove, + .shutdown = gve_shutdown, +#ifdef CONFIG_PM + .suspend = gve_suspend, + .resume = gve_resume, +#endif }; module_pci_driver(gvnic_driver); diff --git a/drivers/net/ethernet/google/gve/gve_register.h b/drivers/net/ethernet/google/gve/gve_register.h index 84ab8893aadd..fb655463c357 100644 --- a/drivers/net/ethernet/google/gve/gve_register.h +++ b/drivers/net/ethernet/google/gve/gve_register.h @@ -23,5 +23,6 @@ struct gve_registers { enum gve_device_status_flags { GVE_DEVICE_STATUS_RESET_MASK = BIT(1), GVE_DEVICE_STATUS_LINK_STATUS_MASK = BIT(2), + GVE_DEVICE_STATUS_REPORT_STATS_MASK = BIT(3), }; #endif /* _GVE_REGISTER_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index 9f52e72ff641..021bbf308d68 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -1,27 +1,51 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include "gve.h" #include "gve_adminq.h" +#include "gve_utils.h" #include <linux/etherdevice.h> -static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) +static void gve_rx_free_buffer(struct device *dev, + struct gve_rx_slot_page_info *page_info, + union gve_rx_data_slot *data_slot) { - struct gve_notify_block *block = - &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; + dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) & + GVE_DATA_SLOT_ADDR_PAGE_MASK); - block->rx = NULL; + page_ref_sub(page_info->page, page_info->pagecnt_bias - 1); + gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE); +} + +static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx) +{ + u32 slots = rx->mask + 1; + int i; + + if (rx->data.raw_addressing) { + for (i = 0; i < slots; i++) + gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i], + &rx->data.data_ring[i]); + } else { + for (i = 0; i < slots; i++) + page_ref_sub(rx->data.page_info[i].page, + rx->data.page_info[i].pagecnt_bias - 1); + gve_unassign_qpl(priv, rx->data.qpl->id); + rx->data.qpl = NULL; + } + kvfree(rx->data.page_info); + rx->data.page_info = NULL; } static void gve_rx_free_ring(struct gve_priv *priv, int idx) { struct gve_rx_ring *rx = &priv->rx[idx]; struct device *dev = &priv->pdev->dev; + u32 slots = rx->mask + 1; size_t bytes; - u32 slots; gve_rx_remove_from_block(priv, idx); @@ -33,11 +57,8 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx) rx->q_resources, rx->q_resources_bus); rx->q_resources = NULL; - gve_unassign_qpl(priv, rx->data.qpl->id); - rx->data.qpl = NULL; - kvfree(rx->data.page_info); + gve_rx_unfill_pages(priv, rx); - slots = rx->mask + 1; bytes = sizeof(*rx->data.data_ring) * slots; dma_free_coherent(dev, bytes, rx->data.data_ring, rx->data.data_bus); @@ -46,19 +67,39 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx) } static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info, - struct gve_rx_data_slot *slot, - dma_addr_t addr, struct page *page) + dma_addr_t addr, struct page *page, __be64 *slot_addr) { page_info->page = page; page_info->page_offset = 0; page_info->page_address = page_address(page); - slot->qpl_offset = cpu_to_be64(addr); + *slot_addr = cpu_to_be64(addr); + /* The page already has 1 ref */ + page_ref_add(page, INT_MAX - 1); + page_info->pagecnt_bias = INT_MAX; +} + +static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev, + struct gve_rx_slot_page_info *page_info, + union gve_rx_data_slot *data_slot) +{ + struct page *page; + dma_addr_t dma; + int err; + + err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE, + GFP_ATOMIC); + if (err) + return err; + + gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr); + return 0; } static int gve_prefill_rx_pages(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->gve; u32 slots; + int err; int i; /* Allocate one page per Rx queue slot. Each page is split into two @@ -71,27 +112,46 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx) if (!rx->data.page_info) return -ENOMEM; - rx->data.qpl = gve_assign_rx_qpl(priv); - + if (!rx->data.raw_addressing) { + rx->data.qpl = gve_assign_rx_qpl(priv); + if (!rx->data.qpl) { + kvfree(rx->data.page_info); + rx->data.page_info = NULL; + return -ENOMEM; + } + } for (i = 0; i < slots; i++) { - struct page *page = rx->data.qpl->pages[i]; - dma_addr_t addr = i * PAGE_SIZE; + if (!rx->data.raw_addressing) { + struct page *page = rx->data.qpl->pages[i]; + dma_addr_t addr = i * PAGE_SIZE; - gve_setup_rx_buffer(&rx->data.page_info[i], - &rx->data.data_ring[i], addr, page); + gve_setup_rx_buffer(&rx->data.page_info[i], addr, page, + &rx->data.data_ring[i].qpl_offset); + continue; + } + err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i], + &rx->data.data_ring[i]); + if (err) + goto alloc_err; } return slots; +alloc_err: + while (i--) + gve_rx_free_buffer(&priv->pdev->dev, + &rx->data.page_info[i], + &rx->data.data_ring[i]); + return err; } -static void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) +static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx) { - u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); - struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - struct gve_rx_ring *rx = &priv->rx[queue_idx]; - - block->rx = rx; - rx->ntfy_id = ntfy_idx; + ctx->curr_frag_cnt = 0; + ctx->total_expected_size = 0; + ctx->expected_frag_cnt = 0; + ctx->skb_head = NULL; + ctx->skb_tail = NULL; + ctx->reuse_frags = false; } static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) @@ -110,8 +170,9 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) rx->gve = priv; rx->q_num = idx; - slots = priv->rx_pages_per_qpl; + slots = priv->rx_data_slot_cnt; rx->mask = slots - 1; + rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT; /* alloc rx data ring */ bytes = sizeof(*rx->data.data_ring) * slots; @@ -156,9 +217,15 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) err = -ENOMEM; goto abort_with_q_resources; } - rx->mask = slots - 1; rx->cnt = 0; + rx->db_threshold = priv->rx_desc_cnt / 2; rx->desc.seqno = 1; + + /* Allocating half-page buffers allows page-flipping which is faster + * than copying or allocating new pages. + */ + rx->packet_buffer_size = PAGE_SIZE / 2; + gve_rx_ctx_clear(&rx->ctx); gve_rx_add_to_block(priv, idx); return 0; @@ -168,7 +235,7 @@ abort_with_q_resources: rx->q_resources, rx->q_resources_bus); rx->q_resources = NULL; abort_filled: - kvfree(rx->data.page_info); + gve_rx_unfill_pages(priv, rx); abort_with_slots: bytes = sizeof(*rx->data.data_ring) * slots; dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus); @@ -201,7 +268,7 @@ int gve_rx_alloc_rings(struct gve_priv *priv) return err; } -void gve_rx_free_rings(struct gve_priv *priv) +void gve_rx_free_rings_gqi(struct gve_priv *priv) { int i; @@ -225,144 +292,346 @@ static enum pkt_hash_types gve_rss_type(__be16 pkt_flags) return PKT_HASH_TYPE_L2; } -static struct sk_buff *gve_rx_copy(struct net_device *dev, - struct napi_struct *napi, - struct gve_rx_slot_page_info *page_info, - u16 len) +static u16 gve_rx_ctx_padding(struct gve_rx_ctx *ctx) { - struct sk_buff *skb = napi_alloc_skb(napi, len); - void *va = page_info->page_address + GVE_RX_PAD + - page_info->page_offset; + return (ctx->curr_frag_cnt == 0) ? GVE_RX_PAD : 0; +} - if (unlikely(!skb)) - return NULL; +static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, + u16 packet_buffer_size, u16 len, + struct gve_rx_ctx *ctx) +{ + u32 offset = page_info->page_offset + gve_rx_ctx_padding(ctx); + struct sk_buff *skb; + + if (!ctx->skb_head) + ctx->skb_head = napi_get_frags(napi); - __skb_put(skb, len); + if (unlikely(!ctx->skb_head)) + return NULL; - skb_copy_to_linear_data(skb, va, len); + skb = ctx->skb_head; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page_info->page, + offset, len, packet_buffer_size); - skb->protocol = eth_type_trans(skb, dev); return skb; } -static struct sk_buff *gve_rx_add_frags(struct net_device *dev, - struct napi_struct *napi, - struct gve_rx_slot_page_info *page_info, - u16 len) +static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) +{ + const __be64 offset = cpu_to_be64(PAGE_SIZE / 2); + + /* "flip" to other packet buffer on this page */ + page_info->page_offset ^= PAGE_SIZE / 2; + *(slot_addr) ^= offset; +} + +static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info) +{ + int pagecount = page_count(page_info->page); + + /* This page is not being used by any SKBs - reuse */ + if (pagecount == page_info->pagecnt_bias) + return 1; + /* This page is still being used by an SKB - we can't reuse */ + else if (pagecount > page_info->pagecnt_bias) + return 0; + WARN(pagecount < page_info->pagecnt_bias, + "Pagecount should never be less than the bias."); + return -1; +} + +static struct sk_buff * +gve_rx_raw_addressing(struct device *dev, struct net_device *netdev, + struct gve_rx_slot_page_info *page_info, u16 len, + struct napi_struct *napi, + union gve_rx_data_slot *data_slot, + u16 packet_buffer_size, struct gve_rx_ctx *ctx) { - struct sk_buff *skb = napi_get_frags(napi); + struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx); - if (unlikely(!skb)) + if (!skb) return NULL; - skb_add_rx_frag(skb, 0, page_info->page, - page_info->page_offset + - GVE_RX_PAD, len, PAGE_SIZE / 2); + /* Optimistically stop the kernel from freeing the page. + * We will check again in refill to determine if we need to alloc a + * new page. + */ + gve_dec_pagecnt_bias(page_info); return skb; } -static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, - struct gve_rx_data_slot *data_ring) +static struct sk_buff * +gve_rx_qpl(struct device *dev, struct net_device *netdev, + struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info, + u16 len, struct napi_struct *napi, + union gve_rx_data_slot *data_slot) { - u64 addr = be64_to_cpu(data_ring->qpl_offset); + struct gve_rx_ctx *ctx = &rx->ctx; + struct sk_buff *skb; - page_info->page_offset ^= PAGE_SIZE / 2; - addr ^= PAGE_SIZE / 2; - data_ring->qpl_offset = cpu_to_be64(addr); + /* if raw_addressing mode is not enabled gvnic can only receive into + * registered segments. If the buffer can't be recycled, our only + * choice is to copy the data out of it so that we can return it to the + * device. + */ + if (ctx->reuse_frags) { + skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx); + /* No point in recycling if we didn't get the skb */ + if (skb) { + /* Make sure that the page isn't freed. */ + gve_dec_pagecnt_bias(page_info); + gve_rx_flip_buff(page_info, &data_slot->qpl_offset); + } + } else { + const u16 padding = gve_rx_ctx_padding(ctx); + + skb = gve_rx_copy(netdev, napi, page_info, len, padding, ctx); + if (skb) { + u64_stats_update_begin(&rx->statss); + rx->rx_frag_copy_cnt++; + u64_stats_update_end(&rx->statss); + } + } + return skb; +} + +#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x)) +static u16 gve_rx_get_fragment_size(struct gve_rx_ctx *ctx, struct gve_rx_desc *desc) +{ + return be16_to_cpu(desc->len) - gve_rx_ctx_padding(ctx); } -static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc, - netdev_features_t feat, u32 idx) +static bool gve_rx_ctx_init(struct gve_rx_ctx *ctx, struct gve_rx_ring *rx) { + bool qpl_mode = !rx->data.raw_addressing, packet_size_error = false; + bool buffer_error = false, desc_error = false, seqno_error = false; struct gve_rx_slot_page_info *page_info; struct gve_priv *priv = rx->gve; - struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi; - struct net_device *dev = priv->dev; - struct sk_buff *skb; - int pagecount; - u16 len; + u32 idx = rx->cnt & rx->mask; + bool reuse_frags, can_flip; + struct gve_rx_desc *desc; + u16 packet_size = 0; + u16 n_frags = 0; + int recycle; - /* drop this packet */ - if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR)) - return true; + /** In QPL mode, we only flip buffers when all buffers containing the packet + * can be flipped. RDA can_flip decisions will be made later, per frag. + */ + can_flip = qpl_mode; + reuse_frags = can_flip; + do { + u16 frag_size; + + n_frags++; + desc = &rx->desc.desc_ring[idx]; + desc_error = unlikely(desc->flags_seq & GVE_RXF_ERR) || desc_error; + if (GVE_SEQNO(desc->flags_seq) != rx->desc.seqno) { + seqno_error = true; + netdev_warn(priv->dev, + "RX seqno error: want=%d, got=%d, dropping packet and scheduling reset.", + rx->desc.seqno, GVE_SEQNO(desc->flags_seq)); + } + frag_size = be16_to_cpu(desc->len); + packet_size += frag_size; + if (frag_size > rx->packet_buffer_size) { + packet_size_error = true; + netdev_warn(priv->dev, + "RX fragment error: packet_buffer_size=%d, frag_size=%d, dropping packet.", + rx->packet_buffer_size, be16_to_cpu(desc->len)); + } + page_info = &rx->data.page_info[idx]; + if (can_flip) { + recycle = gve_rx_can_recycle_buffer(page_info); + reuse_frags = reuse_frags && recycle > 0; + buffer_error = buffer_error || unlikely(recycle < 0); + } + idx = (idx + 1) & rx->mask; + rx->desc.seqno = gve_next_seqno(rx->desc.seqno); + } while (GVE_PKTCONT_BIT_IS_SET(desc->flags_seq)); - len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD; - page_info = &rx->data.page_info[idx]; - dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx], - PAGE_SIZE, DMA_FROM_DEVICE); + prefetch(rx->desc.desc_ring + idx); - /* gvnic can only receive into registered segments. If the buffer - * can't be recycled, our only choice is to copy the data out of - * it so that we can return it to the device. - */ + ctx->curr_frag_cnt = 0; + ctx->total_expected_size = packet_size - GVE_RX_PAD; + ctx->expected_frag_cnt = n_frags; + ctx->skb_head = NULL; + ctx->reuse_frags = reuse_frags; - if (PAGE_SIZE == 4096) { - if (len <= priv->rx_copybreak) { - /* Just copy small packets */ - skb = gve_rx_copy(dev, napi, page_info, len); - goto have_skb; - } - if (unlikely(!gve_can_recycle_pages(dev))) { - skb = gve_rx_copy(dev, napi, page_info, len); - goto have_skb; + if (ctx->expected_frag_cnt > 1) { + u64_stats_update_begin(&rx->statss); + rx->rx_cont_packet_cnt++; + u64_stats_update_end(&rx->statss); + } + if (ctx->total_expected_size > priv->rx_copybreak && !ctx->reuse_frags && qpl_mode) { + u64_stats_update_begin(&rx->statss); + rx->rx_copied_pkt++; + u64_stats_update_end(&rx->statss); + } + + if (unlikely(buffer_error || seqno_error || packet_size_error)) { + gve_schedule_reset(priv); + return false; + } + + if (unlikely(desc_error)) { + u64_stats_update_begin(&rx->statss); + rx->rx_desc_err_dropped_pkt++; + u64_stats_update_end(&rx->statss); + return false; + } + return true; +} + +static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_slot_page_info *page_info, struct napi_struct *napi, + u16 len, union gve_rx_data_slot *data_slot) +{ + struct net_device *netdev = priv->dev; + struct gve_rx_ctx *ctx = &rx->ctx; + struct sk_buff *skb = NULL; + + if (len <= priv->rx_copybreak && ctx->expected_frag_cnt == 1) { + /* Just copy small packets */ + skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD, ctx); + if (skb) { + u64_stats_update_begin(&rx->statss); + rx->rx_copied_pkt++; + rx->rx_frag_copy_cnt++; + rx->rx_copybreak_pkt++; + u64_stats_update_end(&rx->statss); } - pagecount = page_count(page_info->page); - if (pagecount == 1) { - /* No part of this page is used by any SKBs; we attach - * the page fragment to a new SKB and pass it up the - * stack. - */ - skb = gve_rx_add_frags(dev, napi, page_info, len); - if (!skb) - return true; - /* Make sure the kernel stack can't release the page */ - get_page(page_info->page); - /* "flip" to other packet buffer on this page */ - gve_rx_flip_buff(page_info, &rx->data.data_ring[idx]); - } else if (pagecount >= 2) { - /* We have previously passed the other half of this - * page up the stack, but it has not yet been freed. - */ - skb = gve_rx_copy(dev, napi, page_info, len); + } else { + if (rx->data.raw_addressing) { + int recycle = gve_rx_can_recycle_buffer(page_info); + + if (unlikely(recycle < 0)) { + gve_schedule_reset(priv); + return NULL; + } + page_info->can_flip = recycle; + if (page_info->can_flip) { + u64_stats_update_begin(&rx->statss); + rx->rx_frag_flip_cnt++; + u64_stats_update_end(&rx->statss); + } + skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev, + page_info, len, napi, + data_slot, + rx->packet_buffer_size, ctx); } else { - WARN(pagecount < 1, "Pagecount should never be < 1"); - return false; + if (ctx->reuse_frags) { + u64_stats_update_begin(&rx->statss); + rx->rx_frag_flip_cnt++; + u64_stats_update_end(&rx->statss); + } + skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx, + page_info, len, napi, data_slot); } - } else { - skb = gve_rx_copy(dev, napi, page_info, len); } + return skb; +} -have_skb: - /* We didn't manage to allocate an skb but we haven't had any - * reset worthy failures. - */ - if (!skb) - return true; +static bool gve_rx(struct gve_rx_ring *rx, netdev_features_t feat, + u64 *packet_size_bytes, u32 *work_done) +{ + struct gve_rx_slot_page_info *page_info; + struct gve_rx_ctx *ctx = &rx->ctx; + union gve_rx_data_slot *data_slot; + struct gve_priv *priv = rx->gve; + struct gve_rx_desc *first_desc; + struct sk_buff *skb = NULL; + struct gve_rx_desc *desc; + struct napi_struct *napi; + dma_addr_t page_bus; + u32 work_cnt = 0; + void *va; + u32 idx; + u16 len; + + idx = rx->cnt & rx->mask; + first_desc = &rx->desc.desc_ring[idx]; + desc = first_desc; + napi = &priv->ntfy_blocks[rx->ntfy_id].napi; + + if (unlikely(!gve_rx_ctx_init(ctx, rx))) + goto skb_alloc_fail; + + while (ctx->curr_frag_cnt < ctx->expected_frag_cnt) { + /* Prefetch two packet buffers ahead, we will need it soon. */ + page_info = &rx->data.page_info[(idx + 2) & rx->mask]; + va = page_info->page_address + page_info->page_offset; + + prefetch(page_info->page); /* Kernel page struct. */ + prefetch(va); /* Packet header. */ + prefetch(va + 64); /* Next cacheline too. */ + + len = gve_rx_get_fragment_size(ctx, desc); + + page_info = &rx->data.page_info[idx]; + data_slot = &rx->data.data_ring[idx]; + page_bus = rx->data.raw_addressing ? + be64_to_cpu(data_slot->addr) - page_info->page_offset : + rx->data.qpl->page_buses[idx]; + dma_sync_single_for_cpu(&priv->pdev->dev, page_bus, PAGE_SIZE, DMA_FROM_DEVICE); + + skb = gve_rx_skb(priv, rx, page_info, napi, len, data_slot); + if (!skb) { + u64_stats_update_begin(&rx->statss); + rx->rx_skb_alloc_fail++; + u64_stats_update_end(&rx->statss); + goto skb_alloc_fail; + } + + ctx->curr_frag_cnt++; + rx->cnt++; + idx = rx->cnt & rx->mask; + work_cnt++; + desc = &rx->desc.desc_ring[idx]; + } if (likely(feat & NETIF_F_RXCSUM)) { /* NIC passes up the partial sum */ - if (rx_desc->csum) + if (first_desc->csum) skb->ip_summed = CHECKSUM_COMPLETE; else skb->ip_summed = CHECKSUM_NONE; - skb->csum = csum_unfold(rx_desc->csum); + skb->csum = csum_unfold(first_desc->csum); } /* parse flags & pass relevant info up */ if (likely(feat & NETIF_F_RXHASH) && - gve_needs_rss(rx_desc->flags_seq)) - skb_set_hash(skb, be32_to_cpu(rx_desc->rss_hash), - gve_rss_type(rx_desc->flags_seq)); + gve_needs_rss(first_desc->flags_seq)) + skb_set_hash(skb, be32_to_cpu(first_desc->rss_hash), + gve_rss_type(first_desc->flags_seq)); + *packet_size_bytes = skb->len + (skb->protocol ? ETH_HLEN : 0); + *work_done = work_cnt; + skb_record_rx_queue(skb, rx->q_num); if (skb_is_nonlinear(skb)) napi_gro_frags(napi); else napi_gro_receive(napi, skb); + + gve_rx_ctx_clear(ctx); return true; + +skb_alloc_fail: + if (napi->skb) + napi_free_frags(napi); + *packet_size_bytes = 0; + *work_done = ctx->expected_frag_cnt; + while (ctx->curr_frag_cnt < ctx->expected_frag_cnt) { + rx->cnt++; + ctx->curr_frag_cnt++; + } + gve_rx_ctx_clear(ctx); + return false; } -static bool gve_rx_work_pending(struct gve_rx_ring *rx) +bool gve_rx_work_pending(struct gve_rx_ring *rx) { struct gve_rx_desc *desc; __be16 flags_seq; @@ -372,25 +641,82 @@ static bool gve_rx_work_pending(struct gve_rx_ring *rx) desc = rx->desc.desc_ring + next_idx; flags_seq = desc->flags_seq; - /* Make sure we have synchronized the seq no with the device */ - smp_rmb(); return (GVE_SEQNO(flags_seq) == rx->desc.seqno); } -bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, - netdev_features_t feat) +static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx) { + int refill_target = rx->mask + 1; + u32 fill_cnt = rx->fill_cnt; + + while (fill_cnt - rx->cnt < refill_target) { + struct gve_rx_slot_page_info *page_info; + u32 idx = fill_cnt & rx->mask; + + page_info = &rx->data.page_info[idx]; + if (page_info->can_flip) { + /* The other half of the page is free because it was + * free when we processed the descriptor. Flip to it. + */ + union gve_rx_data_slot *data_slot = + &rx->data.data_ring[idx]; + + gve_rx_flip_buff(page_info, &data_slot->addr); + page_info->can_flip = 0; + } else { + /* It is possible that the networking stack has already + * finished processing all outstanding packets in the buffer + * and it can be reused. + * Flipping is unnecessary here - if the networking stack still + * owns half the page it is impossible to tell which half. Either + * the whole page is free or it needs to be replaced. + */ + int recycle = gve_rx_can_recycle_buffer(page_info); + + if (recycle < 0) { + if (!rx->data.raw_addressing) + gve_schedule_reset(priv); + return false; + } + if (!recycle) { + /* We can't reuse the buffer - alloc a new one*/ + union gve_rx_data_slot *data_slot = + &rx->data.data_ring[idx]; + struct device *dev = &priv->pdev->dev; + gve_rx_free_buffer(dev, page_info, data_slot); + page_info->page = NULL; + if (gve_rx_alloc_buffer(priv, dev, page_info, + data_slot)) { + u64_stats_update_begin(&rx->statss); + rx->rx_buf_alloc_fail++; + u64_stats_update_end(&rx->statss); + break; + } + } + } + fill_cnt++; + } + rx->fill_cnt = fill_cnt; + return true; +} + +static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget, + netdev_features_t feat) +{ + u32 work_done = 0, total_packet_cnt = 0, ok_packet_cnt = 0; struct gve_priv *priv = rx->gve; + u32 idx = rx->cnt & rx->mask; struct gve_rx_desc *desc; - u32 cnt = rx->cnt; - u32 idx = cnt & rx->mask; - u32 work_done = 0; u64 bytes = 0; - desc = rx->desc.desc_ring + idx; + desc = &rx->desc.desc_ring[idx]; while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) && work_done < budget) { + u64 packet_size_bytes = 0; + u32 work_cnt = 0; + bool dropped; + netif_info(priv, rx_status, priv->dev, "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n", rx->q_num, idx, desc, desc->flags_seq); @@ -398,35 +724,57 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, "[%d] seqno=%d rx->desc.seqno=%d\n", rx->q_num, GVE_SEQNO(desc->flags_seq), rx->desc.seqno); - bytes += be16_to_cpu(desc->len) - GVE_RX_PAD; - if (!gve_rx(rx, desc, feat, idx)) - gve_schedule_reset(priv); - cnt++; - idx = cnt & rx->mask; - desc = rx->desc.desc_ring + idx; - rx->desc.seqno = gve_next_seqno(rx->desc.seqno); - work_done++; + + dropped = !gve_rx(rx, feat, &packet_size_bytes, &work_cnt); + if (!dropped) { + bytes += packet_size_bytes; + ok_packet_cnt++; + } + total_packet_cnt++; + idx = rx->cnt & rx->mask; + desc = &rx->desc.desc_ring[idx]; + work_done += work_cnt; } - if (!work_done) - return false; + if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold) + return 0; - u64_stats_update_begin(&rx->statss); - rx->rpackets += work_done; - rx->rbytes += bytes; - u64_stats_update_end(&rx->statss); - rx->cnt = cnt; - rx->fill_cnt += work_done; + if (work_done) { + u64_stats_update_begin(&rx->statss); + rx->rpackets += ok_packet_cnt; + rx->rbytes += bytes; + u64_stats_update_end(&rx->statss); + } + + /* restock ring slots */ + if (!rx->data.raw_addressing) { + /* In QPL mode buffs are refilled as the desc are processed */ + rx->fill_cnt += work_done; + } else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { + /* In raw addressing mode buffs are only refilled if the avail + * falls below a threshold. + */ + if (!gve_rx_refill_buffers(priv, rx)) + return 0; + + /* If we were not able to completely refill buffers, we'll want + * to schedule this queue for work again to refill buffers. + */ + if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { + gve_rx_write_doorbell(priv, rx); + return budget; + } + } gve_rx_write_doorbell(priv, rx); - return gve_rx_work_pending(rx); + return total_packet_cnt; } -bool gve_rx_poll(struct gve_notify_block *block, int budget) +int gve_rx_poll(struct gve_notify_block *block, int budget) { struct gve_rx_ring *rx = block->rx; netdev_features_t feat; - bool repoll = false; + int work_done = 0; feat = block->napi.dev->features; @@ -435,8 +783,7 @@ bool gve_rx_poll(struct gve_notify_block *block, int budget) budget = INT_MAX; if (budget > 0) - repoll |= gve_clean_rx_done(rx, budget, feat); - else - repoll |= gve_rx_work_pending(rx); - return repoll; + work_done = gve_clean_rx_done(rx, budget, feat); + + return work_done; } diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c new file mode 100644 index 000000000000..2e6461b0ea8b --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_dqo.h" +#include "gve_adminq.h" +#include "gve_utils.h" +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <net/ip6_checksum.h> +#include <net/ipv6.h> +#include <net/tcp.h> + +static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) +{ + return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; +} + +static void gve_free_page_dqo(struct gve_priv *priv, + struct gve_rx_buf_state_dqo *bs) +{ + page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); + gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, + DMA_FROM_DEVICE); + bs->page_info.page = NULL; +} + +static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = rx->dqo.free_buf_states; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from free list */ + rx->dqo.free_buf_states = buf_state->next; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + return buf_state->next == buffer_id; +} + +static void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = rx->dqo.free_buf_states; + rx->dqo.free_buf_states = buffer_id; +} + +static struct gve_rx_buf_state_dqo * +gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = list->head; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from list */ + list->head = buf_state->next; + if (buf_state->next == -1) + list->tail = -1; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +static void gve_enqueue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = -1; + + if (list->head == -1) { + list->head = buffer_id; + list->tail = buffer_id; + } else { + int tail = list->tail; + + rx->dqo.buf_states[tail].next = buffer_id; + list->tail = buffer_id; + } +} + +static struct gve_rx_buf_state_dqo * +gve_get_recycled_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + int i; + + /* Recycled buf states are immediately usable. */ + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); + if (likely(buf_state)) + return buf_state; + + if (unlikely(rx->dqo.used_buf_states.head == -1)) + return NULL; + + /* Used buf states are only usable when ref count reaches 0, which means + * no SKBs refer to them. + * + * Search a limited number before giving up. + */ + for (i = 0; i < 5; i++) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) + return buf_state; + + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + } + + /* If there are no free buf states discard an entry from + * `used_buf_states` so it can be used. + */ + if (unlikely(rx->dqo.free_buf_states == -1)) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) + return buf_state; + + gve_free_page_dqo(rx->gve, buf_state); + gve_free_buf_state(rx, buf_state); + } + + return NULL; +} + +static int gve_alloc_page_dqo(struct gve_priv *priv, + struct gve_rx_buf_state_dqo *buf_state) +{ + int err; + + err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page, + &buf_state->addr, DMA_FROM_DEVICE, GFP_ATOMIC); + if (err) + return err; + + buf_state->page_info.page_offset = 0; + buf_state->page_info.page_address = + page_address(buf_state->page_info.page); + buf_state->last_single_ref_offset = 0; + + /* The page already has 1 ref. */ + page_ref_add(buf_state->page_info.page, INT_MAX - 1); + buf_state->page_info.pagecnt_bias = INT_MAX; + + return 0; +} + +static void gve_rx_free_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_rx_ring *rx = &priv->rx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t completion_queue_slots; + size_t buffer_queue_slots; + size_t size; + int i; + + completion_queue_slots = rx->dqo.complq.mask + 1; + buffer_queue_slots = rx->dqo.bufq.mask + 1; + + gve_rx_remove_from_block(priv, idx); + + if (rx->q_resources) { + dma_free_coherent(hdev, sizeof(*rx->q_resources), + rx->q_resources, rx->q_resources_bus); + rx->q_resources = NULL; + } + + for (i = 0; i < rx->dqo.num_buf_states; i++) { + struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; + + if (bs->page_info.page) + gve_free_page_dqo(priv, bs); + } + + if (rx->dqo.bufq.desc_ring) { + size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots; + dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring, + rx->dqo.bufq.bus); + rx->dqo.bufq.desc_ring = NULL; + } + + if (rx->dqo.complq.desc_ring) { + size = sizeof(rx->dqo.complq.desc_ring[0]) * + completion_queue_slots; + dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring, + rx->dqo.complq.bus); + rx->dqo.complq.desc_ring = NULL; + } + + kvfree(rx->dqo.buf_states); + rx->dqo.buf_states = NULL; + + netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); +} + +static int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_rx_ring *rx = &priv->rx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t size; + int i; + + const u32 buffer_queue_slots = + priv->options_dqo_rda.rx_buff_ring_entries; + const u32 completion_queue_slots = priv->rx_desc_cnt; + + netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n"); + + memset(rx, 0, sizeof(*rx)); + rx->gve = priv; + rx->q_num = idx; + rx->dqo.bufq.mask = buffer_queue_slots - 1; + rx->dqo.complq.num_free_slots = completion_queue_slots; + rx->dqo.complq.mask = completion_queue_slots - 1; + rx->ctx.skb_head = NULL; + rx->ctx.skb_tail = NULL; + + rx->dqo.num_buf_states = min_t(s16, S16_MAX, buffer_queue_slots * 4); + rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, + sizeof(rx->dqo.buf_states[0]), + GFP_KERNEL); + if (!rx->dqo.buf_states) + return -ENOMEM; + + /* Set up linked list of buffer IDs */ + for (i = 0; i < rx->dqo.num_buf_states - 1; i++) + rx->dqo.buf_states[i].next = i + 1; + + rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1; + rx->dqo.recycled_buf_states.head = -1; + rx->dqo.recycled_buf_states.tail = -1; + rx->dqo.used_buf_states.head = -1; + rx->dqo.used_buf_states.tail = -1; + + /* Allocate RX completion queue */ + size = sizeof(rx->dqo.complq.desc_ring[0]) * + completion_queue_slots; + rx->dqo.complq.desc_ring = + dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL); + if (!rx->dqo.complq.desc_ring) + goto err; + + /* Allocate RX buffer queue */ + size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots; + rx->dqo.bufq.desc_ring = + dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL); + if (!rx->dqo.bufq.desc_ring) + goto err; + + rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources), + &rx->q_resources_bus, GFP_KERNEL); + if (!rx->q_resources) + goto err; + + gve_rx_add_to_block(priv, idx); + + return 0; + +err: + gve_rx_free_ring_dqo(priv, idx); + return -ENOMEM; +} + +void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx) +{ + const struct gve_rx_ring *rx = &priv->rx[queue_idx]; + u64 index = be32_to_cpu(rx->q_resources->db_index); + + iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]); +} + +int gve_rx_alloc_rings_dqo(struct gve_priv *priv) +{ + int err = 0; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + err = gve_rx_alloc_ring_dqo(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to alloc rx ring=%d: err=%d\n", + i, err); + goto err; + } + } + + return 0; + +err: + for (i--; i >= 0; i--) + gve_rx_free_ring_dqo(priv, i); + + return err; +} + +void gve_rx_free_rings_dqo(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_free_ring_dqo(priv, i); +} + +void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) +{ + struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; + struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; + struct gve_priv *priv = rx->gve; + u32 num_avail_slots; + u32 num_full_slots; + u32 num_posted = 0; + + num_full_slots = (bufq->tail - bufq->head) & bufq->mask; + num_avail_slots = bufq->mask - num_full_slots; + + num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots); + while (num_posted < num_avail_slots) { + struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail]; + struct gve_rx_buf_state_dqo *buf_state; + + buf_state = gve_get_recycled_buf_state(rx); + if (unlikely(!buf_state)) { + buf_state = gve_alloc_buf_state(rx); + if (unlikely(!buf_state)) + break; + + if (unlikely(gve_alloc_page_dqo(priv, buf_state))) { + u64_stats_update_begin(&rx->statss); + rx->rx_buf_alloc_fail++; + u64_stats_update_end(&rx->statss); + gve_free_buf_state(rx, buf_state); + break; + } + } + + desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); + desc->buf_addr = cpu_to_le64(buf_state->addr + + buf_state->page_info.page_offset); + + bufq->tail = (bufq->tail + 1) & bufq->mask; + complq->num_free_slots--; + num_posted++; + + if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) + gve_rx_write_doorbell_dqo(priv, rx->q_num); + } + + rx->fill_cnt += num_posted; +} + +static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + const int data_buffer_size = priv->data_buffer_size_dqo; + int pagecount; + + /* Can't reuse if we only fit one buffer per page */ + if (data_buffer_size * 2 > PAGE_SIZE) + goto mark_used; + + pagecount = gve_buf_ref_cnt(buf_state); + + /* Record the offset when we have a single remaining reference. + * + * When this happens, we know all of the other offsets of the page are + * usable. + */ + if (pagecount == 1) { + buf_state->last_single_ref_offset = + buf_state->page_info.page_offset; + } + + /* Use the next buffer sized chunk in the page. */ + buf_state->page_info.page_offset += data_buffer_size; + buf_state->page_info.page_offset &= (PAGE_SIZE - 1); + + /* If we wrap around to the same offset without ever dropping to 1 + * reference, then we don't know if this offset was ever freed. + */ + if (buf_state->page_info.page_offset == + buf_state->last_single_ref_offset) { + goto mark_used; + } + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return; + +mark_used: + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); +} + +static void gve_rx_skb_csum(struct sk_buff *skb, + const struct gve_rx_compl_desc_dqo *desc, + struct gve_ptype ptype) +{ + skb->ip_summed = CHECKSUM_NONE; + + /* HW did not identify and process L3 and L4 headers. */ + if (unlikely(!desc->l3_l4_processed)) + return; + + if (ptype.l3_type == GVE_L3_TYPE_IPV4) { + if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err)) + return; + } else if (ptype.l3_type == GVE_L3_TYPE_IPV6) { + /* Checksum should be skipped if this flag is set. */ + if (unlikely(desc->ipv6_ex_add)) + return; + } + + if (unlikely(desc->csum_l4_err)) + return; + + switch (ptype.l4_type) { + case GVE_L4_TYPE_TCP: + case GVE_L4_TYPE_UDP: + case GVE_L4_TYPE_ICMP: + case GVE_L4_TYPE_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + default: + break; + } +} + +static void gve_rx_skb_hash(struct sk_buff *skb, + const struct gve_rx_compl_desc_dqo *compl_desc, + struct gve_ptype ptype) +{ + enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2; + + if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN) + hash_type = PKT_HASH_TYPE_L4; + else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN) + hash_type = PKT_HASH_TYPE_L3; + + skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type); +} + +static void gve_rx_free_skb(struct gve_rx_ring *rx) +{ + if (!rx->ctx.skb_head) + return; + + dev_kfree_skb_any(rx->ctx.skb_head); + rx->ctx.skb_head = NULL; + rx->ctx.skb_tail = NULL; +} + +/* Chains multi skbs for single rx packet. + * Returns 0 if buffer is appended, -1 otherwise. + */ +static int gve_rx_append_frags(struct napi_struct *napi, + struct gve_rx_buf_state_dqo *buf_state, + u16 buf_len, struct gve_rx_ring *rx, + struct gve_priv *priv) +{ + int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags; + + if (unlikely(num_frags == MAX_SKB_FRAGS)) { + struct sk_buff *skb; + + skb = napi_alloc_skb(napi, 0); + if (!skb) + return -1; + + skb_shinfo(rx->ctx.skb_tail)->frag_list = skb; + rx->ctx.skb_tail = skb; + num_frags = 0; + } + if (rx->ctx.skb_tail != rx->ctx.skb_head) { + rx->ctx.skb_head->len += buf_len; + rx->ctx.skb_head->data_len += buf_len; + rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo; + } + + skb_add_rx_frag(rx->ctx.skb_tail, num_frags, + buf_state->page_info.page, + buf_state->page_info.page_offset, + buf_len, priv->data_buffer_size_dqo); + gve_dec_pagecnt_bias(&buf_state->page_info); + + return 0; +} + +/* Returns 0 if descriptor is completed successfully. + * Returns -EINVAL if descriptor is invalid. + * Returns -ENOMEM if data cannot be copied to skb. + */ +static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, + const struct gve_rx_compl_desc_dqo *compl_desc, + int queue_idx) +{ + const u16 buffer_id = le16_to_cpu(compl_desc->buf_id); + const bool eop = compl_desc->end_of_packet != 0; + struct gve_rx_buf_state_dqo *buf_state; + struct gve_priv *priv = rx->gve; + u16 buf_len; + + if (unlikely(buffer_id >= rx->dqo.num_buf_states)) { + net_err_ratelimited("%s: Invalid RX buffer_id=%u\n", + priv->dev->name, buffer_id); + return -EINVAL; + } + buf_state = &rx->dqo.buf_states[buffer_id]; + if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) { + net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n", + priv->dev->name, buffer_id); + return -EINVAL; + } + + if (unlikely(compl_desc->rx_error)) { + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + return -EINVAL; + } + + buf_len = compl_desc->packet_len; + + /* Page might have not been used for awhile and was likely last written + * by a different thread. + */ + prefetch(buf_state->page_info.page); + + /* Sync the portion of dma buffer for CPU to read. */ + dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr, + buf_state->page_info.page_offset, + buf_len, DMA_FROM_DEVICE); + + /* Append to current skb if one exists. */ + if (rx->ctx.skb_head) { + if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx, + priv)) != 0) { + goto error; + } + + gve_try_recycle_buf(priv, rx, buf_state); + return 0; + } + + if (eop && buf_len <= priv->rx_copybreak) { + rx->ctx.skb_head = gve_rx_copy(priv->dev, napi, + &buf_state->page_info, buf_len, 0, NULL); + if (unlikely(!rx->ctx.skb_head)) + goto error; + rx->ctx.skb_tail = rx->ctx.skb_head; + + u64_stats_update_begin(&rx->statss); + rx->rx_copied_pkt++; + rx->rx_copybreak_pkt++; + u64_stats_update_end(&rx->statss); + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + return 0; + } + + rx->ctx.skb_head = napi_get_frags(napi); + if (unlikely(!rx->ctx.skb_head)) + goto error; + rx->ctx.skb_tail = rx->ctx.skb_head; + + skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page, + buf_state->page_info.page_offset, buf_len, + priv->data_buffer_size_dqo); + gve_dec_pagecnt_bias(&buf_state->page_info); + + gve_try_recycle_buf(priv, rx, buf_state); + return 0; + +error: + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return -ENOMEM; +} + +static int gve_rx_complete_rsc(struct sk_buff *skb, + const struct gve_rx_compl_desc_dqo *desc, + struct gve_ptype ptype) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Only TCP is supported right now. */ + if (ptype.l4_type != GVE_L4_TYPE_TCP) + return -EINVAL; + + switch (ptype.l3_type) { + case GVE_L3_TYPE_IPV4: + shinfo->gso_type = SKB_GSO_TCPV4; + break; + case GVE_L3_TYPE_IPV6: + shinfo->gso_type = SKB_GSO_TCPV6; + break; + default: + return -EINVAL; + } + + shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len); + return 0; +} + +/* Returns 0 if skb is completed successfully, -1 otherwise. */ +static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi, + const struct gve_rx_compl_desc_dqo *desc, + netdev_features_t feat) +{ + struct gve_ptype ptype = + rx->gve->ptype_lut_dqo->ptypes[desc->packet_type]; + int err; + + skb_record_rx_queue(rx->ctx.skb_head, rx->q_num); + + if (feat & NETIF_F_RXHASH) + gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype); + + if (feat & NETIF_F_RXCSUM) + gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype); + + /* RSC packets must set gso_size otherwise the TCP stack will complain + * that packets are larger than MTU. + */ + if (desc->rsc) { + err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype); + if (err < 0) + return err; + } + + if (skb_headlen(rx->ctx.skb_head) == 0) + napi_gro_frags(napi); + else + napi_gro_receive(napi, rx->ctx.skb_head); + + return 0; +} + +int gve_rx_poll_dqo(struct gve_notify_block *block, int budget) +{ + struct napi_struct *napi = &block->napi; + netdev_features_t feat = napi->dev->features; + + struct gve_rx_ring *rx = block->rx; + struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; + + u32 work_done = 0; + u64 bytes = 0; + int err; + + while (work_done < budget) { + struct gve_rx_compl_desc_dqo *compl_desc = + &complq->desc_ring[complq->head]; + u32 pkt_bytes; + + /* No more new packets */ + if (compl_desc->generation == complq->cur_gen_bit) + break; + + /* Prefetch the next two descriptors. */ + prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]); + prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]); + + /* Do not read data until we own the descriptor */ + dma_rmb(); + + err = gve_rx_dqo(napi, rx, compl_desc, rx->q_num); + if (err < 0) { + gve_rx_free_skb(rx); + u64_stats_update_begin(&rx->statss); + if (err == -ENOMEM) + rx->rx_skb_alloc_fail++; + else if (err == -EINVAL) + rx->rx_desc_err_dropped_pkt++; + u64_stats_update_end(&rx->statss); + } + + complq->head = (complq->head + 1) & complq->mask; + complq->num_free_slots++; + + /* When the ring wraps, the generation bit is flipped. */ + complq->cur_gen_bit ^= (complq->head == 0); + + /* Receiving a completion means we have space to post another + * buffer on the buffer queue. + */ + { + struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; + + bufq->head = (bufq->head + 1) & bufq->mask; + } + + /* Free running counter of completed descriptors */ + rx->cnt++; + + if (!rx->ctx.skb_head) + continue; + + if (!compl_desc->end_of_packet) + continue; + + work_done++; + pkt_bytes = rx->ctx.skb_head->len; + /* The ethernet header (first ETH_HLEN bytes) is snipped off + * by eth_type_trans. + */ + if (skb_headlen(rx->ctx.skb_head)) + pkt_bytes += ETH_HLEN; + + /* gve_rx_complete_skb() will consume skb if successful */ + if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) { + gve_rx_free_skb(rx); + u64_stats_update_begin(&rx->statss); + rx->rx_desc_err_dropped_pkt++; + u64_stats_update_end(&rx->statss); + continue; + } + + bytes += pkt_bytes; + rx->ctx.skb_head = NULL; + rx->ctx.skb_tail = NULL; + } + + gve_rx_post_buffers_dqo(rx); + + u64_stats_update_begin(&rx->statss); + rx->rpackets += work_done; + rx->rbytes += bytes; + u64_stats_update_end(&rx->statss); + + return work_done; +} diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index d0244feb0301..4888bf05fbed 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include "gve.h" #include "gve_adminq.h" +#include "gve_utils.h" #include <linux/ip.h> #include <linux/tcp.h> #include <linux/vmalloc.h> @@ -131,14 +132,6 @@ static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) atomic_add(bytes, &fifo->available); } -static void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) -{ - struct gve_notify_block *block = - &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; - - block->tx = NULL; -} - static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, u32 to_do, bool try_to_wake); @@ -151,16 +144,18 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx) gve_tx_remove_from_block(priv, idx); slots = tx->mask + 1; - gve_clean_tx_done(priv, tx, tx->req, false); + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); netdev_tx_reset_queue(tx->netdev_txq); dma_free_coherent(hdev, sizeof(*tx->q_resources), tx->q_resources, tx->q_resources_bus); tx->q_resources = NULL; - gve_tx_fifo_release(priv, &tx->tx_fifo); - gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); - tx->tx_fifo.qpl = NULL; + if (!tx->raw_addressing) { + gve_tx_fifo_release(priv, &tx->tx_fifo); + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); + tx->tx_fifo.qpl = NULL; + } bytes = sizeof(*tx->desc) * slots; dma_free_coherent(hdev, bytes, tx->desc, tx->bus); @@ -172,16 +167,6 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx) netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); } -static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) -{ - int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); - struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - struct gve_tx_ring *tx = &priv->tx[queue_idx]; - - block->tx = tx; - tx->ntfy_id = ntfy_idx; -} - static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) { struct gve_tx_ring *tx = &priv->tx[idx]; @@ -191,6 +176,7 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) /* Make sure everything is zeroed to start */ memset(tx, 0, sizeof(*tx)); + spin_lock_init(&tx->clean_lock); tx->q_num = idx; tx->mask = slots - 1; @@ -206,11 +192,16 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) if (!tx->desc) goto abort_with_info; - tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); - - /* map Tx FIFO */ - if (gve_tx_fifo_init(priv, &tx->tx_fifo)) - goto abort_with_desc; + tx->raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT; + tx->dev = &priv->pdev->dev; + if (!tx->raw_addressing) { + tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); + if (!tx->tx_fifo.qpl) + goto abort_with_desc; + /* map Tx FIFO */ + if (gve_tx_fifo_init(priv, &tx->tx_fifo)) + goto abort_with_qpl; + } tx->q_resources = dma_alloc_coherent(hdev, @@ -228,7 +219,11 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) return 0; abort_with_fifo: - gve_tx_fifo_release(priv, &tx->tx_fifo); + if (!tx->raw_addressing) + gve_tx_fifo_release(priv, &tx->tx_fifo); +abort_with_qpl: + if (!tx->raw_addressing) + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); abort_with_desc: dma_free_coherent(hdev, bytes, tx->desc, tx->bus); tx->desc = NULL; @@ -262,7 +257,7 @@ int gve_tx_alloc_rings(struct gve_priv *priv) return err; } -void gve_tx_free_rings(struct gve_priv *priv) +void gve_tx_free_rings_gqi(struct gve_priv *priv) { int i; @@ -301,53 +296,81 @@ static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx, return bytes; } -/* The most descriptors we could need are 3 - 1 for the headers, 1 for - * the beginning of the payload at the end of the FIFO, and 1 if the - * payload wraps to the beginning of the FIFO. +/* The most descriptors we could need is MAX_SKB_FRAGS + 4 : + * 1 for each skb frag + * 1 for the skb linear portion + * 1 for when tcp hdr needs to be in separate descriptor + * 1 if the payload wraps to the beginning of the FIFO + * 1 for metadata descriptor */ -#define MAX_TX_DESC_NEEDED 3 +#define MAX_TX_DESC_NEEDED (MAX_SKB_FRAGS + 4) +static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info) +{ + if (info->skb) { + dma_unmap_single(dev, dma_unmap_addr(info, dma), + dma_unmap_len(info, len), + DMA_TO_DEVICE); + dma_unmap_len_set(info, len, 0); + } else { + dma_unmap_page(dev, dma_unmap_addr(info, dma), + dma_unmap_len(info, len), + DMA_TO_DEVICE); + dma_unmap_len_set(info, len, 0); + } +} /* Check if sufficient resources (descriptor ring space, FIFO space) are * available to transmit the given number of bytes. */ static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) { - return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && - gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required)); + bool can_alloc = true; + + if (!tx->raw_addressing) + can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required); + + return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc); } +static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED); + /* Stops the queue if the skb cannot be transmitted. */ -static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb) +static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) { - int bytes_required; + int bytes_required = 0; + u32 nic_done; + u32 to_do; + int ret; + + if (!tx->raw_addressing) + bytes_required = gve_skb_fifo_bytes_required(tx, skb); - bytes_required = gve_skb_fifo_bytes_required(tx, skb); if (likely(gve_can_tx(tx, bytes_required))) return 0; - /* No space, so stop the queue */ - tx->stop_queue++; - netif_tx_stop_queue(tx->netdev_txq); - smp_mb(); /* sync with restarting queue in gve_clean_tx_done() */ - - /* Now check for resources again, in case gve_clean_tx_done() freed - * resources after we checked and we stopped the queue after - * gve_clean_tx_done() checked. - * - * gve_maybe_stop_tx() gve_clean_tx_done() - * nsegs/can_alloc test failed - * gve_tx_free_fifo() - * if (tx queue stopped) - * netif_tx_queue_wake() - * netif_tx_stop_queue() - * Need to check again for space here! - */ - if (likely(!gve_can_tx(tx, bytes_required))) - return -EBUSY; + ret = -EBUSY; + spin_lock(&tx->clean_lock); + nic_done = gve_tx_load_event_counter(priv, tx); + to_do = nic_done - tx->done; - netif_tx_start_queue(tx->netdev_txq); - tx->wake_queue++; - return 0; + /* Only try to clean if there is hope for TX */ + if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) { + if (to_do > 0) { + to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT); + gve_clean_tx_done(priv, tx, to_do, false); + } + if (likely(gve_can_tx(tx, bytes_required))) + ret = 0; + } + if (ret) { + /* No space, so stop the queue */ + tx->stop_queue++; + netif_tx_stop_queue(tx->netdev_txq); + } + spin_unlock(&tx->clean_lock); + + return ret; } static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc, @@ -375,6 +398,19 @@ static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc, pkt_desc->pkt.seg_addr = cpu_to_be64(addr); } +static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc, + struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt)); + + mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; + mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT | + GVE_MTD_PATH_HASH_L4; + mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash); + mtd_desc->mtd.reserved0 = 0; + mtd_desc->mtd.reserved1 = 0; +} + static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc, struct sk_buff *skb, bool is_gso, u16 len, u64 addr) @@ -395,21 +431,18 @@ static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses, { u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; u64 first_page = iov_offset / PAGE_SIZE; - dma_addr_t dma; u64 page; - for (page = first_page; page <= last_page; page++) { - dma = page_buses[page]; - dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE); - } + for (page = first_page; page <= last_page; page++) + dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE); } -static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, - struct device *dev) +static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb) { int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset; union gve_tx_desc *pkt_desc, *seg_desc; struct gve_tx_buffer_state *info; + int mtd_desc_nr = !!skb->l4_hash; bool is_gso = skb_is_gso(skb); u32 idx = tx->req & tx->mask; int payload_iov = 2; @@ -441,19 +474,24 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, &info->iov[payload_iov]); gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, - 1 + payload_nfrags, hlen, + 1 + mtd_desc_nr + payload_nfrags, hlen, info->iov[hdr_nfrags - 1].iov_offset); skb_copy_bits(skb, 0, tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset, hlen); - gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, + gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses, info->iov[hdr_nfrags - 1].iov_offset, info->iov[hdr_nfrags - 1].iov_len); copy_offset = hlen; + if (mtd_desc_nr) { + next_idx = (tx->req + 1) & tx->mask; + gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb); + } + for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { - next_idx = (tx->req + 1 + i - payload_iov) & tx->mask; + next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; seg_desc = &tx->desc[next_idx]; gve_tx_fill_seg_desc(seg_desc, skb, is_gso, @@ -463,13 +501,109 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, skb_copy_bits(skb, copy_offset, tx->tx_fifo.base + info->iov[i].iov_offset, info->iov[i].iov_len); - gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, + gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses, info->iov[i].iov_offset, info->iov[i].iov_len); copy_offset += info->iov[i].iov_len; } - return 1 + payload_nfrags; + return 1 + mtd_desc_nr + payload_nfrags; +} + +static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + int hlen, num_descriptors, l4_hdr_offset; + union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc; + struct gve_tx_buffer_state *info; + int mtd_desc_nr = !!skb->l4_hash; + bool is_gso = skb_is_gso(skb); + u32 idx = tx->req & tx->mask; + u64 addr; + u32 len; + int i; + + info = &tx->info[idx]; + pkt_desc = &tx->desc[idx]; + + l4_hdr_offset = skb_checksum_start_offset(skb); + /* If the skb is gso, then we want only up to the tcp header in the first segment + * to efficiently replicate on each segment otherwise we want the linear portion + * of the skb (which will contain the checksum because skb->csum_start and + * skb->csum_offset are given relative to skb->head) in the first segment. + */ + hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb); + len = skb_headlen(skb); + + info->skb = skb; + + addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) { + tx->dma_mapping_error++; + goto drop; + } + dma_unmap_len_set(info, len, len); + dma_unmap_addr_set(info, dma, addr); + + num_descriptors = 1 + shinfo->nr_frags; + if (hlen < len) + num_descriptors++; + if (mtd_desc_nr) + num_descriptors++; + + gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, + num_descriptors, hlen, addr); + + if (mtd_desc_nr) { + idx = (idx + 1) & tx->mask; + mtd_desc = &tx->desc[idx]; + gve_tx_fill_mtd_desc(mtd_desc, skb); + } + + if (hlen < len) { + /* For gso the rest of the linear portion of the skb needs to + * be in its own descriptor. + */ + len -= hlen; + addr += hlen; + idx = (idx + 1) & tx->mask; + seg_desc = &tx->desc[idx]; + gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + const skb_frag_t *frag = &shinfo->frags[i]; + + idx = (idx + 1) & tx->mask; + seg_desc = &tx->desc[idx]; + len = skb_frag_size(frag); + addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) { + tx->dma_mapping_error++; + goto unmap_drop; + } + tx->info[idx].skb = NULL; + dma_unmap_len_set(&tx->info[idx], len, len); + dma_unmap_addr_set(&tx->info[idx], dma, addr); + + gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr); + } + + return num_descriptors; + +unmap_drop: + i += num_descriptors - shinfo->nr_frags; + while (i--) { + /* Skip metadata descriptor, if set */ + if (i == 1 && mtd_desc_nr == 1) + continue; + idx--; + gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]); + } +drop: + tx->dropped_pkt++; + return 0; } netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) @@ -478,10 +612,10 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) struct gve_tx_ring *tx; int nsegs; - WARN(skb_get_queue_mapping(skb) > priv->tx_cfg.num_queues, + WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues, "skb queue index out of range"); tx = &priv->tx[skb_get_queue_mapping(skb)]; - if (unlikely(gve_maybe_stop_tx(tx, skb))) { + if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) { /* We need to ring the txq doorbell -- we have stopped the Tx * queue for want of resources, but prior calls to gve_tx() * may have added descriptors without ringing the doorbell. @@ -490,17 +624,26 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) gve_tx_put_doorbell(priv, tx->q_resources, tx->req); return NETDEV_TX_BUSY; } - nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev); - - netdev_tx_sent_queue(tx->netdev_txq, skb->len); - skb_tx_timestamp(skb); - - /* give packets to NIC */ - tx->req += nsegs; + if (tx->raw_addressing) + nsegs = gve_tx_add_skb_no_copy(priv, tx, skb); + else + nsegs = gve_tx_add_skb_copy(priv, tx, skb); + + /* If the packet is getting sent, we need to update the skb */ + if (nsegs) { + netdev_tx_sent_queue(tx->netdev_txq, skb->len); + skb_tx_timestamp(skb); + tx->req += nsegs; + } else { + dev_kfree_skb_any(skb); + } if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) return NETDEV_TX_OK; + /* Give packets to NIC. Even if this packet failed to send the doorbell + * might need to be rung because of xmit_more. + */ gve_tx_put_doorbell(priv, tx->q_resources, tx->req); return NETDEV_TX_OK; } @@ -525,24 +668,29 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, info = &tx->info[idx]; skb = info->skb; + /* Unmap the buffer */ + if (tx->raw_addressing) + gve_tx_unmap_buf(tx->dev, info); + tx->done++; /* Mark as free */ if (skb) { info->skb = NULL; bytes += skb->len; pkts++; dev_consume_skb_any(skb); + if (tx->raw_addressing) + continue; /* FIFO free */ for (i = 0; i < ARRAY_SIZE(info->iov); i++) { - space_freed += info->iov[i].iov_len + - info->iov[i].iov_padding; + space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } } - tx->done++; } - gve_tx_free_fifo(&tx->tx_fifo, space_freed); + if (!tx->raw_addressing) + gve_tx_free_fifo(&tx->tx_fifo, space_freed); u64_stats_update_begin(&tx->statss); tx->bytes_done += bytes; tx->pkt_done += pkts; @@ -563,19 +711,19 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, return pkts; } -__be32 gve_tx_load_event_counter(struct gve_priv *priv, - struct gve_tx_ring *tx) +u32 gve_tx_load_event_counter(struct gve_priv *priv, + struct gve_tx_ring *tx) { - u32 counter_index = be32_to_cpu((tx->q_resources->counter_index)); + u32 counter_index = be32_to_cpu(tx->q_resources->counter_index); + __be32 counter = READ_ONCE(priv->counter_array[counter_index]); - return READ_ONCE(priv->counter_array[counter_index]); + return be32_to_cpu(counter); } bool gve_tx_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; - bool repoll = false; u32 nic_done; u32 to_do; @@ -583,17 +731,23 @@ bool gve_tx_poll(struct gve_notify_block *block, int budget) if (budget == 0) budget = INT_MAX; + /* In TX path, it may try to clean completed pkts in order to xmit, + * to avoid cleaning conflict, use spin_lock(), it yields better + * concurrency between xmit/clean than netif's lock. + */ + spin_lock(&tx->clean_lock); /* Find out how much work there is to be done */ - tx->last_nic_done = gve_tx_load_event_counter(priv, tx); - nic_done = be32_to_cpu(tx->last_nic_done); - if (budget > 0) { - /* Do as much work as we have that the budget will - * allow - */ - to_do = min_t(u32, (nic_done - tx->done), budget); - gve_clean_tx_done(priv, tx, to_do, true); - } + nic_done = gve_tx_load_event_counter(priv, tx); + to_do = min_t(u32, (nic_done - tx->done), budget); + gve_clean_tx_done(priv, tx, to_do, true); + spin_unlock(&tx->clean_lock); /* If we still have work we want to repoll */ - repoll |= (nic_done != tx->done); - return repoll; + return nic_done != tx->done; +} + +bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + u32 nic_done = gve_tx_load_event_counter(priv, tx); + + return nic_done != tx->done; } diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c new file mode 100644 index 000000000000..588d64819ed5 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -0,0 +1,1022 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include "gve_utils.h" +#include "gve_dqo.h" +#include <linux/tcp.h> +#include <linux/slab.h> +#include <linux/skbuff.h> + +/* Returns true if a gve_tx_pending_packet_dqo object is available. */ +static bool gve_has_pending_packet(struct gve_tx_ring *tx) +{ + /* Check TX path's list. */ + if (tx->dqo_tx.free_pending_packets != -1) + return true; + + /* Check completion handler's list. */ + if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1) + return true; + + return false; +} + +static struct gve_tx_pending_packet_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 index; + + index = tx->dqo_tx.free_pending_packets; + + /* No pending_packets available, try to steal the list from the + * completion handler. + */ + if (unlikely(index == -1)) { + tx->dqo_tx.free_pending_packets = + atomic_xchg(&tx->dqo_compl.free_pending_packets, -1); + index = tx->dqo_tx.free_pending_packets; + + if (unlikely(index == -1)) + return NULL; + } + + pending_packet = &tx->dqo.pending_packets[index]; + + /* Remove pending_packet from free list */ + tx->dqo_tx.free_pending_packets = pending_packet->next; + pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + + return pending_packet; +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 index = pending_packet - tx->dqo.pending_packets; + + pending_packet->state = GVE_PACKET_STATE_UNALLOCATED; + while (true) { + s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets); + + pending_packet->next = old_head; + if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets, + old_head, index) == old_head) { + break; + } + } +} + +/* gve_tx_free_desc - Cleans up all pending tx requests and buffers. + */ +static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx) +{ + int i; + + for (i = 0; i < tx->dqo.num_pending_packets; i++) { + struct gve_tx_pending_packet_dqo *cur_state = + &tx->dqo.pending_packets[i]; + int j; + + for (j = 0; j < cur_state->num_bufs; j++) { + if (j == 0) { + dma_unmap_single(tx->dev, + dma_unmap_addr(cur_state, dma[j]), + dma_unmap_len(cur_state, len[j]), + DMA_TO_DEVICE); + } else { + dma_unmap_page(tx->dev, + dma_unmap_addr(cur_state, dma[j]), + dma_unmap_len(cur_state, len[j]), + DMA_TO_DEVICE); + } + } + if (cur_state->skb) { + dev_consume_skb_any(cur_state->skb); + cur_state->skb = NULL; + } + } +} + +static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_tx_ring *tx = &priv->tx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t bytes; + + gve_tx_remove_from_block(priv, idx); + + if (tx->q_resources) { + dma_free_coherent(hdev, sizeof(*tx->q_resources), + tx->q_resources, tx->q_resources_bus); + tx->q_resources = NULL; + } + + if (tx->dqo.compl_ring) { + bytes = sizeof(tx->dqo.compl_ring[0]) * + (tx->dqo.complq_mask + 1); + dma_free_coherent(hdev, bytes, tx->dqo.compl_ring, + tx->complq_bus_dqo); + tx->dqo.compl_ring = NULL; + } + + if (tx->dqo.tx_ring) { + bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); + dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus); + tx->dqo.tx_ring = NULL; + } + + kvfree(tx->dqo.pending_packets); + tx->dqo.pending_packets = NULL; + + netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); +} + +static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_tx_ring *tx = &priv->tx[idx]; + struct device *hdev = &priv->pdev->dev; + int num_pending_packets; + size_t bytes; + int i; + + memset(tx, 0, sizeof(*tx)); + tx->q_num = idx; + tx->dev = &priv->pdev->dev; + tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx); + atomic_set_release(&tx->dqo_compl.hw_tx_head, 0); + + /* Queue sizes must be a power of 2 */ + tx->mask = priv->tx_desc_cnt - 1; + tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1; + + /* The max number of pending packets determines the maximum number of + * descriptors which maybe written to the completion queue. + * + * We must set the number small enough to make sure we never overrun the + * completion queue. + */ + num_pending_packets = tx->dqo.complq_mask + 1; + + /* Reserve space for descriptor completions, which will be reported at + * most every GVE_TX_MIN_RE_INTERVAL packets. + */ + num_pending_packets -= + (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL; + + /* Each packet may have at most 2 buffer completions if it receives both + * a miss and reinjection completion. + */ + num_pending_packets /= 2; + + tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX); + tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets, + sizeof(tx->dqo.pending_packets[0]), + GFP_KERNEL); + if (!tx->dqo.pending_packets) + goto err; + + /* Set up linked list of pending packets */ + for (i = 0; i < tx->dqo.num_pending_packets - 1; i++) + tx->dqo.pending_packets[i].next = i + 1; + + tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1; + atomic_set_release(&tx->dqo_compl.free_pending_packets, -1); + tx->dqo_compl.miss_completions.head = -1; + tx->dqo_compl.miss_completions.tail = -1; + tx->dqo_compl.timed_out_completions.head = -1; + tx->dqo_compl.timed_out_completions.tail = -1; + + bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); + tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL); + if (!tx->dqo.tx_ring) + goto err; + + bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1); + tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes, + &tx->complq_bus_dqo, + GFP_KERNEL); + if (!tx->dqo.compl_ring) + goto err; + + tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources), + &tx->q_resources_bus, GFP_KERNEL); + if (!tx->q_resources) + goto err; + + gve_tx_add_to_block(priv, idx); + + return 0; + +err: + gve_tx_free_ring_dqo(priv, idx); + return -ENOMEM; +} + +int gve_tx_alloc_rings_dqo(struct gve_priv *priv) +{ + int err = 0; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + err = gve_tx_alloc_ring_dqo(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to alloc tx ring=%d: err=%d\n", + i, err); + goto err; + } + } + + return 0; + +err: + for (i--; i >= 0; i--) + gve_tx_free_ring_dqo(priv, i); + + return err; +} + +void gve_tx_free_rings_dqo(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + struct gve_tx_ring *tx = &priv->tx[i]; + + gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL); + netdev_tx_reset_queue(tx->netdev_txq); + gve_tx_clean_pending_packets(tx); + + gve_tx_free_ring_dqo(priv, i); + } +} + +/* Returns the number of slots available in the ring */ +static u32 num_avail_tx_slots(const struct gve_tx_ring *tx) +{ + u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask; + + return tx->mask - num_used; +} + +/* Stops the queue if available descriptors is less than 'count'. + * Return: 0 if stop is not required. + */ +static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count) +{ + if (likely(gve_has_pending_packet(tx) && + num_avail_tx_slots(tx) >= count)) + return 0; + + /* Update cached TX head pointer */ + tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + + if (likely(gve_has_pending_packet(tx) && + num_avail_tx_slots(tx) >= count)) + return 0; + + /* No space, so stop the queue */ + tx->stop_queue++; + netif_tx_stop_queue(tx->netdev_txq); + + /* Sync with restarting queue in `gve_tx_poll_dqo()` */ + mb(); + + /* After stopping queue, check if we can transmit again in order to + * avoid TOCTOU bug. + */ + tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + + if (likely(!gve_has_pending_packet(tx) || + num_avail_tx_slots(tx) < count)) + return -EBUSY; + + netif_tx_start_queue(tx->netdev_txq); + tx->wake_queue++; + return 0; +} + +static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb, + struct gve_tx_metadata_dqo *metadata) +{ + memset(metadata, 0, sizeof(*metadata)); + metadata->version = GVE_TX_METADATA_VERSION_DQO; + + if (skb->l4_hash) { + u16 path_hash = skb->hash ^ (skb->hash >> 16); + + path_hash &= (1 << 15) - 1; + if (unlikely(path_hash == 0)) + path_hash = ~path_hash; + + metadata->path_hash = path_hash; + } +} + +static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx, + struct sk_buff *skb, u32 len, u64 addr, + s16 compl_tag, bool eop, bool is_gso) +{ + const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL; + + while (len > 0) { + struct gve_tx_pkt_desc_dqo *desc = + &tx->dqo.tx_ring[*desc_idx].pkt; + u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO); + bool cur_eop = eop && cur_len == len; + + *desc = (struct gve_tx_pkt_desc_dqo){ + .buf_addr = cpu_to_le64(addr), + .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, + .end_of_packet = cur_eop, + .checksum_offload_enable = checksum_offload_en, + .compl_tag = cpu_to_le16(compl_tag), + .buf_size = cur_len, + }; + + addr += cur_len; + len -= cur_len; + *desc_idx = (*desc_idx + 1) & tx->mask; + } +} + +/* Validates and prepares `skb` for TSO. + * + * Returns header length, or < 0 if invalid. + */ +static int gve_prep_tso(struct sk_buff *skb) +{ + struct tcphdr *tcp; + int header_len; + u32 paylen; + int err; + + /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length + * of the TSO to be <= 262143. + * + * However, we don't validate these because: + * - Hypervisor enforces a limit of 9K MTU + * - Kernel will not produce a TSO larger than 64k + */ + + if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) + return -1; + + /* Needed because we will modify header. */ + err = skb_cow_head(skb, 0); + if (err < 0) + return err; + + tcp = tcp_hdr(skb); + + /* Remove payload length from checksum. */ + paylen = skb->len - skb_transport_offset(skb); + + switch (skb_shinfo(skb)->gso_type) { + case SKB_GSO_TCPV4: + case SKB_GSO_TCPV6: + csum_replace_by_diff(&tcp->check, + (__force __wsum)htonl(paylen)); + + /* Compute length of segmentation header. */ + header_len = skb_tcp_all_headers(skb); + break; + default: + return -EINVAL; + } + + if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) + return -EINVAL; + + return header_len; +} + +static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, + const struct sk_buff *skb, + const struct gve_tx_metadata_dqo *metadata, + int header_len) +{ + *desc = (struct gve_tx_tso_context_desc_dqo){ + .header_len = header_len, + .cmd_dtype = { + .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, + .tso = 1, + }, + .flex0 = metadata->bytes[0], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + }; + desc->tso_total_len = skb->len - header_len; + desc->mss = skb_shinfo(skb)->gso_size; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, + const struct gve_tx_metadata_dqo *metadata) +{ + *desc = (struct gve_tx_general_context_desc_dqo){ + .flex0 = metadata->bytes[0], + .flex1 = metadata->bytes[1], + .flex2 = metadata->bytes[2], + .flex3 = metadata->bytes[3], + .flex4 = metadata->bytes[4], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, + }; +} + +/* Returns 0 on success, or < 0 on error. + * + * Before this function is called, the caller must ensure + * gve_has_pending_packet(tx) returns true. + */ +static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const bool is_gso = skb_is_gso(skb); + u32 desc_idx = tx->dqo_tx.tail; + + struct gve_tx_pending_packet_dqo *pkt; + struct gve_tx_metadata_dqo metadata; + s16 completion_tag; + int i; + + pkt = gve_alloc_pending_packet(tx); + pkt->skb = skb; + pkt->num_bufs = 0; + completion_tag = pkt - tx->dqo.pending_packets; + + gve_extract_tx_metadata_dqo(skb, &metadata); + if (is_gso) { + int header_len = gve_prep_tso(skb); + + if (unlikely(header_len < 0)) + goto err; + + gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx, + skb, &metadata, header_len); + desc_idx = (desc_idx + 1) & tx->mask; + } + + gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx, + &metadata); + desc_idx = (desc_idx + 1) & tx->mask; + + /* Note: HW requires that the size of a non-TSO packet be within the + * range of [17, 9728]. + * + * We don't double check because + * - We limited `netdev->min_mtu` to ETH_MIN_MTU. + * - Hypervisor won't allow MTU larger than 9216. + */ + + /* Map the linear portion of skb */ + { + u32 len = skb_headlen(skb); + dma_addr_t addr; + + addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) + goto err; + + dma_unmap_len_set(pkt, len[pkt->num_bufs], len); + dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); + ++pkt->num_bufs; + + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, + completion_tag, + /*eop=*/shinfo->nr_frags == 0, is_gso); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + const skb_frag_t *frag = &shinfo->frags[i]; + bool is_eop = i == (shinfo->nr_frags - 1); + u32 len = skb_frag_size(frag); + dma_addr_t addr; + + addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) + goto err; + + dma_unmap_len_set(pkt, len[pkt->num_bufs], len); + dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); + ++pkt->num_bufs; + + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, + completion_tag, is_eop, is_gso); + } + + /* Commit the changes to our state */ + tx->dqo_tx.tail = desc_idx; + + /* Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + { + u32 last_desc_idx = (desc_idx - 1) & tx->mask; + u32 last_report_event_interval = + (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask; + + if (unlikely(last_report_event_interval >= + GVE_TX_MIN_RE_INTERVAL)) { + tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true; + tx->dqo_tx.last_re_idx = last_desc_idx; + } + } + + return 0; + +err: + for (i = 0; i < pkt->num_bufs; i++) { + if (i == 0) { + dma_unmap_single(tx->dev, + dma_unmap_addr(pkt, dma[i]), + dma_unmap_len(pkt, len[i]), + DMA_TO_DEVICE); + } else { + dma_unmap_page(tx->dev, + dma_unmap_addr(pkt, dma[i]), + dma_unmap_len(pkt, len[i]), + DMA_TO_DEVICE); + } + } + + pkt->skb = NULL; + pkt->num_bufs = 0; + gve_free_pending_packet(tx, pkt); + + return -1; +} + +static int gve_num_descs_per_buf(size_t size) +{ + return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO); +} + +static int gve_num_buffer_descs_needed(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + int num_descs; + int i; + + num_descs = gve_num_descs_per_buf(skb_headlen(skb)); + + for (i = 0; i < shinfo->nr_frags; i++) { + unsigned int frag_size = skb_frag_size(&shinfo->frags[i]); + + num_descs += gve_num_descs_per_buf(frag_size); + } + + return num_descs; +} + +/* Returns true if HW is capable of sending TSO represented by `skb`. + * + * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. + * - The header is counted as one buffer for every single segment. + * - A buffer which is split between two segments is counted for both. + * - If a buffer contains both header and payload, it is counted as two buffers. + */ +static bool gve_can_send_tso(const struct sk_buff *skb) +{ + const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const int header_len = skb_tcp_all_headers(skb); + const int gso_size = shinfo->gso_size; + int cur_seg_num_bufs; + int cur_seg_size; + int i; + + cur_seg_size = skb_headlen(skb) - header_len; + cur_seg_num_bufs = cur_seg_size > 0; + + for (i = 0; i < shinfo->nr_frags; i++) { + if (cur_seg_size >= gso_size) { + cur_seg_size %= gso_size; + cur_seg_num_bufs = cur_seg_size > 0; + } + + if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) + return false; + + cur_seg_size += skb_frag_size(&shinfo->frags[i]); + } + + return true; +} + +/* Attempt to transmit specified SKB. + * + * Returns 0 if the SKB was transmitted or dropped. + * Returns -1 if there is not currently enough space to transmit the SKB. + */ +static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + int num_buffer_descs; + int total_num_descs; + + if (skb_is_gso(skb)) { + /* If TSO doesn't meet HW requirements, attempt to linearize the + * packet. + */ + if (unlikely(!gve_can_send_tso(skb) && + skb_linearize(skb) < 0)) { + net_err_ratelimited("%s: Failed to transmit TSO packet\n", + priv->dev->name); + goto drop; + } + + num_buffer_descs = gve_num_buffer_descs_needed(skb); + } else { + num_buffer_descs = gve_num_buffer_descs_needed(skb); + + if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { + if (unlikely(skb_linearize(skb) < 0)) + goto drop; + + num_buffer_descs = 1; + } + } + + /* Metadata + (optional TSO) + data descriptors. */ + total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs; + if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs + + GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) { + return -1; + } + + if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0)) + goto drop; + + netdev_tx_sent_queue(tx->netdev_txq, skb->len); + skb_tx_timestamp(skb); + return 0; + +drop: + tx->dropped_pkt++; + dev_kfree_skb_any(skb); + return 0; +} + +/* Transmit a given skb and ring the doorbell. */ +netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_tx_ring *tx; + + tx = &priv->tx[skb_get_queue_mapping(skb)]; + if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { + /* We need to ring the txq doorbell -- we have stopped the Tx + * queue for want of resources, but prior calls to gve_tx() + * may have added descriptors without ringing the doorbell. + */ + gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); + return NETDEV_TX_BUSY; + } + + if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) + return NETDEV_TX_OK; + + gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); + return NETDEV_TX_OK; +} + +static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 old_tail, index; + + index = pending_packet - tx->dqo.pending_packets; + old_tail = list->tail; + list->tail = index; + if (old_tail == -1) + list->head = index; + else + tx->dqo.pending_packets[old_tail].next = index; + + pending_packet->next = -1; + pending_packet->prev = old_tail; +} + +static void remove_from_list(struct gve_tx_ring *tx, + struct gve_index_list *list, + struct gve_tx_pending_packet_dqo *pkt) +{ + s16 prev_index, next_index; + + prev_index = pkt->prev; + next_index = pkt->next; + + if (prev_index == -1) { + /* Node is head */ + list->head = next_index; + } else { + tx->dqo.pending_packets[prev_index].next = next_index; + } + if (next_index == -1) { + /* Node is tail */ + list->tail = prev_index; + } else { + tx->dqo.pending_packets[next_index].prev = prev_index; + } +} + +static void gve_unmap_packet(struct device *dev, + struct gve_tx_pending_packet_dqo *pkt) +{ + int i; + + /* SKB linear portion is guaranteed to be mapped */ + dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), + dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); + for (i = 1; i < pkt->num_bufs; i++) { + dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]), + dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE); + } + pkt->num_bufs = 0; +} + +/* Completion types and expected behavior: + * No Miss compl + Packet compl = Packet completed normally. + * Miss compl + Re-inject compl = Packet completed normally. + * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. + * Miss compl + Packet compl = Skipped i.e. packet not completed. + */ +static void gve_handle_packet_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, bool is_napi, + u16 compl_tag, u64 *bytes, u64 *pkts, + bool is_reinjection) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + + if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { + net_err_ratelimited("%s: Invalid TX completion tag: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + + pending_packet = &tx->dqo.pending_packets[compl_tag]; + + if (unlikely(is_reinjection)) { + if (unlikely(pending_packet->state == + GVE_PACKET_STATE_TIMED_OUT_COMPL)) { + net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", + priv->dev->name, (int)compl_tag); + /* Packet was already completed as a result of timeout, + * so just remove from list and free pending packet. + */ + remove_from_list(tx, + &tx->dqo_compl.timed_out_completions, + pending_packet); + gve_free_pending_packet(tx, pending_packet); + return; + } + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { + /* No outstanding miss completion but packet allocated + * implies packet receives a re-injection completion + * without a prior miss completion. Return without + * completing the packet. + */ + net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + remove_from_list(tx, &tx->dqo_compl.miss_completions, + pending_packet); + } else { + /* Packet is allocated but not a pending data completion. */ + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + net_err_ratelimited("%s: No pending data completion: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + } + gve_unmap_packet(tx->dev, pending_packet); + + *bytes += pending_packet->skb->len; + (*pkts)++; + napi_consume_skb(pending_packet->skb, is_napi); + pending_packet->skb = NULL; + gve_free_pending_packet(tx, pending_packet); +} + +static void gve_handle_miss_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, u16 compl_tag, + u64 *bytes, u64 *pkts) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + + if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { + net_err_ratelimited("%s: Invalid TX completion tag: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + + pending_packet = &tx->dqo.pending_packets[compl_tag]; + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n", + priv->dev->name, (int)pending_packet->state, + (int)compl_tag); + return; + } + + pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL; + /* jiffies can wraparound but time comparisons can handle overflows. */ + pending_packet->timeout_jiffies = + jiffies + + msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT * + MSEC_PER_SEC); + add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); + + *bytes += pending_packet->skb->len; + (*pkts)++; +} + +static void remove_miss_completions(struct gve_priv *priv, + struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 next_index; + + next_index = tx->dqo_compl.miss_completions.head; + while (next_index != -1) { + pending_packet = &tx->dqo.pending_packets[next_index]; + next_index = pending_packet->next; + /* Break early because packets should timeout in order. */ + if (time_is_after_jiffies(pending_packet->timeout_jiffies)) + break; + + remove_from_list(tx, &tx->dqo_compl.miss_completions, + pending_packet); + /* Unmap buffers and free skb but do not unallocate packet i.e. + * the completion tag is not freed to ensure that the driver + * can take appropriate action if a corresponding valid + * completion is received later. + */ + gve_unmap_packet(tx->dev, pending_packet); + /* This indicates the packet was dropped. */ + dev_kfree_skb_any(pending_packet->skb); + pending_packet->skb = NULL; + tx->dropped_pkt++; + net_err_ratelimited("%s: No reinjection completion was received for: %d.\n", + priv->dev->name, + (int)(pending_packet - tx->dqo.pending_packets)); + + pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; + pending_packet->timeout_jiffies = + jiffies + + msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT * + MSEC_PER_SEC); + /* Maintain pending packet in another list so the packet can be + * unallocated at a later time. + */ + add_to_list(tx, &tx->dqo_compl.timed_out_completions, + pending_packet); + } +} + +static void remove_timed_out_completions(struct gve_priv *priv, + struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 next_index; + + next_index = tx->dqo_compl.timed_out_completions.head; + while (next_index != -1) { + pending_packet = &tx->dqo.pending_packets[next_index]; + next_index = pending_packet->next; + /* Break early because packets should timeout in order. */ + if (time_is_after_jiffies(pending_packet->timeout_jiffies)) + break; + + remove_from_list(tx, &tx->dqo_compl.timed_out_completions, + pending_packet); + gve_free_pending_packet(tx, pending_packet); + } +} + +int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, + struct napi_struct *napi) +{ + u64 reinject_compl_bytes = 0; + u64 reinject_compl_pkts = 0; + int num_descs_cleaned = 0; + u64 miss_compl_bytes = 0; + u64 miss_compl_pkts = 0; + u64 pkt_compl_bytes = 0; + u64 pkt_compl_pkts = 0; + + /* Limit in order to avoid blocking for too long */ + while (!napi || pkt_compl_pkts < napi->weight) { + struct gve_tx_compl_desc *compl_desc = + &tx->dqo.compl_ring[tx->dqo_compl.head]; + u16 type; + + if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) + break; + + /* Prefetch the next descriptor. */ + prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & + tx->dqo.complq_mask]); + + /* Do not read data until we own the descriptor */ + dma_rmb(); + type = compl_desc->type; + + if (type == GVE_COMPL_TYPE_DQO_DESC) { + /* This is the last descriptor fetched by HW plus one */ + u16 tx_head = le16_to_cpu(compl_desc->tx_head); + + atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); + } else if (type == GVE_COMPL_TYPE_DQO_PKT) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_packet_completion(priv, tx, !!napi, + compl_tag, + &pkt_compl_bytes, + &pkt_compl_pkts, + /*is_reinjection=*/false); + } else if (type == GVE_COMPL_TYPE_DQO_MISS) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_miss_completion(priv, tx, compl_tag, + &miss_compl_bytes, + &miss_compl_pkts); + } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_packet_completion(priv, tx, !!napi, + compl_tag, + &reinject_compl_bytes, + &reinject_compl_pkts, + /*is_reinjection=*/true); + } + + tx->dqo_compl.head = + (tx->dqo_compl.head + 1) & tx->dqo.complq_mask; + /* Flip the generation bit when we wrap around */ + tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; + num_descs_cleaned++; + } + + netdev_tx_completed_queue(tx->netdev_txq, + pkt_compl_pkts + miss_compl_pkts, + pkt_compl_bytes + miss_compl_bytes); + + remove_miss_completions(priv, tx); + remove_timed_out_completions(priv, tx); + + u64_stats_update_begin(&tx->statss); + tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; + tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; + u64_stats_update_end(&tx->statss); + return num_descs_cleaned; +} + +bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) +{ + struct gve_tx_compl_desc *compl_desc; + struct gve_tx_ring *tx = block->tx; + struct gve_priv *priv = block->priv; + + if (do_clean) { + int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, + &block->napi); + + /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */ + mb(); + + if (netif_tx_queue_stopped(tx->netdev_txq) && + num_descs_cleaned > 0) { + tx->wake_queue++; + netif_tx_wake_queue(tx->netdev_txq); + } + } + + /* Return true if we still have work. */ + compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; + return compl_desc->generation != tx->dqo_compl.cur_gen_bit; +} diff --git a/drivers/net/ethernet/google/gve/gve_utils.c b/drivers/net/ethernet/google/gve/gve_utils.c new file mode 100644 index 000000000000..d57508bc4307 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_utils.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include "gve_utils.h" + +void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ + struct gve_notify_block *block = + &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; + + block->tx = NULL; +} + +void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) +{ + unsigned int active_cpus = min_t(int, priv->num_ntfy_blks / 2, + num_online_cpus()); + int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + struct gve_tx_ring *tx = &priv->tx[queue_idx]; + + block->tx = tx; + tx->ntfy_id = ntfy_idx; + netif_set_xps_queue(priv->dev, get_cpu_mask(ntfy_idx % active_cpus), + queue_idx); +} + +void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ + struct gve_notify_block *block = + &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; + + block->rx = NULL; +} + +void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) +{ + u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + struct gve_rx_ring *rx = &priv->rx[queue_idx]; + + block->rx = rx; + rx->ntfy_id = ntfy_idx; +} + +struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, u16 len, + u16 padding, struct gve_rx_ctx *ctx) +{ + void *va = page_info->page_address + padding + page_info->page_offset; + int skb_linear_offset = 0; + bool set_protocol = false; + struct sk_buff *skb; + + if (ctx) { + if (!ctx->skb_head) + ctx->skb_head = napi_alloc_skb(napi, ctx->total_expected_size); + + if (unlikely(!ctx->skb_head)) + return NULL; + skb = ctx->skb_head; + skb_linear_offset = skb->len; + set_protocol = ctx->curr_frag_cnt == ctx->expected_frag_cnt - 1; + } else { + skb = napi_alloc_skb(napi, len); + + if (unlikely(!skb)) + return NULL; + set_protocol = true; + } + __skb_put(skb, len); + skb_copy_to_linear_data_offset(skb, skb_linear_offset, va, len); + + if (set_protocol) + skb->protocol = eth_type_trans(skb, dev); + + return skb; +} + +void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info) +{ + page_info->pagecnt_bias--; + if (page_info->pagecnt_bias == 0) { + int pagecount = page_count(page_info->page); + + /* If we have run out of bias - set it back up to INT_MAX + * minus the existing refs. + */ + page_info->pagecnt_bias = INT_MAX - pagecount; + + /* Set pagecount back up to max. */ + page_ref_add(page_info->page, INT_MAX - pagecount); + } +} diff --git a/drivers/net/ethernet/google/gve/gve_utils.h b/drivers/net/ethernet/google/gve/gve_utils.h new file mode 100644 index 000000000000..6d98e69fd3b8 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_utils.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#ifndef _GVE_UTILS_H +#define _GVE_UTILS_H + +#include <linux/etherdevice.h> + +#include "gve.h" + +void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx); +void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx); + +void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx); +void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx); + +struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, u16 len, + u16 pad, struct gve_rx_ctx *ctx); + +/* Decrement pagecnt_bias. Set it back to INT_MAX if it reached zero. */ +void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info); + +#endif /* _GVE_UTILS_H */ + |