Diffstat (limited to 'drivers/net/ethernet/google')
| -rw-r--r-- | drivers/net/ethernet/google/Kconfig            |    2 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/Makefile       |    2 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve.h          |  506 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.c   |  679 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.h   |  194 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_desc.h     |   52 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_desc_dqo.h |  256 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_dqo.h      |   93 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_ethtool.c  |  480 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_main.c     |  836 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_register.h |    1 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_rx.c       |  645 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_rx_dqo.c   |  756 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx.c       |  358 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx_dqo.c   | 1022 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_utils.c    |   99 |
| -rw-r--r-- | drivers/net/ethernet/google/gve/gve_utils.h    |   28 |
17 files changed, 5414 insertions, 595 deletions
diff --git a/drivers/net/ethernet/google/Kconfig b/drivers/net/ethernet/google/Kconfig index b8f04d052fda..8641a00f8e63 100644 --- a/drivers/net/ethernet/google/Kconfig +++ b/drivers/net/ethernet/google/Kconfig @@ -17,7 +17,7 @@ if NET_VENDOR_GOOGLE  config GVE  	tristate "Google Virtual NIC (gVNIC) support" -	depends on PCI_MSI +	depends on (PCI_MSI && (X86 || CPU_LITTLE_ENDIAN))  	help  	  This driver supports Google Virtual NIC (gVNIC)" diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index 3354ce40eb97..b9a6be76531b 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,4 @@  # Makefile for the Google virtual Ethernet (gve) driver  obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_tx.o gve_rx.o gve_ethtool.o gve_adminq.o +gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index ebc37e256922..160735484465 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1,7 +1,7 @@  /* SPDX-License-Identifier: (GPL-2.0 OR MIT)   * Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   */  #ifndef _GVE_H_ @@ -11,7 +11,9 @@  #include <linux/netdevice.h>  #include <linux/pci.h>  #include <linux/u64_stats_sync.h> +  #include "gve_desc.h" +#include "gve_desc_dqo.h"  #ifndef PCI_VENDOR_ID_GOOGLE  #define PCI_VENDOR_ID_GOOGLE	0x1ae0 @@ -27,6 +29,24 @@  /* 1 for management, 1 for rx, 1 for tx */  #define GVE_MIN_MSIX 3 +/* Numbers of gve tx/rx stats in stats report. */ +#define GVE_TX_STATS_REPORT_NUM	6 +#define GVE_RX_STATS_REPORT_NUM	2 + +/* Interval to schedule a stats report update, 20000ms. */ +#define GVE_STATS_REPORT_TIMER_PERIOD	20000 + +/* Numbers of NIC tx/rx stats in stats report. */ +#define NIC_TX_STATS_REPORT_NUM	0 +#define NIC_RX_STATS_REPORT_NUM	4 + +#define GVE_DATA_SLOT_ADDR_PAGE_MASK (~(PAGE_SIZE - 1)) + +/* PTYPEs are always 10 bits. */ +#define GVE_NUM_PTYPES	1024 + +#define GVE_RX_BUFFER_SIZE_DQO 2048 +  /* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */  struct gve_rx_desc_queue {  	struct gve_rx_desc *desc_ring; /* the descriptor ring */ @@ -39,6 +59,8 @@ struct gve_rx_slot_page_info {  	struct page *page;  	void *page_address;  	u32 page_offset; /* offset to write to in page */ +	int pagecnt_bias; /* expected pagecnt if only the driver has a ref */ +	u8 can_flip;  };  /* A list of pages registered with the device during setup and used by a queue @@ -53,34 +75,161 @@ struct gve_queue_page_list {  /* Each slot in the data ring has a 1:1 mapping to a slot in the desc ring */  struct gve_rx_data_queue { -	struct gve_rx_data_slot *data_ring; /* read by NIC */ +	union gve_rx_data_slot *data_ring; /* read by NIC */  	dma_addr_t data_bus; /* dma mapping of the slots */  	struct gve_rx_slot_page_info *page_info; /* page info of the buffers */  	struct gve_queue_page_list *qpl; /* qpl assigned to this queue */ +	u8 raw_addressing; /* use raw_addressing? */  };  struct gve_priv; -/* An RX ring that contains a power-of-two sized desc and data ring. */ +/* RX buffer queue for posting buffers to HW. + * Each RX (completion) queue has a corresponding buffer queue. + */ +struct gve_rx_buf_queue_dqo { +	struct gve_rx_desc_dqo *desc_ring; +	dma_addr_t bus; +	u32 head; /* Pointer to start cleaning buffers at. 
*/ +	u32 tail; /* Last posted buffer index + 1 */ +	u32 mask; /* Mask for indices to the size of the ring */ +}; + +/* RX completion queue to receive packets from HW. */ +struct gve_rx_compl_queue_dqo { +	struct gve_rx_compl_desc_dqo *desc_ring; +	dma_addr_t bus; + +	/* Number of slots which did not have a buffer posted yet. We should not +	 * post more buffers than the queue size to avoid HW overrunning the +	 * queue. +	 */ +	int num_free_slots; + +	/* HW uses a "generation bit" to notify SW of new descriptors. When a +	 * descriptor's generation bit is different from the current generation, +	 * that descriptor is ready to be consumed by SW. +	 */ +	u8 cur_gen_bit; + +	/* Pointer into desc_ring where the next completion descriptor will be +	 * received. +	 */ +	u32 head; +	u32 mask; /* Mask for indices to the size of the ring */ +}; + +/* Stores state for tracking buffers posted to HW */ +struct gve_rx_buf_state_dqo { +	/* The page posted to HW. */ +	struct gve_rx_slot_page_info page_info; + +	/* The DMA address corresponding to `page_info`. */ +	dma_addr_t addr; + +	/* Last offset into the page when it only had a single reference, at +	 * which point every other offset is free to be reused. +	 */ +	u32 last_single_ref_offset; + +	/* Linked list index to next element in the list, or -1 if none */ +	s16 next; +}; + +/* `head` and `tail` are indices into an array, or -1 if empty. */ +struct gve_index_list { +	s16 head; +	s16 tail; +}; + +/* A single received packet split across multiple buffers may be + * reconstructed using the information in this structure. + */ +struct gve_rx_ctx { +	/* head and tail of skb chain for the current packet or NULL if none */ +	struct sk_buff *skb_head; +	struct sk_buff *skb_tail; +	u16 total_expected_size; +	u8 expected_frag_cnt; +	u8 curr_frag_cnt; +	u8 reuse_frags; +}; + +/* Contains datapath state used to represent an RX queue. */  struct gve_rx_ring {  	struct gve_priv *gve; -	struct gve_rx_desc_queue desc; -	struct gve_rx_data_queue data; +	union { +		/* GQI fields */ +		struct { +			struct gve_rx_desc_queue desc; +			struct gve_rx_data_queue data; + +			/* threshold for posting new buffs and descs */ +			u32 db_threshold; +			u16 packet_buffer_size; +		}; + +		/* DQO fields. */ +		struct { +			struct gve_rx_buf_queue_dqo bufq; +			struct gve_rx_compl_queue_dqo complq; + +			struct gve_rx_buf_state_dqo *buf_states; +			u16 num_buf_states; + +			/* Linked list of gve_rx_buf_state_dqo. Index into +			 * buf_states, or -1 if empty. +			 */ +			s16 free_buf_states; + +			/* Linked list of gve_rx_buf_state_dqo. Indexes into +			 * buf_states, or -1 if empty. +			 * +			 * This list contains buf_states which are pointing to +			 * valid buffers. +			 * +			 * We use a FIFO here in order to increase the +			 * probability that buffers can be reused by increasing +			 * the time between usages. +			 */ +			struct gve_index_list recycled_buf_states; + +			/* Linked list of gve_rx_buf_state_dqo. Indexes into +			 * buf_states, or -1 if empty. +			 * +			 * This list contains buf_states which have buffers +			 * which cannot be reused yet. 
+			 */ +			struct gve_index_list used_buf_states; +		} dqo; +	}; +  	u64 rbytes; /* free-running bytes received */  	u64 rpackets; /* free-running packets received */  	u32 cnt; /* free-running total number of completed packets */  	u32 fill_cnt; /* free-running total number of descs and buffs posted */  	u32 mask; /* masks the cnt and fill_cnt to the size of the ring */ +	u64 rx_copybreak_pkt; /* free-running count of copybreak packets */ +	u64 rx_copied_pkt; /* free-running total number of copied packets */ +	u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ +	u64 rx_buf_alloc_fail; /* free-running count of buffer alloc fails */ +	u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */ +	u64 rx_cont_packet_cnt; /* free-running multi-fragment packets received */ +	u64 rx_frag_flip_cnt; /* free-running count of rx segments where page_flip was used */ +	u64 rx_frag_copy_cnt; /* free-running count of rx segments copied into skb linear portion */  	u32 q_num; /* queue index */  	u32 ntfy_id; /* notification block index */  	struct gve_queue_resources *q_resources; /* head and tail pointer idx */  	dma_addr_t q_resources_bus; /* dma address for the queue resources */  	struct u64_stats_sync statss; /* sync stats for 32bit archs */ + +	struct gve_rx_ctx ctx; /* Info for packet currently being processed in this ring. */  };  /* A TX desc ring entry */  union gve_tx_desc {  	struct gve_tx_pkt_desc pkt; /* first desc for a packet */ +	struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */  	struct gve_tx_seg_desc seg; /* subsequent descs for a packet */  }; @@ -96,7 +245,13 @@ struct gve_tx_iovec {   */  struct gve_tx_buffer_state {  	struct sk_buff *skb; /* skb for this pkt */ -	struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */ +	union { +		struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */ +		struct { +			DEFINE_DMA_UNMAP_ADDR(dma); +			DEFINE_DMA_UNMAP_LEN(len); +		}; +	};  };  /* A TX buffer - each queue has one */ @@ -108,32 +263,178 @@ struct gve_tx_fifo {  	struct gve_queue_page_list *qpl; /* QPL mapped into this FIFO */  }; -/* A TX ring that contains a power-of-two sized desc ring and a FIFO buffer */ +/* TX descriptor for DQO format */ +union gve_tx_desc_dqo { +	struct gve_tx_pkt_desc_dqo pkt; +	struct gve_tx_tso_context_desc_dqo tso_ctx; +	struct gve_tx_general_context_desc_dqo general_ctx; +}; + +enum gve_packet_state { +	/* Packet is in free list, available to be allocated. +	 * This should always be zero since state is not explicitly initialized. +	 */ +	GVE_PACKET_STATE_UNALLOCATED, +	/* Packet is expecting a regular data completion or miss completion */ +	GVE_PACKET_STATE_PENDING_DATA_COMPL, +	/* Packet has received a miss completion and is expecting a +	 * re-injection completion. +	 */ +	GVE_PACKET_STATE_PENDING_REINJECT_COMPL, +	/* No valid completion received within the specified timeout. */ +	GVE_PACKET_STATE_TIMED_OUT_COMPL, +}; + +struct gve_tx_pending_packet_dqo { +	struct sk_buff *skb; /* skb for this packet */ + +	/* 0th element corresponds to the linear portion of `skb`, should be +	 * unmapped with `dma_unmap_single`. +	 * +	 * All others correspond to `skb`'s frags and should be unmapped with +	 * `dma_unmap_page`. 
+	 */ +	DEFINE_DMA_UNMAP_ADDR(dma[MAX_SKB_FRAGS + 1]); +	DEFINE_DMA_UNMAP_LEN(len[MAX_SKB_FRAGS + 1]); +	u16 num_bufs; + +	/* Linked list index to next element in the list, or -1 if none */ +	s16 next; + +	/* Linked list index to prev element in the list, or -1 if none. +	 * Used for tracking either outstanding miss completions or prematurely +	 * freed packets. +	 */ +	s16 prev; + +	/* Identifies the current state of the packet as defined in +	 * `enum gve_packet_state`. +	 */ +	u8 state; + +	/* If packet is an outstanding miss completion, then the packet is +	 * freed if the corresponding re-injection completion is not received +	 * before kernel jiffies exceeds timeout_jiffies. +	 */ +	unsigned long timeout_jiffies; +}; + +/* Contains datapath state used to represent a TX queue. */  struct gve_tx_ring {  	/* Cacheline 0 -- Accessed & dirtied during transmit */ -	struct gve_tx_fifo tx_fifo; -	u32 req; /* driver tracked head pointer */ -	u32 done; /* driver tracked tail pointer */ +	union { +		/* GQI fields */ +		struct { +			struct gve_tx_fifo tx_fifo; +			u32 req; /* driver tracked head pointer */ +			u32 done; /* driver tracked tail pointer */ +		}; + +		/* DQO fields. */ +		struct { +			/* Linked list of gve_tx_pending_packet_dqo. Index into +			 * pending_packets, or -1 if empty. +			 * +			 * This is a consumer list owned by the TX path. When it +			 * runs out, the producer list is stolen from the +			 * completion handling path +			 * (dqo_compl.free_pending_packets). +			 */ +			s16 free_pending_packets; + +			/* Cached value of `dqo_compl.hw_tx_head` */ +			u32 head; +			u32 tail; /* Last posted buffer index + 1 */ + +			/* Index of the last descriptor with "report event" bit +			 * set. +			 */ +			u32 last_re_idx; +		} dqo_tx; +	};  	/* Cacheline 1 -- Accessed & dirtied during gve_clean_tx_done */ -	__be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */ +	union { +		/* GQI fields */ +		struct { +			/* Spinlock for when cleanup in progress */ +			spinlock_t clean_lock; +		}; + +		/* DQO fields. */ +		struct { +			u32 head; /* Last read on compl_desc */ + +			/* Tracks the current gen bit of compl_q */ +			u8 cur_gen_bit; + +			/* Linked list of gve_tx_pending_packet_dqo. Index into +			 * pending_packets, or -1 if empty. +			 * +			 * This is the producer list, owned by the completion +			 * handling path. When the consumer list +			 * (dqo_tx.free_pending_packets) is runs out, this list +			 * will be stolen. +			 */ +			atomic_t free_pending_packets; + +			/* Last TX ring index fetched by HW */ +			atomic_t hw_tx_head; + +			/* List to track pending packets which received a miss +			 * completion but not a corresponding reinjection. +			 */ +			struct gve_index_list miss_completions; + +			/* List to track pending packets that were completed +			 * before receiving a valid completion because they +			 * reached a specified timeout. 
+			 */ +			struct gve_index_list timed_out_completions; +		} dqo_compl; +	} ____cacheline_aligned;  	u64 pkt_done; /* free-running - total packets completed */  	u64 bytes_done; /* free-running - total bytes completed */ +	u64 dropped_pkt; /* free-running - total packets dropped */ +	u64 dma_mapping_error; /* count of dma mapping errors */  	/* Cacheline 2 -- Read-mostly fields */ -	union gve_tx_desc *desc ____cacheline_aligned; -	struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */ +	union { +		/* GQI fields */ +		struct { +			union gve_tx_desc *desc; + +			/* Maps 1:1 to a desc */ +			struct gve_tx_buffer_state *info; +		}; + +		/* DQO fields. */ +		struct { +			union gve_tx_desc_dqo *tx_ring; +			struct gve_tx_compl_desc *compl_ring; + +			struct gve_tx_pending_packet_dqo *pending_packets; +			s16 num_pending_packets; + +			u32 complq_mask; /* complq size is complq_mask + 1 */ +		} dqo; +	} ____cacheline_aligned;  	struct netdev_queue *netdev_txq;  	struct gve_queue_resources *q_resources; /* head and tail pointer idx */ +	struct device *dev;  	u32 mask; /* masks req and done down to queue size */ +	u8 raw_addressing; /* use raw_addressing? */  	/* Slow-path fields */  	u32 q_num ____cacheline_aligned; /* queue idx */  	u32 stop_queue; /* count of queue stops */  	u32 wake_queue; /* count of queue wakes */ +	u32 queue_timeout; /* count of queue timeouts */  	u32 ntfy_id; /* notification block index */ +	u32 last_kick_msec; /* Last time the queue was kicked */  	dma_addr_t bus; /* dma address of the descr ring */  	dma_addr_t q_resources_bus; /* dma address of the queue resources */ +	dma_addr_t complq_bus_dqo; /* dma address of the dqo.compl_ring */  	struct u64_stats_sync statss; /* sync stats for 32bit archs */  } ____cacheline_aligned; @@ -141,13 +442,13 @@ struct gve_tx_ring {   * associated with that irq.   */  struct gve_notify_block { -	__be32 irq_db_index; /* idx into Bar2 - set by device, must be 1st */ +	__be32 *irq_db_index; /* pointer to idx into Bar2 */  	char name[IFNAMSIZ + 16]; /* name registered with the kernel */  	struct napi_struct napi; /* kernel napi struct for this block */  	struct gve_priv *priv;  	struct gve_tx_ring *tx; /* tx rings on this block */  	struct gve_rx_ring *rx; /* rx rings on this block */ -} ____cacheline_aligned; +};  /* Tracks allowed and current queue settings */  struct gve_queue_config { @@ -161,13 +462,43 @@ struct gve_qpl_config {  	unsigned long *qpl_id_map; /* bitmap of used qpl ids */  }; +struct gve_options_dqo_rda { +	u16 tx_comp_ring_entries; /* number of tx_comp descriptors */ +	u16 rx_buff_ring_entries; /* number of rx_buff descriptors */ +}; + +struct gve_irq_db { +	__be32 index; +} ____cacheline_aligned; + +struct gve_ptype { +	u8 l3_type;  /* `gve_l3_type` in gve_adminq.h */ +	u8 l4_type;  /* `gve_l4_type` in gve_adminq.h */ +}; + +struct gve_ptype_lut { +	struct gve_ptype ptypes[GVE_NUM_PTYPES]; +}; + +/* GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value + * when the entire configure_device_resources command is zeroed out and the + * queue_format is not specified. 
+ */ +enum gve_queue_format { +	GVE_QUEUE_FORMAT_UNSPECIFIED	= 0x0, +	GVE_GQI_RDA_FORMAT		= 0x1, +	GVE_GQI_QPL_FORMAT		= 0x2, +	GVE_DQO_RDA_FORMAT		= 0x3, +}; +  struct gve_priv {  	struct net_device *dev;  	struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */  	struct gve_rx_ring *rx; /* array of rx_cfg.num_queues */  	struct gve_queue_page_list *qpls; /* array of num qpls */  	struct gve_notify_block *ntfy_blocks; /* array of num_ntfy_blks */ -	dma_addr_t ntfy_block_bus; +	struct gve_irq_db *irq_db_indices; /* array of num_ntfy_blks */ +	dma_addr_t irq_db_indices_bus;  	struct msix_entry *msix_vectors; /* array of num_ntfy_blks + 1 */  	char mgmt_msix_name[IFNAMSIZ + 16];  	u32 mgmt_msix_idx; @@ -178,7 +509,7 @@ struct gve_priv {  	u16 tx_desc_cnt; /* num desc per ring */  	u16 rx_desc_cnt; /* num desc per ring */  	u16 tx_pages_per_qpl; /* tx buffer length */ -	u16 rx_pages_per_qpl; /* rx buffer length */ +	u16 rx_data_slot_cnt; /* rx buffer length */  	u64 max_registered_pages;  	u64 num_registered_pages; /* num pages registered with NIC */  	u32 rx_copybreak; /* copy packets smaller than this */ @@ -202,24 +533,79 @@ struct gve_priv {  	dma_addr_t adminq_bus_addr;  	u32 adminq_mask; /* masks prod_cnt to adminq size */  	u32 adminq_prod_cnt; /* free-running count of AQ cmds executed */ - +	u32 adminq_cmd_fail; /* free-running count of AQ cmds failed */ +	u32 adminq_timeouts; /* free-running count of AQ cmds timeouts */ +	/* free-running count of per AQ cmd executed */ +	u32 adminq_describe_device_cnt; +	u32 adminq_cfg_device_resources_cnt; +	u32 adminq_register_page_list_cnt; +	u32 adminq_unregister_page_list_cnt; +	u32 adminq_create_tx_queue_cnt; +	u32 adminq_create_rx_queue_cnt; +	u32 adminq_destroy_tx_queue_cnt; +	u32 adminq_destroy_rx_queue_cnt; +	u32 adminq_dcfg_device_resources_cnt; +	u32 adminq_set_driver_parameter_cnt; +	u32 adminq_report_stats_cnt; +	u32 adminq_report_link_speed_cnt; +	u32 adminq_get_ptype_map_cnt; + +	/* Global stats */ +	u32 interface_up_cnt; /* count of times interface turned up since last reset */ +	u32 interface_down_cnt; /* count of times interface turned down since last reset */ +	u32 reset_cnt; /* count of reset */ +	u32 page_alloc_fail; /* count of page alloc fails */ +	u32 dma_mapping_error; /* count of dma mapping errors */ +	u32 stats_report_trigger_cnt; /* count of device-requested stats-reports since last reset */ +	u32 suspend_cnt; /* count of times suspended */ +	u32 resume_cnt; /* count of times resumed */  	struct workqueue_struct *gve_wq;  	struct work_struct service_task; +	struct work_struct stats_report_task;  	unsigned long service_task_flags;  	unsigned long state_flags; + +	struct gve_stats_report *stats_report; +	u64 stats_report_len; +	dma_addr_t stats_report_bus; /* dma address for the stats report */ +	unsigned long ethtool_flags; + +	unsigned long stats_report_timer_period; +	struct timer_list stats_report_timer; + +	/* Gvnic device link speed from hypervisor. */ +	u64 link_speed; +	bool up_before_suspend; /* True if dev was up before suspend */ + +	struct gve_options_dqo_rda options_dqo_rda; +	struct gve_ptype_lut *ptype_lut_dqo; + +	/* Must be a power of two. 
*/ +	int data_buffer_size_dqo; + +	enum gve_queue_format queue_format; + +	/* Interrupt coalescing settings */ +	u32 tx_coalesce_usecs; +	u32 rx_coalesce_usecs;  }; -enum gve_service_task_flags { -	GVE_PRIV_FLAGS_DO_RESET			= BIT(1), -	GVE_PRIV_FLAGS_RESET_IN_PROGRESS	= BIT(2), -	GVE_PRIV_FLAGS_PROBE_IN_PROGRESS	= BIT(3), +enum gve_service_task_flags_bit { +	GVE_PRIV_FLAGS_DO_RESET			= 1, +	GVE_PRIV_FLAGS_RESET_IN_PROGRESS	= 2, +	GVE_PRIV_FLAGS_PROBE_IN_PROGRESS	= 3, +	GVE_PRIV_FLAGS_DO_REPORT_STATS = 4,  }; -enum gve_state_flags { -	GVE_PRIV_FLAGS_ADMIN_QUEUE_OK		= BIT(1), -	GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK	= BIT(2), -	GVE_PRIV_FLAGS_DEVICE_RINGS_OK		= BIT(3), -	GVE_PRIV_FLAGS_NAPI_ENABLED		= BIT(4), +enum gve_state_flags_bit { +	GVE_PRIV_FLAGS_ADMIN_QUEUE_OK		= 1, +	GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK	= 2, +	GVE_PRIV_FLAGS_DEVICE_RINGS_OK		= 3, +	GVE_PRIV_FLAGS_NAPI_ENABLED		= 4, +}; + +enum gve_ethtool_flags_bit { +	GVE_PRIV_FLAGS_REPORT_STATS		= 0,  };  static inline bool gve_get_do_reset(struct gve_priv *priv) @@ -269,6 +655,22 @@ static inline void gve_clear_probe_in_progress(struct gve_priv *priv)  	clear_bit(GVE_PRIV_FLAGS_PROBE_IN_PROGRESS, &priv->service_task_flags);  } +static inline bool gve_get_do_report_stats(struct gve_priv *priv) +{ +	return test_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, +			&priv->service_task_flags); +} + +static inline void gve_set_do_report_stats(struct gve_priv *priv) +{ +	set_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags); +} + +static inline void gve_clear_do_report_stats(struct gve_priv *priv) +{ +	clear_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags); +} +  static inline bool gve_get_admin_queue_ok(struct gve_priv *priv)  {  	return test_bit(GVE_PRIV_FLAGS_ADMIN_QUEUE_OK, &priv->state_flags); @@ -329,12 +731,22 @@ static inline void gve_clear_napi_enabled(struct gve_priv *priv)  	clear_bit(GVE_PRIV_FLAGS_NAPI_ENABLED, &priv->state_flags);  } +static inline bool gve_get_report_stats(struct gve_priv *priv) +{ +	return test_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags); +} + +static inline void gve_clear_report_stats(struct gve_priv *priv) +{ +	clear_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags); +} +  /* Returns the address of the ntfy_blocks irq doorbell   */  static inline __be32 __iomem *gve_irq_doorbell(struct gve_priv *priv,  					       struct gve_notify_block *block)  { -	return &priv->db_bar2[be32_to_cpu(block->irq_db_index)]; +	return &priv->db_bar2[be32_to_cpu(*block->irq_db_index)];  }  /* Returns the index into ntfy_blocks of the given tx ring's block @@ -355,6 +767,9 @@ static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx)   */  static inline u32 gve_num_tx_qpls(struct gve_priv *priv)  { +	if (priv->queue_format != GVE_GQI_QPL_FORMAT) +		return 0; +  	return priv->tx_cfg.num_queues;  } @@ -362,6 +777,9 @@ static inline u32 gve_num_tx_qpls(struct gve_priv *priv)   */  static inline u32 gve_num_rx_qpls(struct gve_priv *priv)  { +	if (priv->queue_format != GVE_GQI_QPL_FORMAT) +		return 0; +  	return priv->rx_cfg.num_queues;  } @@ -391,7 +809,7 @@ struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv)  				    gve_num_tx_qpls(priv));  	/* we are out of rx qpls */ -	if (id == priv->qpl_cfg.qpl_map_size) +	if (id == gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv))  		return NULL;  	set_bit(id, priv->qpl_cfg.qpl_id_map); @@ -416,40 +834,40 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv,  		return DMA_FROM_DEVICE;  } -/* Returns true if 
the max mtu allows page recycling */ -static inline bool gve_can_recycle_pages(struct net_device *dev) +static inline bool gve_is_gqi(struct gve_priv *priv)  { -	/* We can't recycle the pages if we can't fit a packet into half a -	 * page. -	 */ -	return dev->max_mtu <= PAGE_SIZE / 2; +	return priv->queue_format == GVE_GQI_RDA_FORMAT || +		priv->queue_format == GVE_GQI_QPL_FORMAT;  }  /* buffers */ -int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma, -		   enum dma_data_direction); +int gve_alloc_page(struct gve_priv *priv, struct device *dev, +		   struct page **page, dma_addr_t *dma, +		   enum dma_data_direction, gfp_t gfp_flags);  void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,  		   enum dma_data_direction);  /* tx handling */  netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev);  bool gve_tx_poll(struct gve_notify_block *block, int budget);  int gve_tx_alloc_rings(struct gve_priv *priv); -void gve_tx_free_rings(struct gve_priv *priv); -__be32 gve_tx_load_event_counter(struct gve_priv *priv, -				 struct gve_tx_ring *tx); +void gve_tx_free_rings_gqi(struct gve_priv *priv); +u32 gve_tx_load_event_counter(struct gve_priv *priv, +			      struct gve_tx_ring *tx); +bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx);  /* rx handling */  void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx); -bool gve_rx_poll(struct gve_notify_block *block, int budget); +int gve_rx_poll(struct gve_notify_block *block, int budget); +bool gve_rx_work_pending(struct gve_rx_ring *rx);  int gve_rx_alloc_rings(struct gve_priv *priv); -void gve_rx_free_rings(struct gve_priv *priv); -bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, -		       netdev_features_t feat); +void gve_rx_free_rings_gqi(struct gve_priv *priv);  /* Reset */  void gve_schedule_reset(struct gve_priv *priv);  int gve_reset(struct gve_priv *priv, bool attempt_teardown);  int gve_adjust_queues(struct gve_priv *priv,  		      struct gve_queue_config new_rx_config,  		      struct gve_queue_config new_tx_config); +/* report stats handling */ +void gve_handle_report_stats(struct gve_priv *priv);  /* exported by ethtool.c */  extern const struct ethtool_ops gve_ethtool_ops;  /* needed by ethtool */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index c3ba7baf0107..f7621ab672b9 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -1,7 +1,7 @@  // SPDX-License-Identifier: (GPL-2.0 OR MIT)  /* Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   */  #include <linux/etherdevice.h> @@ -14,6 +14,165 @@  #define GVE_ADMINQ_SLEEP_LEN		20  #define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK	100 +#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \ +"Expected: length=%d, feature_mask=%x.\n" \ +"Actual: length=%d, feature_mask=%x.\n" + +#define GVE_DEVICE_OPTION_TOO_BIG_FMT "Length of %s option larger than expected. Possible older version of guest driver.\n" + +static +struct gve_device_option *gve_get_next_option(struct gve_device_descriptor *descriptor, +					      struct gve_device_option *option) +{ +	void *option_end, *descriptor_end; + +	option_end = (void *)(option + 1) + be16_to_cpu(option->option_length); +	descriptor_end = (void *)descriptor + be16_to_cpu(descriptor->total_length); + +	return option_end > descriptor_end ? 
NULL : (struct gve_device_option *)option_end; +} + +static +void gve_parse_device_option(struct gve_priv *priv, +			     struct gve_device_descriptor *device_descriptor, +			     struct gve_device_option *option, +			     struct gve_device_option_gqi_rda **dev_op_gqi_rda, +			     struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, +			     struct gve_device_option_dqo_rda **dev_op_dqo_rda, +			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ +	u32 req_feat_mask = be32_to_cpu(option->required_features_mask); +	u16 option_length = be16_to_cpu(option->option_length); +	u16 option_id = be16_to_cpu(option->option_id); + +	/* If the length or feature mask doesn't match, continue without +	 * enabling the feature. +	 */ +	switch (option_id) { +	case GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING: +		if (option_length != GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING || +		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING) { +			dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, +				 "Raw Addressing", +				 GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING, +				 GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING, +				 option_length, req_feat_mask); +			break; +		} + +		dev_info(&priv->pdev->dev, +			 "Gqi raw addressing device option enabled.\n"); +		priv->queue_format = GVE_GQI_RDA_FORMAT; +		break; +	case GVE_DEV_OPT_ID_GQI_RDA: +		if (option_length < sizeof(**dev_op_gqi_rda) || +		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA) { +			dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, +				 "GQI RDA", (int)sizeof(**dev_op_gqi_rda), +				 GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA, +				 option_length, req_feat_mask); +			break; +		} + +		if (option_length > sizeof(**dev_op_gqi_rda)) { +			dev_warn(&priv->pdev->dev, +				 GVE_DEVICE_OPTION_TOO_BIG_FMT, "GQI RDA"); +		} +		*dev_op_gqi_rda = (void *)(option + 1); +		break; +	case GVE_DEV_OPT_ID_GQI_QPL: +		if (option_length < sizeof(**dev_op_gqi_qpl) || +		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL) { +			dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, +				 "GQI QPL", (int)sizeof(**dev_op_gqi_qpl), +				 GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL, +				 option_length, req_feat_mask); +			break; +		} + +		if (option_length > sizeof(**dev_op_gqi_qpl)) { +			dev_warn(&priv->pdev->dev, +				 GVE_DEVICE_OPTION_TOO_BIG_FMT, "GQI QPL"); +		} +		*dev_op_gqi_qpl = (void *)(option + 1); +		break; +	case GVE_DEV_OPT_ID_DQO_RDA: +		if (option_length < sizeof(**dev_op_dqo_rda) || +		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA) { +			dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, +				 "DQO RDA", (int)sizeof(**dev_op_dqo_rda), +				 GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA, +				 option_length, req_feat_mask); +			break; +		} + +		if (option_length > sizeof(**dev_op_dqo_rda)) { +			dev_warn(&priv->pdev->dev, +				 GVE_DEVICE_OPTION_TOO_BIG_FMT, "DQO RDA"); +		} +		*dev_op_dqo_rda = (void *)(option + 1); +		break; +	case GVE_DEV_OPT_ID_JUMBO_FRAMES: +		if (option_length < sizeof(**dev_op_jumbo_frames) || +		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { +			dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, +				 "Jumbo Frames", +				 (int)sizeof(**dev_op_jumbo_frames), +				 GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES, +				 option_length, req_feat_mask); +			break; +		} + +		if (option_length > sizeof(**dev_op_jumbo_frames)) { +			dev_warn(&priv->pdev->dev, +				 GVE_DEVICE_OPTION_TOO_BIG_FMT, +				 "Jumbo Frames"); +		} +		*dev_op_jumbo_frames = (void *)(option + 1); +		break; +	default: +		/* If we don't recognize 
the option just continue +		 * without doing anything. +		 */ +		dev_dbg(&priv->pdev->dev, "Unrecognized device option 0x%hx not enabled.\n", +			option_id); +	} +} + +/* Process all device options for a given describe device call. */ +static int +gve_process_device_options(struct gve_priv *priv, +			   struct gve_device_descriptor *descriptor, +			   struct gve_device_option_gqi_rda **dev_op_gqi_rda, +			   struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, +			   struct gve_device_option_dqo_rda **dev_op_dqo_rda, +			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ +	const int num_options = be16_to_cpu(descriptor->num_device_options); +	struct gve_device_option *dev_opt; +	int i; + +	/* The options struct directly follows the device descriptor. */ +	dev_opt = (void *)(descriptor + 1); +	for (i = 0; i < num_options; i++) { +		struct gve_device_option *next_opt; + +		next_opt = gve_get_next_option(descriptor, dev_opt); +		if (!next_opt) { +			dev_err(&priv->dev->dev, +				"options exceed device_descriptor's total length.\n"); +			return -EINVAL; +		} + +		gve_parse_device_option(priv, descriptor, dev_opt, +					dev_op_gqi_rda, dev_op_gqi_qpl, +					dev_op_dqo_rda, dev_op_jumbo_frames); +		dev_opt = next_opt; +	} + +	return 0; +} +  int gve_adminq_alloc(struct device *dev, struct gve_priv *priv)  {  	priv->adminq = dma_alloc_coherent(dev, PAGE_SIZE, @@ -23,6 +182,21 @@ int gve_adminq_alloc(struct device *dev, struct gve_priv *priv)  	priv->adminq_mask = (PAGE_SIZE / sizeof(union gve_adminq_command)) - 1;  	priv->adminq_prod_cnt = 0; +	priv->adminq_cmd_fail = 0; +	priv->adminq_timeouts = 0; +	priv->adminq_describe_device_cnt = 0; +	priv->adminq_cfg_device_resources_cnt = 0; +	priv->adminq_register_page_list_cnt = 0; +	priv->adminq_unregister_page_list_cnt = 0; +	priv->adminq_create_tx_queue_cnt = 0; +	priv->adminq_create_rx_queue_cnt = 0; +	priv->adminq_destroy_tx_queue_cnt = 0; +	priv->adminq_destroy_rx_queue_cnt = 0; +	priv->adminq_dcfg_device_resources_cnt = 0; +	priv->adminq_set_driver_parameter_cnt = 0; +	priv->adminq_report_stats_cnt = 0; +	priv->adminq_report_link_speed_cnt = 0; +	priv->adminq_get_ptype_map_cnt = 0;  	/* Setup Admin queue with the device */  	iowrite32be(priv->adminq_bus_addr / PAGE_SIZE, @@ -81,17 +255,18 @@ static bool gve_adminq_wait_for_cmd(struct gve_priv *priv, u32 prod_cnt)  	return false;  } -static int gve_adminq_parse_err(struct device *dev, u32 status) +static int gve_adminq_parse_err(struct gve_priv *priv, u32 status)  {  	if (status != GVE_ADMINQ_COMMAND_PASSED && -	    status != GVE_ADMINQ_COMMAND_UNSET) -		dev_err(dev, "AQ command failed with status %d\n", status); - +	    status != GVE_ADMINQ_COMMAND_UNSET) { +		dev_err(&priv->pdev->dev, "AQ command failed with status %d\n", status); +		priv->adminq_cmd_fail++; +	}  	switch (status) {  	case GVE_ADMINQ_COMMAND_PASSED:  		return 0;  	case GVE_ADMINQ_COMMAND_UNSET: -		dev_err(dev, "parse_aq_err: err and status both unset, this should not be possible.\n"); +		dev_err(&priv->pdev->dev, "parse_aq_err: err and status both unset, this should not be possible.\n");  		return -EINVAL;  	case GVE_ADMINQ_COMMAND_ERROR_ABORTED:  	case GVE_ADMINQ_COMMAND_ERROR_CANCELLED: @@ -116,36 +291,151 @@ static int gve_adminq_parse_err(struct device *dev, u32 status)  	case GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED:  		return -ENOTSUPP;  	default: -		dev_err(dev, "parse_aq_err: unknown status code %d\n", status); +		dev_err(&priv->pdev->dev, "parse_aq_err: unknown status code %d\n", status);  		return -EINVAL;  	
}  } +/* Flushes all AQ commands currently queued and waits for them to complete. + * If there are failures, it will return the first error. + */ +static int gve_adminq_kick_and_wait(struct gve_priv *priv) +{ +	int tail, head; +	int i; + +	tail = ioread32be(&priv->reg_bar0->adminq_event_counter); +	head = priv->adminq_prod_cnt; + +	gve_adminq_kick_cmd(priv, head); +	if (!gve_adminq_wait_for_cmd(priv, head)) { +		dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n"); +		priv->adminq_timeouts++; +		return -ENOTRECOVERABLE; +	} + +	for (i = tail; i < head; i++) { +		union gve_adminq_command *cmd; +		u32 status, err; + +		cmd = &priv->adminq[i & priv->adminq_mask]; +		status = be32_to_cpu(READ_ONCE(cmd->status)); +		err = gve_adminq_parse_err(priv, status); +		if (err) +			// Return the first error if we failed. +			return err; +	} + +	return 0; +} +  /* This function is not threadsafe - the caller is responsible for any   * necessary locks.   */ -int gve_adminq_execute_cmd(struct gve_priv *priv, -			   union gve_adminq_command *cmd_orig) +static int gve_adminq_issue_cmd(struct gve_priv *priv, +				union gve_adminq_command *cmd_orig)  {  	union gve_adminq_command *cmd; -	u32 status = 0; -	u32 prod_cnt; +	u32 opcode; +	u32 tail; + +	tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + +	// Check if next command will overflow the buffer. +	if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == +	    (tail & priv->adminq_mask)) { +		int err; + +		// Flush existing commands to make room. +		err = gve_adminq_kick_and_wait(priv); +		if (err) +			return err; + +		// Retry. +		tail = ioread32be(&priv->reg_bar0->adminq_event_counter); +		if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == +		    (tail & priv->adminq_mask)) { +			// This should never happen. We just flushed the +			// command queue so there should be enough space. 
+			return -ENOMEM; +		} +	}  	cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask];  	priv->adminq_prod_cnt++; -	prod_cnt = priv->adminq_prod_cnt;  	memcpy(cmd, cmd_orig, sizeof(*cmd_orig)); - -	gve_adminq_kick_cmd(priv, prod_cnt); -	if (!gve_adminq_wait_for_cmd(priv, prod_cnt)) { -		dev_err(&priv->pdev->dev, "AQ command timed out, need to reset AQ\n"); -		return -ENOTRECOVERABLE; +	opcode = be32_to_cpu(READ_ONCE(cmd->opcode)); + +	switch (opcode) { +	case GVE_ADMINQ_DESCRIBE_DEVICE: +		priv->adminq_describe_device_cnt++; +		break; +	case GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES: +		priv->adminq_cfg_device_resources_cnt++; +		break; +	case GVE_ADMINQ_REGISTER_PAGE_LIST: +		priv->adminq_register_page_list_cnt++; +		break; +	case GVE_ADMINQ_UNREGISTER_PAGE_LIST: +		priv->adminq_unregister_page_list_cnt++; +		break; +	case GVE_ADMINQ_CREATE_TX_QUEUE: +		priv->adminq_create_tx_queue_cnt++; +		break; +	case GVE_ADMINQ_CREATE_RX_QUEUE: +		priv->adminq_create_rx_queue_cnt++; +		break; +	case GVE_ADMINQ_DESTROY_TX_QUEUE: +		priv->adminq_destroy_tx_queue_cnt++; +		break; +	case GVE_ADMINQ_DESTROY_RX_QUEUE: +		priv->adminq_destroy_rx_queue_cnt++; +		break; +	case GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES: +		priv->adminq_dcfg_device_resources_cnt++; +		break; +	case GVE_ADMINQ_SET_DRIVER_PARAMETER: +		priv->adminq_set_driver_parameter_cnt++; +		break; +	case GVE_ADMINQ_REPORT_STATS: +		priv->adminq_report_stats_cnt++; +		break; +	case GVE_ADMINQ_REPORT_LINK_SPEED: +		priv->adminq_report_link_speed_cnt++; +		break; +	case GVE_ADMINQ_GET_PTYPE_MAP: +		priv->adminq_get_ptype_map_cnt++; +		break; +	default: +		dev_err(&priv->pdev->dev, "unknown AQ command opcode %d\n", opcode);  	} -	memcpy(cmd_orig, cmd, sizeof(*cmd)); -	status = be32_to_cpu(READ_ONCE(cmd->status)); -	return gve_adminq_parse_err(&priv->pdev->dev, status); +	return 0; +} + +/* This function is not threadsafe - the caller is responsible for any + * necessary locks. + * The caller is also responsible for making sure there are no commands + * waiting to be executed. 
+ */ +static int gve_adminq_execute_cmd(struct gve_priv *priv, +				  union gve_adminq_command *cmd_orig) +{ +	u32 tail, head; +	int err; + +	tail = ioread32be(&priv->reg_bar0->adminq_event_counter); +	head = priv->adminq_prod_cnt; +	if (tail != head) +		// This is not a valid path +		return -EINVAL; + +	err = gve_adminq_issue_cmd(priv, cmd_orig); +	if (err) +		return err; + +	return gve_adminq_kick_and_wait(priv);  }  /* The device specifies that the management vector can either be the first irq @@ -172,9 +462,10 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv,  		.num_counters = cpu_to_be32(num_counters),  		.irq_db_addr = cpu_to_be64(db_array_bus_addr),  		.num_irq_dbs = cpu_to_be32(num_ntfy_blks), -		.irq_db_stride = cpu_to_be32(sizeof(priv->ntfy_blocks[0])), +		.irq_db_stride = cpu_to_be32(sizeof(*priv->irq_db_indices)),  		.ntfy_blk_msix_base_idx =  					cpu_to_be32(GVE_NTFY_BLK_BASE_MSIX_IDX), +		.queue_format = priv->queue_format,  	};  	return gve_adminq_execute_cmd(priv, &cmd); @@ -190,7 +481,7 @@ int gve_adminq_deconfigure_device_resources(struct gve_priv *priv)  	return gve_adminq_execute_cmd(priv, &cmd);  } -int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) +static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)  {  	struct gve_tx_ring *tx = &priv->tx[queue_index];  	union gve_adminq_command cmd; @@ -199,17 +490,44 @@ int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)  	cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE);  	cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) {  		.queue_id = cpu_to_be32(queue_index), -		.reserved = 0, -		.queue_resources_addr = cpu_to_be64(tx->q_resources_bus), +		.queue_resources_addr = +			cpu_to_be64(tx->q_resources_bus),  		.tx_ring_addr = cpu_to_be64(tx->bus), -		.queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id),  		.ntfy_id = cpu_to_be32(tx->ntfy_id),  	}; -	return gve_adminq_execute_cmd(priv, &cmd); +	if (gve_is_gqi(priv)) { +		u32 qpl_id = priv->queue_format == GVE_GQI_RDA_FORMAT ? 
+			GVE_RAW_ADDRESSING_QPL_ID : tx->tx_fifo.qpl->id; + +		cmd.create_tx_queue.queue_page_list_id = cpu_to_be32(qpl_id); +	} else { +		cmd.create_tx_queue.tx_ring_size = +			cpu_to_be16(priv->tx_desc_cnt); +		cmd.create_tx_queue.tx_comp_ring_addr = +			cpu_to_be64(tx->complq_bus_dqo); +		cmd.create_tx_queue.tx_comp_ring_size = +			cpu_to_be16(priv->options_dqo_rda.tx_comp_ring_entries); +	} + +	return gve_adminq_issue_cmd(priv, &cmd);  } -int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) +int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues) +{ +	int err; +	int i; + +	for (i = 0; i < num_queues; i++) { +		err = gve_adminq_create_tx_queue(priv, i); +		if (err) +			return err; +	} + +	return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)  {  	struct gve_rx_ring *rx = &priv->rx[queue_index];  	union gve_adminq_command cmd; @@ -218,21 +536,57 @@ int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)  	cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);  	cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {  		.queue_id = cpu_to_be32(queue_index), -		.index = cpu_to_be32(queue_index), -		.reserved = 0,  		.ntfy_id = cpu_to_be32(rx->ntfy_id),  		.queue_resources_addr = cpu_to_be64(rx->q_resources_bus), -		.rx_desc_ring_addr = cpu_to_be64(rx->desc.bus), -		.rx_data_ring_addr = cpu_to_be64(rx->data.data_bus), -		.queue_page_list_id = cpu_to_be32(rx->data.qpl->id),  	}; -	return gve_adminq_execute_cmd(priv, &cmd); +	if (gve_is_gqi(priv)) { +		u32 qpl_id = priv->queue_format == GVE_GQI_RDA_FORMAT ? +			GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id; + +		cmd.create_rx_queue.rx_desc_ring_addr = +			cpu_to_be64(rx->desc.bus), +		cmd.create_rx_queue.rx_data_ring_addr = +			cpu_to_be64(rx->data.data_bus), +		cmd.create_rx_queue.index = cpu_to_be32(queue_index); +		cmd.create_rx_queue.queue_page_list_id = cpu_to_be32(qpl_id); +		cmd.create_rx_queue.packet_buffer_size = cpu_to_be16(rx->packet_buffer_size); +	} else { +		cmd.create_rx_queue.rx_ring_size = +			cpu_to_be16(priv->rx_desc_cnt); +		cmd.create_rx_queue.rx_desc_ring_addr = +			cpu_to_be64(rx->dqo.complq.bus); +		cmd.create_rx_queue.rx_data_ring_addr = +			cpu_to_be64(rx->dqo.bufq.bus); +		cmd.create_rx_queue.packet_buffer_size = +			cpu_to_be16(priv->data_buffer_size_dqo); +		cmd.create_rx_queue.rx_buff_ring_size = +			cpu_to_be16(priv->options_dqo_rda.rx_buff_ring_entries); +		cmd.create_rx_queue.enable_rsc = +			!!(priv->dev->features & NETIF_F_LRO); +	} + +	return gve_adminq_issue_cmd(priv, &cmd);  } -int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) +int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues) +{ +	int err; +	int i; + +	for (i = 0; i < num_queues; i++) { +		err = gve_adminq_create_rx_queue(priv, i); +		if (err) +			return err; +	} + +	return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)  {  	union gve_adminq_command cmd; +	int err;  	memset(&cmd, 0, sizeof(cmd));  	cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_TX_QUEUE); @@ -240,12 +594,31 @@ int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)  		.queue_id = cpu_to_be32(queue_index),  	}; -	return gve_adminq_execute_cmd(priv, &cmd); +	err = gve_adminq_issue_cmd(priv, &cmd); +	if (err) +		return err; + +	return 0; +} + +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues) +{ +	int err; +	int i; + +	for (i = 0; i 
< num_queues; i++) { +		err = gve_adminq_destroy_tx_queue(priv, i); +		if (err) +			return err; +	} + +	return gve_adminq_kick_and_wait(priv);  } -int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index) +static int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index)  {  	union gve_adminq_command cmd; +	int err;  	memset(&cmd, 0, sizeof(cmd));  	cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_RX_QUEUE); @@ -253,12 +626,86 @@ int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index)  		.queue_id = cpu_to_be32(queue_index),  	}; -	return gve_adminq_execute_cmd(priv, &cmd); +	err = gve_adminq_issue_cmd(priv, &cmd); +	if (err) +		return err; + +	return 0; +} + +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues) +{ +	int err; +	int i; + +	for (i = 0; i < num_queues; i++) { +		err = gve_adminq_destroy_rx_queue(priv, i); +		if (err) +			return err; +	} + +	return gve_adminq_kick_and_wait(priv); +} + +static int gve_set_desc_cnt(struct gve_priv *priv, +			    struct gve_device_descriptor *descriptor) +{ +	priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); +	if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { +		dev_err(&priv->pdev->dev, "Tx desc count %d too low\n", +			priv->tx_desc_cnt); +		return -EINVAL; +	} +	priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); +	if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) +	    < PAGE_SIZE) { +		dev_err(&priv->pdev->dev, "Rx desc count %d too low\n", +			priv->rx_desc_cnt); +		return -EINVAL; +	} +	return 0; +} + +static int +gve_set_desc_cnt_dqo(struct gve_priv *priv, +		     const struct gve_device_descriptor *descriptor, +		     const struct gve_device_option_dqo_rda *dev_op_dqo_rda) +{ +	priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); +	priv->options_dqo_rda.tx_comp_ring_entries = +		be16_to_cpu(dev_op_dqo_rda->tx_comp_ring_entries); +	priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); +	priv->options_dqo_rda.rx_buff_ring_entries = +		be16_to_cpu(dev_op_dqo_rda->rx_buff_ring_entries); + +	return 0; +} + +static void gve_enable_supported_features(struct gve_priv *priv, +					  u32 supported_features_mask, +					  const struct gve_device_option_jumbo_frames +						  *dev_op_jumbo_frames) +{ +	/* Before control reaches this point, the page-size-capped max MTU from +	 * the gve_device_descriptor field has already been stored in +	 * priv->dev->max_mtu. We overwrite it with the true max MTU below. 
+	 */ +	if (dev_op_jumbo_frames && +	    (supported_features_mask & GVE_SUP_JUMBO_FRAMES_MASK)) { +		dev_info(&priv->pdev->dev, +			 "JUMBO FRAMES device option enabled.\n"); +		priv->dev->max_mtu = be16_to_cpu(dev_op_jumbo_frames->max_mtu); +	}  }  int gve_adminq_describe_device(struct gve_priv *priv)  { +	struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; +	struct gve_device_option_gqi_rda *dev_op_gqi_rda = NULL; +	struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; +	struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;  	struct gve_device_descriptor *descriptor; +	u32 supported_features_mask = 0;  	union gve_adminq_command cmd;  	dma_addr_t descriptor_bus;  	int err = 0; @@ -281,48 +728,77 @@ int gve_adminq_describe_device(struct gve_priv *priv)  	if (err)  		goto free_device_descriptor; -	priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); -	if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { -		netif_err(priv, drv, priv->dev, "Tx desc count %d too low\n", -			  priv->tx_desc_cnt); -		err = -EINVAL; +	err = gve_process_device_options(priv, descriptor, &dev_op_gqi_rda, +					 &dev_op_gqi_qpl, &dev_op_dqo_rda, +					 &dev_op_jumbo_frames); +	if (err)  		goto free_device_descriptor; + +	/* If the GQI_RAW_ADDRESSING option is not enabled and the queue format +	 * is not set to GqiRda, choose the queue format in a priority order: +	 * DqoRda, GqiRda, GqiQpl. Use GqiQpl as default. +	 */ +	if (dev_op_dqo_rda) { +		priv->queue_format = GVE_DQO_RDA_FORMAT; +		dev_info(&priv->pdev->dev, +			 "Driver is running with DQO RDA queue format.\n"); +		supported_features_mask = +			be32_to_cpu(dev_op_dqo_rda->supported_features_mask); +	} else if (dev_op_gqi_rda) { +		priv->queue_format = GVE_GQI_RDA_FORMAT; +		dev_info(&priv->pdev->dev, +			 "Driver is running with GQI RDA queue format.\n"); +		supported_features_mask = +			be32_to_cpu(dev_op_gqi_rda->supported_features_mask); +	} else if (priv->queue_format == GVE_GQI_RDA_FORMAT) { +		dev_info(&priv->pdev->dev, +			 "Driver is running with GQI RDA queue format.\n"); +	} else { +		priv->queue_format = GVE_GQI_QPL_FORMAT; +		if (dev_op_gqi_qpl) +			supported_features_mask = +				be32_to_cpu(dev_op_gqi_qpl->supported_features_mask); +		dev_info(&priv->pdev->dev, +			 "Driver is running with GQI QPL queue format.\n");  	} -	priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); -	if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) -	    < PAGE_SIZE || -	    priv->rx_desc_cnt * sizeof(priv->rx->data.data_ring[0]) -	    < PAGE_SIZE) { -		netif_err(priv, drv, priv->dev, "Rx desc count %d too low\n", -			  priv->rx_desc_cnt); -		err = -EINVAL; -		goto free_device_descriptor; +	if (gve_is_gqi(priv)) { +		err = gve_set_desc_cnt(priv, descriptor); +	} else { +		/* DQO supports LRO. 
*/ +		priv->dev->hw_features |= NETIF_F_LRO; +		err = gve_set_desc_cnt_dqo(priv, descriptor, dev_op_dqo_rda);  	} +	if (err) +		goto free_device_descriptor; +  	priv->max_registered_pages =  				be64_to_cpu(descriptor->max_registered_pages);  	mtu = be16_to_cpu(descriptor->mtu);  	if (mtu < ETH_MIN_MTU) { -		netif_err(priv, drv, priv->dev, "MTU %d below minimum MTU\n", -			  mtu); +		dev_err(&priv->pdev->dev, "MTU %d below minimum MTU\n", mtu);  		err = -EINVAL;  		goto free_device_descriptor;  	}  	priv->dev->max_mtu = mtu;  	priv->num_event_counters = be16_to_cpu(descriptor->counters); -	ether_addr_copy(priv->dev->dev_addr, descriptor->mac); +	eth_hw_addr_set(priv->dev, descriptor->mac);  	mac = descriptor->mac; -	netif_info(priv, drv, priv->dev, "MAC addr: %pM\n", mac); +	dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac);  	priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl); -	priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl); -	if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) { -		netif_err(priv, drv, priv->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", -			  priv->rx_pages_per_qpl); -		priv->rx_desc_cnt = priv->rx_pages_per_qpl; +	priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl); + +	if (gve_is_gqi(priv) && priv->rx_data_slot_cnt < priv->rx_desc_cnt) { +		dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", +			priv->rx_data_slot_cnt); +		priv->rx_desc_cnt = priv->rx_data_slot_cnt;  	}  	priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues); +	gve_enable_supported_features(priv, supported_features_mask, +				      dev_op_jumbo_frames); +  free_device_descriptor: -	dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor, +	dma_free_coherent(&priv->pdev->dev, PAGE_SIZE, descriptor,  			  descriptor_bus);  	return err;  } @@ -385,3 +861,84 @@ int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu)  	return gve_adminq_execute_cmd(priv, &cmd);  } + +int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, +			    dma_addr_t stats_report_addr, u64 interval) +{ +	union gve_adminq_command cmd; + +	memset(&cmd, 0, sizeof(cmd)); +	cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_STATS); +	cmd.report_stats = (struct gve_adminq_report_stats) { +		.stats_report_len = cpu_to_be64(stats_report_len), +		.stats_report_addr = cpu_to_be64(stats_report_addr), +		.interval = cpu_to_be64(interval), +	}; + +	return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_report_link_speed(struct gve_priv *priv) +{ +	union gve_adminq_command gvnic_cmd; +	dma_addr_t link_speed_region_bus; +	__be64 *link_speed_region; +	int err; + +	link_speed_region = +		dma_alloc_coherent(&priv->pdev->dev, sizeof(*link_speed_region), +				   &link_speed_region_bus, GFP_KERNEL); + +	if (!link_speed_region) +		return -ENOMEM; + +	memset(&gvnic_cmd, 0, sizeof(gvnic_cmd)); +	gvnic_cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_LINK_SPEED); +	gvnic_cmd.report_link_speed.link_speed_address = +		cpu_to_be64(link_speed_region_bus); + +	err = gve_adminq_execute_cmd(priv, &gvnic_cmd); + +	priv->link_speed = be64_to_cpu(*link_speed_region); +	dma_free_coherent(&priv->pdev->dev, sizeof(*link_speed_region), link_speed_region, +			  link_speed_region_bus); +	return err; +} + +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, +				 struct gve_ptype_lut *ptype_lut) +{ +	struct gve_ptype_map *ptype_map; +	union gve_adminq_command cmd; +	
dma_addr_t ptype_map_bus; +	int err = 0; +	int i; + +	memset(&cmd, 0, sizeof(cmd)); +	ptype_map = dma_alloc_coherent(&priv->pdev->dev, sizeof(*ptype_map), +				       &ptype_map_bus, GFP_KERNEL); +	if (!ptype_map) +		return -ENOMEM; + +	cmd.opcode = cpu_to_be32(GVE_ADMINQ_GET_PTYPE_MAP); +	cmd.get_ptype_map = (struct gve_adminq_get_ptype_map) { +		.ptype_map_len = cpu_to_be64(sizeof(*ptype_map)), +		.ptype_map_addr = cpu_to_be64(ptype_map_bus), +	}; + +	err = gve_adminq_execute_cmd(priv, &cmd); +	if (err) +		goto err; + +	/* Populate ptype_lut. */ +	for (i = 0; i < GVE_NUM_PTYPES; i++) { +		ptype_lut->ptypes[i].l3_type = +			ptype_map->ptypes[i].l3_type; +		ptype_lut->ptypes[i].l4_type = +			ptype_map->ptypes[i].l4_type; +	} +err: +	dma_free_coherent(&priv->pdev->dev, sizeof(*ptype_map), ptype_map, +			  ptype_map_bus); +	return err; +} diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index 4dfa06edc0f8..83c0b40cd2d9 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -1,7 +1,7 @@  /* SPDX-License-Identifier: (GPL-2.0 OR MIT)   * Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   */  #ifndef _GVE_ADMINQ_H @@ -21,6 +21,9 @@ enum gve_adminq_opcodes {  	GVE_ADMINQ_DESTROY_RX_QUEUE		= 0x8,  	GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES	= 0x9,  	GVE_ADMINQ_SET_DRIVER_PARAMETER		= 0xB, +	GVE_ADMINQ_REPORT_STATS			= 0xC, +	GVE_ADMINQ_REPORT_LINK_SPEED		= 0xD, +	GVE_ADMINQ_GET_PTYPE_MAP		= 0xE,  };  /* Admin queue status codes */ @@ -77,12 +80,71 @@ struct gve_device_descriptor {  static_assert(sizeof(struct gve_device_descriptor) == 40); -struct device_option { -	__be32 option_id; -	__be32 option_length; +struct gve_device_option { +	__be16 option_id; +	__be16 option_length; +	__be32 required_features_mask;  }; -static_assert(sizeof(struct device_option) == 8); +static_assert(sizeof(struct gve_device_option) == 8); + +struct gve_device_option_gqi_rda { +	__be32 supported_features_mask; +}; + +static_assert(sizeof(struct gve_device_option_gqi_rda) == 4); + +struct gve_device_option_gqi_qpl { +	__be32 supported_features_mask; +}; + +static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4); + +struct gve_device_option_dqo_rda { +	__be32 supported_features_mask; +	__be16 tx_comp_ring_entries; +	__be16 rx_buff_ring_entries; +}; + +static_assert(sizeof(struct gve_device_option_dqo_rda) == 8); + +struct gve_device_option_jumbo_frames { +	__be32 supported_features_mask; +	__be16 max_mtu; +	u8 padding[2]; +}; + +static_assert(sizeof(struct gve_device_option_jumbo_frames) == 8); + +/* Terminology: + * + * RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA + *       mapped and read/updated by the device. + * + * QPL - Queue Page Lists - Driver uses bounce buffers which are DMA mapped with + *       the device for read/write and data is copied from/to SKBs. 
+ */ +enum gve_dev_opt_id { +	GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING = 0x1, +	GVE_DEV_OPT_ID_GQI_RDA = 0x2, +	GVE_DEV_OPT_ID_GQI_QPL = 0x3, +	GVE_DEV_OPT_ID_DQO_RDA = 0x4, +	GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, +}; + +enum gve_dev_opt_req_feat_mask { +	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING = 0x0, +	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, +	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, +	GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, +	GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, +}; + +enum gve_sup_feature_mask { +	GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2, +}; + +#define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0  struct gve_adminq_configure_device_resources {  	__be64 counter_array; @@ -91,9 +153,11 @@ struct gve_adminq_configure_device_resources {  	__be32 num_irq_dbs;  	__be32 irq_db_stride;  	__be32 ntfy_blk_msix_base_idx; +	u8 queue_format; +	u8 padding[7];  }; -static_assert(sizeof(struct gve_adminq_configure_device_resources) == 32); +static_assert(sizeof(struct gve_adminq_configure_device_resources) == 40);  struct gve_adminq_register_page_list {  	__be32 page_list_id; @@ -109,6 +173,8 @@ struct gve_adminq_unregister_page_list {  static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4); +#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF +  struct gve_adminq_create_tx_queue {  	__be32 queue_id;  	__be32 reserved; @@ -116,9 +182,13 @@ struct gve_adminq_create_tx_queue {  	__be64 tx_ring_addr;  	__be32 queue_page_list_id;  	__be32 ntfy_id; +	__be64 tx_comp_ring_addr; +	__be16 tx_ring_size; +	__be16 tx_comp_ring_size; +	u8 padding[4];  }; -static_assert(sizeof(struct gve_adminq_create_tx_queue) == 32); +static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48);  struct gve_adminq_create_rx_queue {  	__be32 queue_id; @@ -129,10 +199,14 @@ struct gve_adminq_create_rx_queue {  	__be64 rx_desc_ring_addr;  	__be64 rx_data_ring_addr;  	__be32 queue_page_list_id; -	u8 padding[4]; +	__be16 rx_ring_size; +	__be16 packet_buffer_size; +	__be16 rx_buff_ring_size; +	u8 enable_rsc; +	u8 padding[5];  }; -static_assert(sizeof(struct gve_adminq_create_rx_queue) == 48); +static_assert(sizeof(struct gve_adminq_create_rx_queue) == 56);  /* Queue resources that are shared with the device */  struct gve_queue_resources { @@ -172,6 +246,87 @@ struct gve_adminq_set_driver_parameter {  static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16); +struct gve_adminq_report_stats { +	__be64 stats_report_len; +	__be64 stats_report_addr; +	__be64 interval; +}; + +static_assert(sizeof(struct gve_adminq_report_stats) == 24); + +struct gve_adminq_report_link_speed { +	__be64 link_speed_address; +}; + +static_assert(sizeof(struct gve_adminq_report_link_speed) == 8); + +struct stats { +	__be32 stat_name; +	__be32 queue_id; +	__be64 value; +}; + +static_assert(sizeof(struct stats) == 16); + +struct gve_stats_report { +	__be64 written_count; +	struct stats stats[]; +}; + +static_assert(sizeof(struct gve_stats_report) == 8); + +enum gve_stat_names { +	// stats from gve +	TX_WAKE_CNT			= 1, +	TX_STOP_CNT			= 2, +	TX_FRAMES_SENT			= 3, +	TX_BYTES_SENT			= 4, +	TX_LAST_COMPLETION_PROCESSED	= 5, +	RX_NEXT_EXPECTED_SEQUENCE	= 6, +	RX_BUFFERS_POSTED		= 7, +	TX_TIMEOUT_CNT			= 8, +	// stats from NIC +	RX_QUEUE_DROP_CNT		= 65, +	RX_NO_BUFFERS_POSTED		= 66, +	RX_DROPS_PACKET_OVER_MRU	= 67, +	RX_DROPS_INVALID_CHECKSUM	= 68, +}; + +enum gve_l3_type { +	/* Must be zero so zero initialized LUT is unknown. 
*/ +	GVE_L3_TYPE_UNKNOWN = 0, +	GVE_L3_TYPE_OTHER, +	GVE_L3_TYPE_IPV4, +	GVE_L3_TYPE_IPV6, +}; + +enum gve_l4_type { +	/* Must be zero so zero initialized LUT is unknown. */ +	GVE_L4_TYPE_UNKNOWN = 0, +	GVE_L4_TYPE_OTHER, +	GVE_L4_TYPE_TCP, +	GVE_L4_TYPE_UDP, +	GVE_L4_TYPE_ICMP, +	GVE_L4_TYPE_SCTP, +}; + +/* These are control path types for PTYPE which are the same as the data path + * types. + */ +struct gve_ptype_entry { +	u8 l3_type; +	u8 l4_type; +}; + +struct gve_ptype_map { +	struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */ +}; + +struct gve_adminq_get_ptype_map { +	__be64 ptype_map_len; +	__be64 ptype_map_addr; +}; +  union gve_adminq_command {  	struct {  		__be32 opcode; @@ -187,6 +342,9 @@ union gve_adminq_command {  			struct gve_adminq_register_page_list reg_page_list;  			struct gve_adminq_unregister_page_list unreg_page_list;  			struct gve_adminq_set_driver_parameter set_driver_param; +			struct gve_adminq_report_stats report_stats; +			struct gve_adminq_report_link_speed report_link_speed; +			struct gve_adminq_get_ptype_map get_ptype_map;  		};  	};  	u8 reserved[64]; @@ -197,8 +355,6 @@ static_assert(sizeof(union gve_adminq_command) == 64);  int gve_adminq_alloc(struct device *dev, struct gve_priv *priv);  void gve_adminq_free(struct device *dev, struct gve_priv *priv);  void gve_adminq_release(struct gve_priv *priv); -int gve_adminq_execute_cmd(struct gve_priv *priv, -			   union gve_adminq_command *cmd_orig);  int gve_adminq_describe_device(struct gve_priv *priv);  int gve_adminq_configure_device_resources(struct gve_priv *priv,  					  dma_addr_t counter_array_bus_addr, @@ -206,12 +362,20 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv,  					  dma_addr_t db_array_bus_addr,  					  u32 num_ntfy_blks);  int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); -int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_id); -int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_id); -int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_id); -int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_id); +int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues); +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 queue_id); +int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues); +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 queue_id);  int gve_adminq_register_page_list(struct gve_priv *priv,  				  struct gve_queue_page_list *qpl);  int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id);  int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); +int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, +			    dma_addr_t stats_report_addr, u64 interval); +int gve_adminq_report_link_speed(struct gve_priv *priv); + +struct gve_ptype_lut; +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, +				 struct gve_ptype_lut *ptype_lut); +  #endif /* _GVE_ADMINQ_H */ diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h index 54779871d52e..f4ae9e19b844 100644 --- a/drivers/net/ethernet/google/gve/gve_desc.h +++ b/drivers/net/ethernet/google/gve/gve_desc.h @@ -16,9 +16,11 @@   * Base addresses encoded in seg_addr are not assumed to be physical   * addresses. The ring format assumes these come from some linear address   * space. This could be physical memory, kernel virtual memory, user virtual - * memory. gVNIC uses lists of registered pages. 
Each queue is assumed - * to be associated with a single such linear address space to ensure a - * consistent meaning for seg_addrs posted to its rings. + * memory. + * If raw dma addressing is not supported then gVNIC uses lists of registered + * pages. Each queue is assumed to be associated with a single such linear + * address space to ensure a consistent meaning for seg_addrs posted to its + * rings.   */  struct gve_tx_pkt_desc { @@ -31,6 +33,14 @@ struct gve_tx_pkt_desc {  	__be64	seg_addr;  /* Base address (see note) of this segment */  } __packed; +struct gve_tx_mtd_desc { +	u8      type_flags;     /* type is lower 4 bits, subtype upper  */ +	u8      path_state;     /* state is lower 4 bits, hash type upper */ +	__be16  reserved0; +	__be32  path_hash; +	__be64  reserved1; +} __packed; +  struct gve_tx_seg_desc {  	u8	type_flags;	/* type is lower 4 bits, flags upper	*/  	u8	l3_offset;	/* TSO: 2 byte units to start of IPH	*/ @@ -44,6 +54,7 @@ struct gve_tx_seg_desc {  #define	GVE_TXD_STD		(0x0 << 4) /* Std with Host Address	*/  #define	GVE_TXD_TSO		(0x1 << 4) /* TSO with Host Address	*/  #define	GVE_TXD_SEG		(0x2 << 4) /* Seg with Host Address	*/ +#define	GVE_TXD_MTD		(0x3 << 4) /* Metadata			*/  /* GVE Transmit Descriptor Flags for Std Pkts */  #define	GVE_TXF_L4CSUM	BIT(0)	/* Need csum offload */ @@ -52,6 +63,17 @@ struct gve_tx_seg_desc {  /* GVE Transmit Descriptor Flags for TSO Segs */  #define	GVE_TXSF_IPV6	BIT(1)	/* IPv6 TSO */ +/* GVE Transmit Descriptor Options for MTD Segs */ +#define GVE_MTD_SUBTYPE_PATH		0 + +#define GVE_MTD_PATH_STATE_DEFAULT	0 +#define GVE_MTD_PATH_STATE_TIMEOUT	1 +#define GVE_MTD_PATH_STATE_CONGESTION	2 +#define GVE_MTD_PATH_STATE_RETRANSMIT	3 + +#define GVE_MTD_PATH_HASH_NONE         (0x0 << 4) +#define GVE_MTD_PATH_HASH_L4           (0x1 << 4) +  /* GVE Receive Packet Descriptor */  /* The start of an ethernet packet comes 2 bytes into the rx buffer.   * gVNIC adds this padding so that both the DMA and the L3/4 protocol header @@ -72,12 +94,15 @@ struct gve_rx_desc {  } __packed;  static_assert(sizeof(struct gve_rx_desc) == 64); -/* As with the Tx ring format, the qpl_offset entries below are offsets into an - * ordered list of registered pages. +/* If the device supports raw dma addressing then the addr in data slot is + * the dma address of the buffer. + * If the device only supports registered segments then the addr is a byte + * offset into the registered segment (an ordered list of pages) where the + * buffer is.   
*/ -struct gve_rx_data_slot { -	/* byte offset into the rx registered segment of this slot */ +union gve_rx_data_slot {  	__be64 qpl_offset; +	__be64 addr;  };  /* GVE Recive Packet Descriptor Seq No */ @@ -85,12 +110,13 @@ struct gve_rx_data_slot {  /* GVE Recive Packet Descriptor Flags */  #define GVE_RXFLG(x)	cpu_to_be16(1 << (3 + (x))) -#define	GVE_RXF_FRAG	GVE_RXFLG(3)	/* IP Fragment			*/ -#define	GVE_RXF_IPV4	GVE_RXFLG(4)	/* IPv4				*/ -#define	GVE_RXF_IPV6	GVE_RXFLG(5)	/* IPv6				*/ -#define	GVE_RXF_TCP	GVE_RXFLG(6)	/* TCP Packet			*/ -#define	GVE_RXF_UDP	GVE_RXFLG(7)	/* UDP Packet			*/ -#define	GVE_RXF_ERR	GVE_RXFLG(8)	/* Packet Error Detected	*/ +#define	GVE_RXF_FRAG		GVE_RXFLG(3)	/* IP Fragment			*/ +#define	GVE_RXF_IPV4		GVE_RXFLG(4)	/* IPv4				*/ +#define	GVE_RXF_IPV6		GVE_RXFLG(5)	/* IPv6				*/ +#define	GVE_RXF_TCP		GVE_RXFLG(6)	/* TCP Packet			*/ +#define	GVE_RXF_UDP		GVE_RXFLG(7)	/* UDP Packet			*/ +#define	GVE_RXF_ERR		GVE_RXFLG(8)	/* Packet Error Detected	*/ +#define	GVE_RXF_PKT_CONT	GVE_RXFLG(10)	/* Multi Fragment RX packet	*/  /* GVE IRQ */  #define GVE_IRQ_ACK	BIT(31) diff --git a/drivers/net/ethernet/google/gve/gve_desc_dqo.h b/drivers/net/ethernet/google/gve/gve_desc_dqo.h new file mode 100644 index 000000000000..e8fe9adef7f2 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_desc_dqo.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +/* GVE DQO Descriptor formats */ + +#ifndef _GVE_DESC_DQO_H_ +#define _GVE_DESC_DQO_H_ + +#include <linux/build_bug.h> + +#define GVE_TX_MAX_HDR_SIZE_DQO 255 +#define GVE_TX_MIN_TSO_MSS_DQO 88 + +#ifndef __LITTLE_ENDIAN_BITFIELD +#error "Only little endian supported" +#endif + +/* Basic TX descriptor (DTYPE 0x0C) */ +struct gve_tx_pkt_desc_dqo { +	__le64 buf_addr; + +	/* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */ +	u8 dtype: 5; + +	/* Denotes the last descriptor of a packet. */ +	u8 end_of_packet: 1; +	u8 checksum_offload_enable: 1; + +	/* If set, will generate a descriptor completion for this descriptor. */ +	u8 report_event: 1; +	u8 reserved0; +	__le16 reserved1; + +	/* The TX completion associated with this packet will contain this tag. +	 */ +	__le16 compl_tag; +	u16 buf_size: 14; +	u16 reserved2: 2; +} __packed; +static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16); + +#define GVE_TX_PKT_DESC_DTYPE_DQO 0xc +#define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1) + +/* Maximum number of data descriptors allowed per packet, or per-TSO segment. */ +#define GVE_TX_MAX_DATA_DESCS 10 + +/* Min gap between tail and head to avoid cacheline overlap */ +#define GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP 4 + +/* "report_event" on TX packet descriptors may only be reported on the last + * descriptor of a TX packet, and they must be spaced apart with at least this + * value. + */ +#define GVE_TX_MIN_RE_INTERVAL 32 + +struct gve_tx_context_cmd_dtype { +	u8 dtype: 5; +	u8 tso: 1; +	u8 reserved1: 2; + +	u8 reserved2; +}; + +static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2); + +/* TX Native TSO Context DTYPE (0x05) + * + * "flex" fields allow the driver to send additional packet context to HW. + */ +struct gve_tx_tso_context_desc_dqo { +	/* The L4 payload bytes that should be segmented. */ +	u32 tso_total_len: 24; +	u32 flex10: 8; + +	/* Max segment size in TSO excluding headers. 
*/ +	u16 mss: 14; +	u16 reserved: 2; + +	u8 header_len; /* Header length to use for TSO offload */ +	u8 flex11; +	struct gve_tx_context_cmd_dtype cmd_dtype; +	u8 flex0; +	u8 flex5; +	u8 flex6; +	u8 flex7; +	u8 flex8; +	u8 flex9; +} __packed; +static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16); + +#define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5 + +/* General context descriptor for sending metadata. */ +struct gve_tx_general_context_desc_dqo { +	u8 flex4; +	u8 flex5; +	u8 flex6; +	u8 flex7; +	u8 flex8; +	u8 flex9; +	u8 flex10; +	u8 flex11; +	struct gve_tx_context_cmd_dtype cmd_dtype; +	u16 reserved; +	u8 flex0; +	u8 flex1; +	u8 flex2; +	u8 flex3; +} __packed; +static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16); + +#define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4 + +/* Logical structure of metadata which is packed into context descriptor flex + * fields. + */ +struct gve_tx_metadata_dqo { +	union { +		struct { +			u8 version; + +			/* If `skb->l4_hash` is set, this value should be +			 * derived from `skb->hash`. +			 * +			 * A zero value means no l4_hash was associated with the +			 * skb. +			 */ +			u16 path_hash: 15; + +			/* Should be set to 1 if the flow associated with the +			 * skb had a rehash from the TCP stack. +			 */ +			u16 rehash_event: 1; +		}  __packed; +		u8 bytes[12]; +	}; +}  __packed; +static_assert(sizeof(struct gve_tx_metadata_dqo) == 12); + +#define GVE_TX_METADATA_VERSION_DQO 0 + +/* TX completion descriptor */ +struct gve_tx_compl_desc { +	/* For types 0-4 this is the TX queue ID associated with this +	 * completion. +	 */ +	u16 id: 11; + +	/* See: GVE_COMPL_TYPE_DQO* */ +	u16 type: 3; +	u16 reserved0: 1; + +	/* Flipped by HW to notify the descriptor is populated. */ +	u16 generation: 1; +	union { +		/* For descriptor completions, this is the last index fetched +		 * by HW + 1. +		 */ +		__le16 tx_head; + +		/* For packet completions, this is the completion tag set on the +		 * TX packet descriptors. +		 */ +		__le16 completion_tag; +	}; +	__le32 reserved1; +} __packed; +static_assert(sizeof(struct gve_tx_compl_desc) == 8); + +#define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */ +#define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */ +#define GVE_COMPL_TYPE_DQO_MISS 0x1 /* Miss path completion */ +#define GVE_COMPL_TYPE_DQO_REINJECTION 0x3 /* Re-injection completion */ + +/* Descriptor to post buffers to HW on buffer queue. */ +struct gve_rx_desc_dqo { +	__le16 buf_id; /* ID returned in Rx completion descriptor */ +	__le16 reserved0; +	__le32 reserved1; +	__le64 buf_addr; /* DMA address of the buffer */ +	__le64 header_buf_addr; +	__le64 reserved2; +} __packed; +static_assert(sizeof(struct gve_rx_desc_dqo) == 32); + +/* Descriptor for HW to notify SW of new packets received on RX queue. */ +struct gve_rx_compl_desc_dqo { +	/* Must be 1 */ +	u8 rxdid: 4; +	u8 reserved0: 4; + +	/* Packet originated from this system rather than the network. */ +	u8 loopback: 1; +	/* Set when IPv6 packet contains a destination options header or routing +	 * header. +	 */ +	u8 ipv6_ex_add: 1; +	/* Invalid packet was received. */ +	u8 rx_error: 1; +	u8 reserved1: 5; + +	u16 packet_type: 10; +	u16 ip_hdr_err: 1; +	u16 udp_len_err: 1; +	u16 raw_cs_invalid: 1; +	u16 reserved2: 3; + +	u16 packet_len: 14; +	/* Flipped by HW to notify the descriptor is populated. */ +	u16 generation: 1; +	/* Should be zero. 
*/ +	u16 buffer_queue_id: 1; + +	u16 header_len: 10; +	u16 rsc: 1; +	u16 split_header: 1; +	u16 reserved3: 4; + +	u8 descriptor_done: 1; +	u8 end_of_packet: 1; +	u8 header_buffer_overflow: 1; +	u8 l3_l4_processed: 1; +	u8 csum_ip_err: 1; +	u8 csum_l4_err: 1; +	u8 csum_external_ip_err: 1; +	u8 csum_external_udp_err: 1; + +	u8 status_error1; + +	__le16 reserved5; +	__le16 buf_id; /* Buffer ID which was sent on the buffer queue. */ + +	union { +		/* Packet checksum. */ +		__le16 raw_cs; +		/* Segment length for RSC packets. */ +		__le16 rsc_seg_len; +	}; +	__le32 hash; +	__le32 reserved6; +	__le64 reserved7; +} __packed; + +static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32); + +/* Ringing the doorbell too often can hurt performance. + * + * HW requires this value to be at least 8. + */ +#define GVE_RX_BUF_THRESH_DQO 32 + +#endif /* _GVE_DESC_DQO_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h new file mode 100644 index 000000000000..1eb4d5fd8561 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_dqo.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#ifndef _GVE_DQO_H_ +#define _GVE_DQO_H_ + +#include "gve_adminq.h" + +#define GVE_ITR_ENABLE_BIT_DQO BIT(0) +#define GVE_ITR_CLEAR_PBA_BIT_DQO BIT(1) +#define GVE_ITR_NO_UPDATE_DQO (3 << 3) + +#define GVE_ITR_INTERVAL_DQO_SHIFT 5 +#define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1) + +#define GVE_TX_IRQ_RATELIMIT_US_DQO 50 +#define GVE_RX_IRQ_RATELIMIT_US_DQO 20 +#define GVE_MAX_ITR_INTERVAL_DQO (GVE_ITR_INTERVAL_DQO_MASK * 2) + +/* Timeout in seconds to wait for a reinjection completion after receiving + * its corresponding miss completion. + */ +#define GVE_REINJECT_COMPL_TIMEOUT 1 + +/* Timeout in seconds to deallocate the completion tag for a packet that was + * prematurely freed for not receiving a valid completion. This should be large + * enough to rule out the possibility of receiving the corresponding valid + * completion after this interval. + */ +#define GVE_DEALLOCATE_COMPL_TIMEOUT 60 + +netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev); +bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean); +int gve_rx_poll_dqo(struct gve_notify_block *block, int budget); +int gve_tx_alloc_rings_dqo(struct gve_priv *priv); +void gve_tx_free_rings_dqo(struct gve_priv *priv); +int gve_rx_alloc_rings_dqo(struct gve_priv *priv); +void gve_rx_free_rings_dqo(struct gve_priv *priv); +int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, +			  struct napi_struct *napi); +void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx); +void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx); + +static inline void +gve_tx_put_doorbell_dqo(const struct gve_priv *priv, +			const struct gve_queue_resources *q_resources, u32 val) +{ +	u64 index; + +	index = be32_to_cpu(q_resources->db_index); +	iowrite32(val, &priv->db_bar2[index]); +} + +/* Builds register value to write to DQO IRQ doorbell to enable with specified + * ITR interval. + */ +static inline u32 gve_setup_itr_interval_dqo(u32 interval_us) +{ +	u32 result = GVE_ITR_ENABLE_BIT_DQO; + +	/* Interval has 2us granularity. 
*/ +	interval_us >>= 1; + +	interval_us &= GVE_ITR_INTERVAL_DQO_MASK; +	result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT); + +	return result; +} + +static inline void +gve_write_irq_doorbell_dqo(const struct gve_priv *priv, +			   const struct gve_notify_block *block, u32 val) +{ +	u32 index = be32_to_cpu(*block->irq_db_index); + +	iowrite32(val, &priv->db_bar2[index]); +} + +/* Sets interrupt throttling interval and enables interrupt + * by writing to IRQ doorbell. + */ +static inline void +gve_set_itr_coalesce_usecs_dqo(struct gve_priv *priv, +			       struct gve_notify_block *block, +			       u32 usecs) +{ +	gve_write_irq_doorbell_dqo(priv, block, +				   gve_setup_itr_interval_dqo(usecs)); +} +#endif /* _GVE_DQO_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index d8fa816f4473..7b9a2d9d9624 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -1,20 +1,23 @@  // SPDX-License-Identifier: (GPL-2.0 OR MIT)  /* Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   */ +#include <linux/ethtool.h>  #include <linux/rtnetlink.h>  #include "gve.h" +#include "gve_adminq.h" +#include "gve_dqo.h"  static void gve_get_drvinfo(struct net_device *netdev,  			    struct ethtool_drvinfo *info)  {  	struct gve_priv *priv = netdev_priv(netdev); -	strlcpy(info->driver, "gve", sizeof(info->driver)); -	strlcpy(info->version, gve_version_str, sizeof(info->version)); -	strlcpy(info->bus_info, pci_name(priv->pdev), sizeof(info->bus_info)); +	strscpy(info->driver, "gve", sizeof(info->driver)); +	strscpy(info->version, gve_version_str, sizeof(info->version)); +	strscpy(info->bus_info, pci_name(priv->pdev), sizeof(info->bus_info));  }  static void gve_set_msglevel(struct net_device *netdev, u32 value) @@ -34,41 +37,86 @@ static u32 gve_get_msglevel(struct net_device *netdev)  static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = {  	"rx_packets", "tx_packets", "rx_bytes", "tx_bytes",  	"rx_dropped", "tx_dropped", "tx_timeouts", +	"rx_skb_alloc_fail", "rx_buf_alloc_fail", "rx_desc_err_dropped_pkt", +	"interface_up_cnt", "interface_down_cnt", "reset_cnt", +	"page_alloc_fail", "dma_mapping_error", "stats_report_trigger_cnt", +}; + +static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = { +	"rx_posted_desc[%u]", "rx_completed_desc[%u]", "rx_consumed_desc[%u]", "rx_bytes[%u]", +	"rx_cont_packet_cnt[%u]", "rx_frag_flip_cnt[%u]", "rx_frag_copy_cnt[%u]", +	"rx_dropped_pkt[%u]", "rx_copybreak_pkt[%u]", "rx_copied_pkt[%u]", +	"rx_queue_drop_cnt[%u]", "rx_no_buffers_posted[%u]", +	"rx_drops_packet_over_mru[%u]", "rx_drops_invalid_checksum[%u]", +}; + +static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { +	"tx_posted_desc[%u]", "tx_completed_desc[%u]", "tx_consumed_desc[%u]", "tx_bytes[%u]", +	"tx_wake[%u]", "tx_stop[%u]", "tx_event_counter[%u]", +	"tx_dma_mapping_error[%u]", +}; + +static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { +	"adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts", +	"adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt", +	"adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt", +	"adminq_create_tx_queue_cnt", "adminq_create_rx_queue_cnt", +	"adminq_destroy_tx_queue_cnt", "adminq_destroy_rx_queue_cnt", +	"adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", +	"adminq_report_stats_cnt", "adminq_report_link_speed_cnt" +}; + 
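(Illustrative aside, not part of the patch.) The per-queue string tables above are printf-style templates; the patch expands them once per queue in gve_get_strings() with snprintf(), and gve_get_sset_count() sizes ETH_SS_STATS as GVE_MAIN_STATS_LEN + GVE_ADMINQ_STATS_LEN plus the per-queue counts. A minimal standalone C sketch of that expansion is below; ETH_GSTRING_LEN matches the uapi <linux/ethtool.h> value, while the template subset and NUM_RX_QUEUES are made-up example values, not taken from a live device.

#include <stdio.h>

#define ETH_GSTRING_LEN 32	/* same value as ETH_GSTRING_LEN in <linux/ethtool.h> */
#define NUM_RX_QUEUES	2	/* hypothetical rx_cfg.num_queues, for the example only */

static const char rx_templates[][ETH_GSTRING_LEN] = {
	"rx_posted_desc[%u]", "rx_completed_desc[%u]", "rx_bytes[%u]",
};

int main(void)
{
	char name[ETH_GSTRING_LEN];
	unsigned int q, t;

	/* Emit one copy of every template per RX queue, substituting the
	 * queue index, the same way gve_get_strings() builds the RX (and,
	 * analogously, TX) stat names.
	 */
	for (q = 0; q < NUM_RX_QUEUES; q++) {
		for (t = 0; t < sizeof(rx_templates) / sizeof(rx_templates[0]); t++) {
			snprintf(name, sizeof(name), rx_templates[t], q);
			printf("%s\n", name);
		}
	}
	return 0;
}

With two queues this prints rx_posted_desc[0] through rx_bytes[1], one block per queue, which is why the ETH_SS_STATS count in gve_get_sset_count() scales with rx_cfg.num_queues and tx_cfg.num_queues.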
+static const char gve_gstrings_priv_flags[][ETH_GSTRING_LEN] = { +	"report-stats",  };  #define GVE_MAIN_STATS_LEN  ARRAY_SIZE(gve_gstrings_main_stats) -#define NUM_GVE_TX_CNTS	5 -#define NUM_GVE_RX_CNTS	2 +#define GVE_ADMINQ_STATS_LEN  ARRAY_SIZE(gve_gstrings_adminq_stats) +#define NUM_GVE_TX_CNTS	ARRAY_SIZE(gve_gstrings_tx_stats) +#define NUM_GVE_RX_CNTS	ARRAY_SIZE(gve_gstrings_rx_stats) +#define GVE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(gve_gstrings_priv_flags)  static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data)  {  	struct gve_priv *priv = netdev_priv(netdev);  	char *s = (char *)data; -	int i; +	int i, j; -	if (stringset != ETH_SS_STATS) -		return; +	switch (stringset) { +	case ETH_SS_STATS: +		memcpy(s, *gve_gstrings_main_stats, +		       sizeof(gve_gstrings_main_stats)); +		s += sizeof(gve_gstrings_main_stats); + +		for (i = 0; i < priv->rx_cfg.num_queues; i++) { +			for (j = 0; j < NUM_GVE_RX_CNTS; j++) { +				snprintf(s, ETH_GSTRING_LEN, +					 gve_gstrings_rx_stats[j], i); +				s += ETH_GSTRING_LEN; +			} +		} + +		for (i = 0; i < priv->tx_cfg.num_queues; i++) { +			for (j = 0; j < NUM_GVE_TX_CNTS; j++) { +				snprintf(s, ETH_GSTRING_LEN, +					 gve_gstrings_tx_stats[j], i); +				s += ETH_GSTRING_LEN; +			} +		} + +		memcpy(s, *gve_gstrings_adminq_stats, +		       sizeof(gve_gstrings_adminq_stats)); +		s += sizeof(gve_gstrings_adminq_stats); +		break; + +	case ETH_SS_PRIV_FLAGS: +		memcpy(s, *gve_gstrings_priv_flags, +		       sizeof(gve_gstrings_priv_flags)); +		s += sizeof(gve_gstrings_priv_flags); +		break; -	memcpy(s, *gve_gstrings_main_stats, -	       sizeof(gve_gstrings_main_stats)); -	s += sizeof(gve_gstrings_main_stats); -	for (i = 0; i < priv->rx_cfg.num_queues; i++) { -		snprintf(s, ETH_GSTRING_LEN, "rx_desc_cnt[%u]", i); -		s += ETH_GSTRING_LEN; -		snprintf(s, ETH_GSTRING_LEN, "rx_desc_fill_cnt[%u]", i); -		s += ETH_GSTRING_LEN; -	} -	for (i = 0; i < priv->tx_cfg.num_queues; i++) { -		snprintf(s, ETH_GSTRING_LEN, "tx_req[%u]", i); -		s += ETH_GSTRING_LEN; -		snprintf(s, ETH_GSTRING_LEN, "tx_done[%u]", i); -		s += ETH_GSTRING_LEN; -		snprintf(s, ETH_GSTRING_LEN, "tx_wake[%u]", i); -		s += ETH_GSTRING_LEN; -		snprintf(s, ETH_GSTRING_LEN, "tx_stop[%u]", i); -		s += ETH_GSTRING_LEN; -		snprintf(s, ETH_GSTRING_LEN, "tx_event_counter[%u]", i); -		s += ETH_GSTRING_LEN; +	default: +		break;  	}  } @@ -78,9 +126,11 @@ static int gve_get_sset_count(struct net_device *netdev, int sset)  	switch (sset) {  	case ETH_SS_STATS: -		return GVE_MAIN_STATS_LEN + +		return GVE_MAIN_STATS_LEN + GVE_ADMINQ_STATS_LEN +  		       (priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS) +  		       (priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS); +	case ETH_SS_PRIV_FLAGS: +		return GVE_PRIV_FLAGS_STR_LEN;  	default:  		return -EOPNOTSUPP;  	} @@ -90,36 +140,72 @@ static void  gve_get_ethtool_stats(struct net_device *netdev,  		      struct ethtool_stats *stats, u64 *data)  { -	struct gve_priv *priv = netdev_priv(netdev); -	u64 rx_pkts, rx_bytes, tx_pkts, tx_bytes; +	u64 tmp_rx_pkts, tmp_rx_bytes, tmp_rx_skb_alloc_fail, +		tmp_rx_buf_alloc_fail, tmp_rx_desc_err_dropped_pkt, +		tmp_tx_pkts, tmp_tx_bytes; +	u64 rx_buf_alloc_fail, rx_desc_err_dropped_pkt, rx_pkts, +		rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, tx_dropped; +	int stats_idx, base_stats_idx, max_stats_idx; +	struct stats *report_stats; +	int *rx_qid_to_stats_idx; +	int *tx_qid_to_stats_idx; +	struct gve_priv *priv; +	bool skip_nic_stats;  	unsigned int start;  	int ring; -	int i; +	int i, j;  	ASSERT_RTNL(); -	for (rx_pkts = 
0, rx_bytes = 0, ring = 0; +	priv = netdev_priv(netdev); +	report_stats = priv->stats_report->stats; +	rx_qid_to_stats_idx = kmalloc_array(priv->rx_cfg.num_queues, +					    sizeof(int), GFP_KERNEL); +	if (!rx_qid_to_stats_idx) +		return; +	tx_qid_to_stats_idx = kmalloc_array(priv->tx_cfg.num_queues, +					    sizeof(int), GFP_KERNEL); +	if (!tx_qid_to_stats_idx) { +		kfree(rx_qid_to_stats_idx); +		return; +	} +	for (rx_pkts = 0, rx_bytes = 0, rx_skb_alloc_fail = 0, +	     rx_buf_alloc_fail = 0, rx_desc_err_dropped_pkt = 0, ring = 0;  	     ring < priv->rx_cfg.num_queues; ring++) {  		if (priv->rx) {  			do { +				struct gve_rx_ring *rx = &priv->rx[ring]; +  				start = -				  u64_stats_fetch_begin(&priv->rx[ring].statss); -				rx_pkts += priv->rx[ring].rpackets; -				rx_bytes += priv->rx[ring].rbytes; -			} while (u64_stats_fetch_retry(&priv->rx[ring].statss, +				  u64_stats_fetch_begin_irq(&priv->rx[ring].statss); +				tmp_rx_pkts = rx->rpackets; +				tmp_rx_bytes = rx->rbytes; +				tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; +				tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; +				tmp_rx_desc_err_dropped_pkt = +					rx->rx_desc_err_dropped_pkt; +			} while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,  						       start)); +			rx_pkts += tmp_rx_pkts; +			rx_bytes += tmp_rx_bytes; +			rx_skb_alloc_fail += tmp_rx_skb_alloc_fail; +			rx_buf_alloc_fail += tmp_rx_buf_alloc_fail; +			rx_desc_err_dropped_pkt += tmp_rx_desc_err_dropped_pkt;  		}  	} -	for (tx_pkts = 0, tx_bytes = 0, ring = 0; +	for (tx_pkts = 0, tx_bytes = 0, tx_dropped = 0, ring = 0;  	     ring < priv->tx_cfg.num_queues; ring++) {  		if (priv->tx) {  			do {  				start = -				  u64_stats_fetch_begin(&priv->tx[ring].statss); -				tx_pkts += priv->tx[ring].pkt_done; -				tx_bytes += priv->tx[ring].bytes_done; -			} while (u64_stats_fetch_retry(&priv->tx[ring].statss, +				  u64_stats_fetch_begin_irq(&priv->tx[ring].statss); +				tmp_tx_pkts = priv->tx[ring].pkt_done; +				tmp_tx_bytes = priv->tx[ring].bytes_done; +			} while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,  						       start)); +			tx_pkts += tmp_tx_pkts; +			tx_bytes += tmp_tx_bytes; +			tx_dropped += priv->tx[ring].dropped_pkt;  		}  	} @@ -128,37 +214,166 @@ gve_get_ethtool_stats(struct net_device *netdev,  	data[i++] = tx_pkts;  	data[i++] = rx_bytes;  	data[i++] = tx_bytes; -	/* Skip rx_dropped and tx_dropped */ -	i += 2; +	/* total rx dropped packets */ +	data[i++] = rx_skb_alloc_fail + rx_buf_alloc_fail + +		    rx_desc_err_dropped_pkt; +	data[i++] = tx_dropped;  	data[i++] = priv->tx_timeo_cnt; +	data[i++] = rx_skb_alloc_fail; +	data[i++] = rx_buf_alloc_fail; +	data[i++] = rx_desc_err_dropped_pkt; +	data[i++] = priv->interface_up_cnt; +	data[i++] = priv->interface_down_cnt; +	data[i++] = priv->reset_cnt; +	data[i++] = priv->page_alloc_fail; +	data[i++] = priv->dma_mapping_error; +	data[i++] = priv->stats_report_trigger_cnt;  	i = GVE_MAIN_STATS_LEN; +	/* For rx cross-reporting stats, start from nic rx stats in report */ +	base_stats_idx = GVE_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues + +		GVE_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues; +	max_stats_idx = NIC_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues + +		base_stats_idx; +	/* Preprocess the stats report for rx, map queue id to start index */ +	skip_nic_stats = false; +	for (stats_idx = base_stats_idx; stats_idx < max_stats_idx; +		stats_idx += NIC_RX_STATS_REPORT_NUM) { +		u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name); +		u32 queue_id = 
be32_to_cpu(report_stats[stats_idx].queue_id); + +		if (stat_name == 0) { +			/* no stats written by NIC yet */ +			skip_nic_stats = true; +			break; +		} +		rx_qid_to_stats_idx[queue_id] = stats_idx; +	}  	/* walk RX rings */  	if (priv->rx) {  		for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {  			struct gve_rx_ring *rx = &priv->rx[ring]; -			data[i++] = rx->cnt;  			data[i++] = rx->fill_cnt; +			data[i++] = rx->cnt; +			data[i++] = rx->fill_cnt - rx->cnt; +			do { +				start = +				  u64_stats_fetch_begin_irq(&priv->rx[ring].statss); +				tmp_rx_bytes = rx->rbytes; +				tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; +				tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; +				tmp_rx_desc_err_dropped_pkt = +					rx->rx_desc_err_dropped_pkt; +			} while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, +						       start)); +			data[i++] = tmp_rx_bytes; +			data[i++] = rx->rx_cont_packet_cnt; +			data[i++] = rx->rx_frag_flip_cnt; +			data[i++] = rx->rx_frag_copy_cnt; +			/* rx dropped packets */ +			data[i++] = tmp_rx_skb_alloc_fail + +				tmp_rx_buf_alloc_fail + +				tmp_rx_desc_err_dropped_pkt; +			data[i++] = rx->rx_copybreak_pkt; +			data[i++] = rx->rx_copied_pkt; +			/* stats from NIC */ +			if (skip_nic_stats) { +				/* skip NIC rx stats */ +				i += NIC_RX_STATS_REPORT_NUM; +				continue; +			} +			for (j = 0; j < NIC_RX_STATS_REPORT_NUM; j++) { +				u64 value = +				be64_to_cpu(report_stats[rx_qid_to_stats_idx[ring] + j].value); + +				data[i++] = value; +			}  		}  	} else {  		i += priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS;  	} + +	/* For tx cross-reporting stats, start from nic tx stats in report */ +	base_stats_idx = max_stats_idx; +	max_stats_idx = NIC_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues + +		max_stats_idx; +	/* Preprocess the stats report for tx, map queue id to start index */ +	skip_nic_stats = false; +	for (stats_idx = base_stats_idx; stats_idx < max_stats_idx; +		stats_idx += NIC_TX_STATS_REPORT_NUM) { +		u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name); +		u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id); + +		if (stat_name == 0) { +			/* no stats written by NIC yet */ +			skip_nic_stats = true; +			break; +		} +		tx_qid_to_stats_idx[queue_id] = stats_idx; +	}  	/* walk TX rings */  	if (priv->tx) {  		for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {  			struct gve_tx_ring *tx = &priv->tx[ring]; -			data[i++] = tx->req; -			data[i++] = tx->done; +			if (gve_is_gqi(priv)) { +				data[i++] = tx->req; +				data[i++] = tx->done; +				data[i++] = tx->req - tx->done; +			} else { +				/* DQO doesn't currently support +				 * posted/completed descriptor counts; +				 */ +				data[i++] = 0; +				data[i++] = 0; +				data[i++] = tx->dqo_tx.tail - tx->dqo_tx.head; +			} +			do { +				start = +				  u64_stats_fetch_begin_irq(&priv->tx[ring].statss); +				tmp_tx_bytes = tx->bytes_done; +			} while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, +						       start)); +			data[i++] = tmp_tx_bytes;  			data[i++] = tx->wake_queue;  			data[i++] = tx->stop_queue; -			data[i++] = be32_to_cpu(gve_tx_load_event_counter(priv, -									  tx)); +			data[i++] = gve_tx_load_event_counter(priv, tx); +			data[i++] = tx->dma_mapping_error; +			/* stats from NIC */ +			if (skip_nic_stats) { +				/* skip NIC tx stats */ +				i += NIC_TX_STATS_REPORT_NUM; +				continue; +			} +			for (j = 0; j < NIC_TX_STATS_REPORT_NUM; j++) { +				u64 value = +				be64_to_cpu(report_stats[tx_qid_to_stats_idx[ring] + j].value); +				data[i++] = value; +			}  		}  	
} else {  		i += priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS;  	} + +	kfree(rx_qid_to_stats_idx); +	kfree(tx_qid_to_stats_idx); +	/* AQ Stats */ +	data[i++] = priv->adminq_prod_cnt; +	data[i++] = priv->adminq_cmd_fail; +	data[i++] = priv->adminq_timeouts; +	data[i++] = priv->adminq_describe_device_cnt; +	data[i++] = priv->adminq_cfg_device_resources_cnt; +	data[i++] = priv->adminq_register_page_list_cnt; +	data[i++] = priv->adminq_unregister_page_list_cnt; +	data[i++] = priv->adminq_create_tx_queue_cnt; +	data[i++] = priv->adminq_create_rx_queue_cnt; +	data[i++] = priv->adminq_destroy_tx_queue_cnt; +	data[i++] = priv->adminq_destroy_rx_queue_cnt; +	data[i++] = priv->adminq_dcfg_device_resources_cnt; +	data[i++] = priv->adminq_set_driver_parameter_cnt; +	data[i++] = priv->adminq_report_stats_cnt; +	data[i++] = priv->adminq_report_link_speed_cnt;  }  static void gve_get_channels(struct net_device *netdev, @@ -188,7 +403,7 @@ static int gve_set_channels(struct net_device *netdev,  	gve_get_channels(netdev, &old_settings); -	/* Changing combined is not allowed allowed */ +	/* Changing combined is not allowed */  	if (cmd->combined_count != old_settings.combined_count)  		return -EINVAL; @@ -208,7 +423,9 @@ static int gve_set_channels(struct net_device *netdev,  }  static void gve_get_ringparam(struct net_device *netdev, -			      struct ethtool_ringparam *cmd) +			      struct ethtool_ringparam *cmd, +			      struct kernel_ethtool_ringparam *kernel_cmd, +			      struct netlink_ext_ack *extack)  {  	struct gve_priv *priv = netdev_priv(netdev); @@ -230,7 +447,159 @@ static int gve_user_reset(struct net_device *netdev, u32 *flags)  	return -EOPNOTSUPP;  } +static int gve_get_tunable(struct net_device *netdev, +			   const struct ethtool_tunable *etuna, void *value) +{ +	struct gve_priv *priv = netdev_priv(netdev); + +	switch (etuna->id) { +	case ETHTOOL_RX_COPYBREAK: +		*(u32 *)value = priv->rx_copybreak; +		return 0; +	default: +		return -EOPNOTSUPP; +	} +} + +static int gve_set_tunable(struct net_device *netdev, +			   const struct ethtool_tunable *etuna, +			   const void *value) +{ +	struct gve_priv *priv = netdev_priv(netdev); +	u32 len; + +	switch (etuna->id) { +	case ETHTOOL_RX_COPYBREAK: +	{ +		u32 max_copybreak = gve_is_gqi(priv) ? +			(PAGE_SIZE / 2) : priv->data_buffer_size_dqo; + +		len = *(u32 *)value; +		if (len > max_copybreak) +			return -EINVAL; +		priv->rx_copybreak = len; +		return 0; +	} +	default: +		return -EOPNOTSUPP; +	} +} + +static u32 gve_get_priv_flags(struct net_device *netdev) +{ +	struct gve_priv *priv = netdev_priv(netdev); +	u32 ret_flags = 0; + +	/* Only 1 flag exists currently: report-stats (BIT(O)), so set that flag. */ +	if (priv->ethtool_flags & BIT(0)) +		ret_flags |= BIT(0); +	return ret_flags; +} + +static int gve_set_priv_flags(struct net_device *netdev, u32 flags) +{ +	struct gve_priv *priv = netdev_priv(netdev); +	u64 ori_flags, new_flags; + +	ori_flags = READ_ONCE(priv->ethtool_flags); +	new_flags = ori_flags; + +	/* Only one priv flag exists: report-stats (BIT(0))*/ +	if (flags & BIT(0)) +		new_flags |= BIT(0); +	else +		new_flags &= ~(BIT(0)); +	priv->ethtool_flags = new_flags; +	/* start report-stats timer when user turns report stats on. */ +	if (flags & BIT(0)) { +		mod_timer(&priv->stats_report_timer, +			  round_jiffies(jiffies + +					msecs_to_jiffies(priv->stats_report_timer_period))); +	} +	/* Zero off gve stats when report-stats turned off and */ +	/* delete report stats timer. 
*/ +	if (!(flags & BIT(0)) && (ori_flags & BIT(0))) { +		int tx_stats_num = GVE_TX_STATS_REPORT_NUM * +			priv->tx_cfg.num_queues; +		int rx_stats_num = GVE_RX_STATS_REPORT_NUM * +			priv->rx_cfg.num_queues; + +		memset(priv->stats_report->stats, 0, (tx_stats_num + rx_stats_num) * +				   sizeof(struct stats)); +		del_timer_sync(&priv->stats_report_timer); +	} +	return 0; +} + +static int gve_get_link_ksettings(struct net_device *netdev, +				  struct ethtool_link_ksettings *cmd) +{ +	struct gve_priv *priv = netdev_priv(netdev); +	int err = gve_adminq_report_link_speed(priv); + +	cmd->base.speed = priv->link_speed; +	return err; +} + +static int gve_get_coalesce(struct net_device *netdev, +			    struct ethtool_coalesce *ec, +			    struct kernel_ethtool_coalesce *kernel_ec, +			    struct netlink_ext_ack *extack) +{ +	struct gve_priv *priv = netdev_priv(netdev); + +	if (gve_is_gqi(priv)) +		return -EOPNOTSUPP; +	ec->tx_coalesce_usecs = priv->tx_coalesce_usecs; +	ec->rx_coalesce_usecs = priv->rx_coalesce_usecs; + +	return 0; +} + +static int gve_set_coalesce(struct net_device *netdev, +			    struct ethtool_coalesce *ec, +			    struct kernel_ethtool_coalesce *kernel_ec, +			    struct netlink_ext_ack *extack) +{ +	struct gve_priv *priv = netdev_priv(netdev); +	u32 tx_usecs_orig = priv->tx_coalesce_usecs; +	u32 rx_usecs_orig = priv->rx_coalesce_usecs; +	int idx; + +	if (gve_is_gqi(priv)) +		return -EOPNOTSUPP; + +	if (ec->tx_coalesce_usecs > GVE_MAX_ITR_INTERVAL_DQO || +	    ec->rx_coalesce_usecs > GVE_MAX_ITR_INTERVAL_DQO) +		return -EINVAL; +	priv->tx_coalesce_usecs = ec->tx_coalesce_usecs; +	priv->rx_coalesce_usecs = ec->rx_coalesce_usecs; + +	if (tx_usecs_orig != priv->tx_coalesce_usecs) { +		for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { +			int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); +			struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + +			gve_set_itr_coalesce_usecs_dqo(priv, block, +						       priv->tx_coalesce_usecs); +		} +	} + +	if (rx_usecs_orig != priv->rx_coalesce_usecs) { +		for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { +			int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); +			struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + +			gve_set_itr_coalesce_usecs_dqo(priv, block, +						       priv->rx_coalesce_usecs); +		} +	} + +	return 0; +} +  const struct ethtool_ops gve_ethtool_ops = { +	.supported_coalesce_params = ETHTOOL_COALESCE_USECS,  	.get_drvinfo = gve_get_drvinfo,  	.get_strings = gve_get_strings,  	.get_sset_count = gve_get_sset_count, @@ -240,6 +609,13 @@ const struct ethtool_ops gve_ethtool_ops = {  	.set_channels = gve_set_channels,  	.get_channels = gve_get_channels,  	.get_link = ethtool_op_get_link, +	.get_coalesce = gve_get_coalesce, +	.set_coalesce = gve_set_coalesce,  	.get_ringparam = gve_get_ringparam,  	.reset = gve_user_reset, +	.get_tunable = gve_get_tunable, +	.set_tunable = gve_set_tunable, +	.get_priv_flags = gve_get_priv_flags, +	.set_priv_flags = gve_set_priv_flags, +	.get_link_ksettings = gve_get_link_ksettings  }; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e032563ceefd..d3e3ac242bfc 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1,7 +1,7 @@  // SPDX-License-Identifier: (GPL-2.0 OR MIT)  /* Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   
*/  #include <linux/cpumask.h> @@ -14,6 +14,7 @@  #include <linux/workqueue.h>  #include <net/sch_generic.h>  #include "gve.h" +#include "gve_dqo.h"  #include "gve_adminq.h"  #include "gve_register.h" @@ -23,35 +24,53 @@  #define GVE_VERSION		"1.0.0"  #define GVE_VERSION_PREFIX	"GVE-" +// Minimum amount of time between queue kicks in msec (10 seconds) +#define MIN_TX_TIMEOUT_GAP (1000 * 10) +  const char gve_version_str[] = GVE_VERSION;  static const char gve_version_prefix[] = GVE_VERSION_PREFIX; +static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ +	struct gve_priv *priv = netdev_priv(dev); + +	if (gve_is_gqi(priv)) +		return gve_tx(skb, dev); +	else +		return gve_tx_dqo(skb, dev); +} +  static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)  {  	struct gve_priv *priv = netdev_priv(dev);  	unsigned int start; +	u64 packets, bytes;  	int ring;  	if (priv->rx) {  		for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {  			do {  				start = -				  u64_stats_fetch_begin(&priv->rx[ring].statss); -				s->rx_packets += priv->rx[ring].rpackets; -				s->rx_bytes += priv->rx[ring].rbytes; -			} while (u64_stats_fetch_retry(&priv->rx[ring].statss, +				  u64_stats_fetch_begin_irq(&priv->rx[ring].statss); +				packets = priv->rx[ring].rpackets; +				bytes = priv->rx[ring].rbytes; +			} while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,  						       start)); +			s->rx_packets += packets; +			s->rx_bytes += bytes;  		}  	}  	if (priv->tx) {  		for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {  			do {  				start = -				  u64_stats_fetch_begin(&priv->tx[ring].statss); -				s->tx_packets += priv->tx[ring].pkt_done; -				s->tx_bytes += priv->tx[ring].bytes_done; -			} while (u64_stats_fetch_retry(&priv->tx[ring].statss, +				  u64_stats_fetch_begin_irq(&priv->tx[ring].statss); +				packets = priv->tx[ring].pkt_done; +				bytes = priv->tx[ring].bytes_done; +			} while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,  						       start)); +			s->tx_packets += packets; +			s->tx_bytes += bytes;  		}  	}  } @@ -71,6 +90,9 @@ static int gve_alloc_counter_array(struct gve_priv *priv)  static void gve_free_counter_array(struct gve_priv *priv)  { +	if (!priv->counter_array) +		return; +  	dma_free_coherent(&priv->pdev->dev,  			  priv->num_event_counters *  			  sizeof(*priv->counter_array), @@ -78,6 +100,68 @@ static void gve_free_counter_array(struct gve_priv *priv)  	priv->counter_array = NULL;  } +/* NIC requests to report stats */ +static void gve_stats_report_task(struct work_struct *work) +{ +	struct gve_priv *priv = container_of(work, struct gve_priv, +					     stats_report_task); +	if (gve_get_do_report_stats(priv)) { +		gve_handle_report_stats(priv); +		gve_clear_do_report_stats(priv); +	} +} + +static void gve_stats_report_schedule(struct gve_priv *priv) +{ +	if (!gve_get_probe_in_progress(priv) && +	    !gve_get_reset_in_progress(priv)) { +		gve_set_do_report_stats(priv); +		queue_work(priv->gve_wq, &priv->stats_report_task); +	} +} + +static void gve_stats_report_timer(struct timer_list *t) +{ +	struct gve_priv *priv = from_timer(priv, t, stats_report_timer); + +	mod_timer(&priv->stats_report_timer, +		  round_jiffies(jiffies + +		  msecs_to_jiffies(priv->stats_report_timer_period))); +	gve_stats_report_schedule(priv); +} + +static int gve_alloc_stats_report(struct gve_priv *priv) +{ +	int tx_stats_num, rx_stats_num; + +	tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) * +		       priv->tx_cfg.num_queues; +	
rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) * +		       priv->rx_cfg.num_queues; +	priv->stats_report_len = struct_size(priv->stats_report, stats, +					     tx_stats_num + rx_stats_num); +	priv->stats_report = +		dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len, +				   &priv->stats_report_bus, GFP_KERNEL); +	if (!priv->stats_report) +		return -ENOMEM; +	/* Set up timer for the report-stats task */ +	timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0); +	priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD; +	return 0; +} + +static void gve_free_stats_report(struct gve_priv *priv) +{ +	if (!priv->stats_report) +		return; + +	del_timer_sync(&priv->stats_report_timer); +	dma_free_coherent(&priv->pdev->dev, priv->stats_report_len, +			  priv->stats_report, priv->stats_report_bus); +	priv->stats_report = NULL; +} +  static irqreturn_t gve_mgmnt_intr(int irq, void *arg)  {  	struct gve_priv *priv = arg; @@ -96,40 +180,103 @@ static irqreturn_t gve_intr(int irq, void *arg)  	return IRQ_HANDLED;  } +static irqreturn_t gve_intr_dqo(int irq, void *arg) +{ +	struct gve_notify_block *block = arg; + +	/* Interrupts are automatically masked */ +	napi_schedule_irqoff(&block->napi); +	return IRQ_HANDLED; +} +  static int gve_napi_poll(struct napi_struct *napi, int budget)  {  	struct gve_notify_block *block;  	__be32 __iomem *irq_doorbell;  	bool reschedule = false;  	struct gve_priv *priv; +	int work_done = 0;  	block = container_of(napi, struct gve_notify_block, napi);  	priv = block->priv;  	if (block->tx)  		reschedule |= gve_tx_poll(block, budget); -	if (block->rx) -		reschedule |= gve_rx_poll(block, budget); +	if (block->rx) { +		work_done = gve_rx_poll(block, budget); +		reschedule |= work_done == budget; +	}  	if (reschedule)  		return budget; -	napi_complete(napi); -	irq_doorbell = gve_irq_doorbell(priv, block); -	iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell); +       /* Complete processing - don't unmask irq if busy polling is enabled */ +	if (likely(napi_complete_done(napi, work_done))) { +		irq_doorbell = gve_irq_doorbell(priv, block); +		iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell); + +		/* Ensure IRQ ACK is visible before we check pending work. +		 * If queue had issued updates, it would be truly visible. +		 */ +		mb(); + +		if (block->tx) +			reschedule |= gve_tx_clean_pending(priv, block->tx); +		if (block->rx) +			reschedule |= gve_rx_work_pending(block->rx); + +		if (reschedule && napi_reschedule(napi)) +			iowrite32be(GVE_IRQ_MASK, irq_doorbell); +	} +	return work_done; +} -	/* Double check we have no extra work. -	 * Ensure unmask synchronizes with checking for work. +static int gve_napi_poll_dqo(struct napi_struct *napi, int budget) +{ +	struct gve_notify_block *block = +		container_of(napi, struct gve_notify_block, napi); +	struct gve_priv *priv = block->priv; +	bool reschedule = false; +	int work_done = 0; + +	/* Clear PCI MSI-X Pending Bit Array (PBA) +	 * +	 * This bit is set if an interrupt event occurs while the vector is +	 * masked. If this bit is set and we reenable the interrupt, it will +	 * fire again. Since we're just about to poll the queue state, we don't +	 * need it to fire again. +	 * +	 * Under high softirq load, it's possible that the interrupt condition +	 * is triggered twice before we got the chance to process it.  	 
*/ -	dma_rmb(); +	gve_write_irq_doorbell_dqo(priv, block, +				   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_CLEAR_PBA_BIT_DQO); +  	if (block->tx) -		reschedule |= gve_tx_poll(block, -1); -	if (block->rx) -		reschedule |= gve_rx_poll(block, -1); -	if (reschedule && napi_reschedule(napi)) -		iowrite32be(GVE_IRQ_MASK, irq_doorbell); +		reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true); -	return 0; +	if (block->rx) { +		work_done = gve_rx_poll_dqo(block, budget); +		reschedule |= work_done == budget; +	} + +	if (reschedule) +		return budget; + +	if (likely(napi_complete_done(napi, work_done))) { +		/* Enable interrupts again. +		 * +		 * We don't need to repoll afterwards because HW supports the +		 * PCI MSI-X PBA feature. +		 * +		 * Another interrupt would be triggered if a new event came in +		 * since the last one. +		 */ +		gve_write_irq_doorbell_dqo(priv, block, +					   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); +	} + +	return work_done;  }  static int gve_alloc_notify_blocks(struct gve_priv *priv) @@ -141,7 +288,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)  	int i, j;  	int err; -	priv->msix_vectors = kvzalloc(num_vecs_requested * +	priv->msix_vectors = kvcalloc(num_vecs_requested,  				      sizeof(*priv->msix_vectors), GFP_KERNEL);  	if (!priv->msix_vectors)  		return -ENOMEM; @@ -161,6 +308,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)  		int vecs_left = new_num_ntfy_blks % 2;  		priv->num_ntfy_blks = new_num_ntfy_blks; +		priv->mgmt_msix_idx = priv->num_ntfy_blks;  		priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues,  						vecs_per_type);  		priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues, @@ -186,15 +334,23 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)  		dev_err(&priv->pdev->dev, "Did not receive management vector.\n");  		goto abort_with_msix_enabled;  	} -	priv->ntfy_blocks = +	priv->irq_db_indices =  		dma_alloc_coherent(&priv->pdev->dev,  				   priv->num_ntfy_blks * -				   sizeof(*priv->ntfy_blocks), -				   &priv->ntfy_block_bus, GFP_KERNEL); -	if (!priv->ntfy_blocks) { +				   sizeof(*priv->irq_db_indices), +				   &priv->irq_db_indices_bus, GFP_KERNEL); +	if (!priv->irq_db_indices) {  		err = -ENOMEM;  		goto abort_with_mgmt_vector;  	} + +	priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks * +				     sizeof(*priv->ntfy_blocks), GFP_KERNEL); +	if (!priv->ntfy_blocks) { +		err = -ENOMEM; +		goto abort_with_irq_db_indices; +	} +  	/* Setup the other blocks - the first n-1 vectors */  	for (i = 0; i < priv->num_ntfy_blks; i++) {  		struct gve_notify_block *block = &priv->ntfy_blocks[i]; @@ -204,7 +360,8 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)  			 name, i);  		block->priv = priv;  		err = request_irq(priv->msix_vectors[msix_idx].vector, -				  gve_intr, 0, block->name, block); +				  gve_is_gqi(priv) ? 
gve_intr : gve_intr_dqo, +				  0, block->name, block);  		if (err) {  			dev_err(&priv->pdev->dev,  				"Failed to receive msix vector %d\n", i); @@ -212,6 +369,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv)  		}  		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,  				      get_cpu_mask(i % active_cpus)); +		block->irq_db_index = &priv->irq_db_indices[i].index;  	}  	return 0;  abort_with_some_ntfy_blocks: @@ -223,10 +381,13 @@ abort_with_some_ntfy_blocks:  				      NULL);  		free_irq(priv->msix_vectors[msix_idx].vector, block);  	} -	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * -			  sizeof(*priv->ntfy_blocks), -			  priv->ntfy_blocks, priv->ntfy_block_bus); +	kvfree(priv->ntfy_blocks);  	priv->ntfy_blocks = NULL; +abort_with_irq_db_indices: +	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * +			  sizeof(*priv->irq_db_indices), +			  priv->irq_db_indices, priv->irq_db_indices_bus); +	priv->irq_db_indices = NULL;  abort_with_mgmt_vector:  	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);  abort_with_msix_enabled: @@ -241,6 +402,9 @@ static void gve_free_notify_blocks(struct gve_priv *priv)  {  	int i; +	if (!priv->msix_vectors) +		return; +  	/* Free the irqs */  	for (i = 0; i < priv->num_ntfy_blks; i++) {  		struct gve_notify_block *block = &priv->ntfy_blocks[i]; @@ -250,11 +414,13 @@ static void gve_free_notify_blocks(struct gve_priv *priv)  				      NULL);  		free_irq(priv->msix_vectors[msix_idx].vector, block);  	} -	dma_free_coherent(&priv->pdev->dev, -			  priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks), -			  priv->ntfy_blocks, priv->ntfy_block_bus); -	priv->ntfy_blocks = NULL;  	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); +	kvfree(priv->ntfy_blocks); +	priv->ntfy_blocks = NULL; +	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * +			  sizeof(*priv->irq_db_indices), +			  priv->irq_db_indices, priv->irq_db_indices_bus); +	priv->irq_db_indices = NULL;  	pci_disable_msix(priv->pdev);  	kvfree(priv->msix_vectors);  	priv->msix_vectors = NULL; @@ -270,23 +436,55 @@ static int gve_setup_device_resources(struct gve_priv *priv)  	err = gve_alloc_notify_blocks(priv);  	if (err)  		goto abort_with_counter; +	err = gve_alloc_stats_report(priv); +	if (err) +		goto abort_with_ntfy_blocks;  	err = gve_adminq_configure_device_resources(priv,  						    priv->counter_array_bus,  						    priv->num_event_counters, -						    priv->ntfy_block_bus, +						    priv->irq_db_indices_bus,  						    priv->num_ntfy_blks);  	if (unlikely(err)) {  		dev_err(&priv->pdev->dev,  			"could not setup device_resources: err=%d\n", err);  		err = -ENXIO; -		goto abort_with_ntfy_blocks; +		goto abort_with_stats_report;  	} + +	if (priv->queue_format == GVE_DQO_RDA_FORMAT) { +		priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo), +					       GFP_KERNEL); +		if (!priv->ptype_lut_dqo) { +			err = -ENOMEM; +			goto abort_with_stats_report; +		} +		err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); +		if (err) { +			dev_err(&priv->pdev->dev, +				"Failed to get ptype map: err=%d\n", err); +			goto abort_with_ptype_lut; +		} +	} + +	err = gve_adminq_report_stats(priv, priv->stats_report_len, +				      priv->stats_report_bus, +				      GVE_STATS_REPORT_TIMER_PERIOD); +	if (err) +		dev_err(&priv->pdev->dev, +			"Failed to report stats: err=%d\n", err);  	gve_set_device_resources_ok(priv);  	return 0; + +abort_with_ptype_lut: +	kvfree(priv->ptype_lut_dqo); +	priv->ptype_lut_dqo = NULL; 
+abort_with_stats_report: +	gve_free_stats_report(priv);  abort_with_ntfy_blocks:  	gve_free_notify_blocks(priv);  abort_with_counter:  	gve_free_counter_array(priv); +  	return err;  } @@ -298,6 +496,13 @@ static void gve_teardown_device_resources(struct gve_priv *priv)  	/* Tell device its resources are being freed */  	if (gve_get_device_resources_ok(priv)) { +		/* detach the stats report */ +		err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD); +		if (err) { +			dev_err(&priv->pdev->dev, +				"Failed to detach stats report: err=%d\n", err); +			gve_trigger_reset(priv); +		}  		err = gve_adminq_deconfigure_device_resources(priv);  		if (err) {  			dev_err(&priv->pdev->dev, @@ -306,17 +511,22 @@ static void gve_teardown_device_resources(struct gve_priv *priv)  			gve_trigger_reset(priv);  		}  	} + +	kvfree(priv->ptype_lut_dqo); +	priv->ptype_lut_dqo = NULL; +  	gve_free_counter_array(priv);  	gve_free_notify_blocks(priv); +	gve_free_stats_report(priv);  	gve_clear_device_resources_ok(priv);  } -static void gve_add_napi(struct gve_priv *priv, int ntfy_idx) +static void gve_add_napi(struct gve_priv *priv, int ntfy_idx, +			 int (*gve_poll)(struct napi_struct *, int))  {  	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; -	netif_napi_add(priv->dev, &block->napi, gve_napi_poll, -		       NAPI_POLL_WEIGHT); +	netif_napi_add(priv->dev, &block->napi, gve_poll);  }  static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx) @@ -371,76 +581,118 @@ static int gve_create_rings(struct gve_priv *priv)  	int err;  	int i; -	for (i = 0; i < priv->tx_cfg.num_queues; i++) { -		err = gve_adminq_create_tx_queue(priv, i); -		if (err) { -			netif_err(priv, drv, priv->dev, "failed to create tx queue %d\n", -				  i); -			/* This failure will trigger a reset - no need to clean -			 * up -			 */ -			return err; -		} -		netif_dbg(priv, drv, priv->dev, "created tx queue %d\n", i); +	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); +	if (err) { +		netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n", +			  priv->tx_cfg.num_queues); +		/* This failure will trigger a reset - no need to clean +		 * up +		 */ +		return err;  	} -	for (i = 0; i < priv->rx_cfg.num_queues; i++) { -		err = gve_adminq_create_rx_queue(priv, i); -		if (err) { -			netif_err(priv, drv, priv->dev, "failed to create rx queue %d\n", -				  i); -			/* This failure will trigger a reset - no need to clean -			 * up -			 */ -			return err; -		} -		/* Rx data ring has been prefilled with packet buffers at -		 * queue allocation time. +	netif_dbg(priv, drv, priv->dev, "created %d tx queues\n", +		  priv->tx_cfg.num_queues); + +	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); +	if (err) { +		netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n", +			  priv->rx_cfg.num_queues); +		/* This failure will trigger a reset - no need to clean +		 * up +		 */ +		return err; +	} +	netif_dbg(priv, drv, priv->dev, "created %d rx queues\n", +		  priv->rx_cfg.num_queues); + +	if (gve_is_gqi(priv)) { +		/* Rx data ring has been prefilled with packet buffers at queue +		 * allocation time. +		 *  		 * Write the doorbell to provide descriptor slots and packet  		 * buffers to the NIC.  		 
*/ -		gve_rx_write_doorbell(priv, &priv->rx[i]); -		netif_dbg(priv, drv, priv->dev, "created rx queue %d\n", i); +		for (i = 0; i < priv->rx_cfg.num_queues; i++) +			gve_rx_write_doorbell(priv, &priv->rx[i]); +	} else { +		for (i = 0; i < priv->rx_cfg.num_queues; i++) { +			/* Post buffers and ring doorbell. */ +			gve_rx_post_buffers_dqo(&priv->rx[i]); +		}  	}  	return 0;  } +static void add_napi_init_sync_stats(struct gve_priv *priv, +				     int (*napi_poll)(struct napi_struct *napi, +						      int budget)) +{ +	int i; + +	/* Add tx napi & init sync stats*/ +	for (i = 0; i < priv->tx_cfg.num_queues; i++) { +		int ntfy_idx = gve_tx_idx_to_ntfy(priv, i); + +		u64_stats_init(&priv->tx[i].statss); +		priv->tx[i].ntfy_id = ntfy_idx; +		gve_add_napi(priv, ntfy_idx, napi_poll); +	} +	/* Add rx napi  & init sync stats*/ +	for (i = 0; i < priv->rx_cfg.num_queues; i++) { +		int ntfy_idx = gve_rx_idx_to_ntfy(priv, i); + +		u64_stats_init(&priv->rx[i].statss); +		priv->rx[i].ntfy_id = ntfy_idx; +		gve_add_napi(priv, ntfy_idx, napi_poll); +	} +} + +static void gve_tx_free_rings(struct gve_priv *priv) +{ +	if (gve_is_gqi(priv)) { +		gve_tx_free_rings_gqi(priv); +	} else { +		gve_tx_free_rings_dqo(priv); +	} +} +  static int gve_alloc_rings(struct gve_priv *priv)  { -	int ntfy_idx;  	int err; -	int i;  	/* Setup tx rings */ -	priv->tx = kvzalloc(priv->tx_cfg.num_queues * sizeof(*priv->tx), +	priv->tx = kvcalloc(priv->tx_cfg.num_queues, sizeof(*priv->tx),  			    GFP_KERNEL);  	if (!priv->tx)  		return -ENOMEM; -	err = gve_tx_alloc_rings(priv); + +	if (gve_is_gqi(priv)) +		err = gve_tx_alloc_rings(priv); +	else +		err = gve_tx_alloc_rings_dqo(priv);  	if (err)  		goto free_tx; +  	/* Setup rx rings */ -	priv->rx = kvzalloc(priv->rx_cfg.num_queues * sizeof(*priv->rx), +	priv->rx = kvcalloc(priv->rx_cfg.num_queues, sizeof(*priv->rx),  			    GFP_KERNEL);  	if (!priv->rx) {  		err = -ENOMEM;  		goto free_tx_queue;  	} -	err = gve_rx_alloc_rings(priv); + +	if (gve_is_gqi(priv)) +		err = gve_rx_alloc_rings(priv); +	else +		err = gve_rx_alloc_rings_dqo(priv);  	if (err)  		goto free_rx; -	/* Add tx napi & init sync stats*/ -	for (i = 0; i < priv->tx_cfg.num_queues; i++) { -		u64_stats_init(&priv->tx[i].statss); -		ntfy_idx = gve_tx_idx_to_ntfy(priv, i); -		gve_add_napi(priv, ntfy_idx); -	} -	/* Add rx napi  & init sync stats*/ -	for (i = 0; i < priv->rx_cfg.num_queues; i++) { -		u64_stats_init(&priv->rx[i].statss); -		ntfy_idx = gve_rx_idx_to_ntfy(priv, i); -		gve_add_napi(priv, ntfy_idx); -	} + +	if (gve_is_gqi(priv)) +		add_napi_init_sync_stats(priv, gve_napi_poll); +	else +		add_napi_init_sync_stats(priv, gve_napi_poll_dqo);  	return 0; @@ -458,37 +710,34 @@ free_tx:  static int gve_destroy_rings(struct gve_priv *priv)  {  	int err; -	int i; -	for (i = 0; i < priv->tx_cfg.num_queues; i++) { -		err = gve_adminq_destroy_tx_queue(priv, i); -		if (err) { -			netif_err(priv, drv, priv->dev, -				  "failed to destroy tx queue %d\n", -				  i); -			/* This failure will trigger a reset - no need to clean -			 * up -			 */ -			return err; -		} -		netif_dbg(priv, drv, priv->dev, "destroyed tx queue %d\n", i); +	err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); +	if (err) { +		netif_err(priv, drv, priv->dev, +			  "failed to destroy tx queues\n"); +		/* This failure will trigger a reset - no need to clean up */ +		return err;  	} -	for (i = 0; i < priv->rx_cfg.num_queues; i++) { -		err = gve_adminq_destroy_rx_queue(priv, i); -		if (err) { -			netif_err(priv, drv, priv->dev, -				  "failed to 
destroy rx queue %d\n", -				  i); -			/* This failure will trigger a reset - no need to clean -			 * up -			 */ -			return err; -		} -		netif_dbg(priv, drv, priv->dev, "destroyed rx queue %d\n", i); +	netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n"); +	err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); +	if (err) { +		netif_err(priv, drv, priv->dev, +			  "failed to destroy rx queues\n"); +		/* This failure will trigger a reset - no need to clean up */ +		return err;  	} +	netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");  	return 0;  } +static void gve_rx_free_rings(struct gve_priv *priv) +{ +	if (gve_is_gqi(priv)) +		gve_rx_free_rings_gqi(priv); +	else +		gve_rx_free_rings_dqo(priv); +} +  static void gve_free_rings(struct gve_priv *priv)  {  	int ntfy_idx; @@ -514,14 +763,18 @@ static void gve_free_rings(struct gve_priv *priv)  	}  } -int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma, -		   enum dma_data_direction dir) +int gve_alloc_page(struct gve_priv *priv, struct device *dev, +		   struct page **page, dma_addr_t *dma, +		   enum dma_data_direction dir, gfp_t gfp_flags)  { -	*page = alloc_page(GFP_KERNEL); -	if (!*page) +	*page = alloc_page(gfp_flags); +	if (!*page) { +		priv->page_alloc_fail++;  		return -ENOMEM; +	}  	*dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir);  	if (dma_mapping_error(dev, *dma)) { +		priv->dma_mapping_error++;  		put_page(*page);  		return -ENOMEM;  	} @@ -545,20 +798,19 @@ static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id,  	qpl->id = id;  	qpl->num_entries = 0; -	qpl->pages = kvzalloc(pages * sizeof(*qpl->pages), GFP_KERNEL); +	qpl->pages = kvcalloc(pages, sizeof(*qpl->pages), GFP_KERNEL);  	/* caller handles clean up */  	if (!qpl->pages)  		return -ENOMEM; -	qpl->page_buses = kvzalloc(pages * sizeof(*qpl->page_buses), -				   GFP_KERNEL); +	qpl->page_buses = kvcalloc(pages, sizeof(*qpl->page_buses), GFP_KERNEL);  	/* caller handles clean up */  	if (!qpl->page_buses)  		return -ENOMEM;  	for (i = 0; i < pages; i++) { -		err = gve_alloc_page(&priv->pdev->dev, &qpl->pages[i], +		err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i],  				     &qpl->page_buses[i], -				     gve_qpl_dma_dir(priv, id)); +				     gve_qpl_dma_dir(priv, id), GFP_KERNEL);  		/* caller handles clean up */  		if (err)  			return -ENOMEM; @@ -578,8 +830,7 @@ void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,  		put_page(page);  } -static void gve_free_queue_page_list(struct gve_priv *priv, -				     int id) +static void gve_free_queue_page_list(struct gve_priv *priv, u32 id)  {  	struct gve_queue_page_list *qpl = &priv->qpls[id];  	int i; @@ -605,7 +856,10 @@ static int gve_alloc_qpls(struct gve_priv *priv)  	int i, j;  	int err; -	priv->qpls = kvzalloc(num_qpls * sizeof(*priv->qpls), GFP_KERNEL); +	if (num_qpls == 0) +		return 0; + +	priv->qpls = kvcalloc(num_qpls, sizeof(*priv->qpls), GFP_KERNEL);  	if (!priv->qpls)  		return -ENOMEM; @@ -617,14 +871,14 @@ static int gve_alloc_qpls(struct gve_priv *priv)  	}  	for (; i < num_qpls; i++) {  		err = gve_alloc_queue_page_list(priv, i, -						priv->rx_pages_per_qpl); +						priv->rx_data_slot_cnt);  		if (err)  			goto free_qpls;  	}  	priv->qpl_cfg.qpl_map_size = BITS_TO_LONGS(num_qpls) *  				     sizeof(unsigned long) * BITS_PER_BYTE; -	priv->qpl_cfg.qpl_id_map = kvzalloc(BITS_TO_LONGS(num_qpls) * +	priv->qpl_cfg.qpl_id_map = kvcalloc(BITS_TO_LONGS(num_qpls),  					    sizeof(unsigned long), GFP_KERNEL);  	if 
(!priv->qpl_cfg.qpl_id_map) {  		err = -ENOMEM; @@ -645,6 +899,9 @@ static void gve_free_qpls(struct gve_priv *priv)  	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);  	int i; +	if (num_qpls == 0) +		return; +  	kvfree(priv->qpl_cfg.qpl_id_map);  	for (i = 0; i < num_qpls; i++) @@ -676,6 +933,7 @@ static int gve_open(struct net_device *dev)  	err = gve_alloc_qpls(priv);  	if (err)  		return err; +  	err = gve_alloc_rings(priv);  	if (err)  		goto free_qpls; @@ -690,13 +948,27 @@ static int gve_open(struct net_device *dev)  	err = gve_register_qpls(priv);  	if (err)  		goto reset; + +	if (!gve_is_gqi(priv)) { +		/* Hard code this for now. This may be tuned in the future for +		 * performance. +		 */ +		priv->data_buffer_size_dqo = GVE_RX_BUFFER_SIZE_DQO; +	}  	err = gve_create_rings(priv);  	if (err)  		goto reset; +  	gve_set_device_rings_ok(priv); +	if (gve_get_report_stats(priv)) +		mod_timer(&priv->stats_report_timer, +			  round_jiffies(jiffies + +				msecs_to_jiffies(priv->stats_report_timer_period))); +  	gve_turnup(priv); -	netif_carrier_on(dev); +	queue_work(priv->gve_wq, &priv->service_task); +	priv->interface_up_cnt++;  	return 0;  free_rings: @@ -735,9 +1007,11 @@ static int gve_close(struct net_device *dev)  			goto err;  		gve_clear_device_rings_ok(priv);  	} +	del_timer_sync(&priv->stats_report_timer);  	gve_free_rings(priv);  	gve_free_qpls(priv); +	priv->interface_down_cnt++;  	return 0;  err: @@ -817,6 +1091,7 @@ static void gve_turndown(struct gve_priv *priv)  	netif_tx_disable(priv->dev);  	gve_clear_napi_enabled(priv); +	gve_clear_report_stats(priv);  }  static void gve_turnup(struct gve_priv *priv) @@ -832,14 +1107,24 @@ static void gve_turnup(struct gve_priv *priv)  		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];  		napi_enable(&block->napi); -		iowrite32be(0, gve_irq_doorbell(priv, block)); +		if (gve_is_gqi(priv)) { +			iowrite32be(0, gve_irq_doorbell(priv, block)); +		} else { +			gve_set_itr_coalesce_usecs_dqo(priv, block, +						       priv->tx_coalesce_usecs); +		}  	}  	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {  		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);  		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];  		napi_enable(&block->napi); -		iowrite32be(0, gve_irq_doorbell(priv, block)); +		if (gve_is_gqi(priv)) { +			iowrite32be(0, gve_irq_doorbell(priv, block)); +		} else { +			gve_set_itr_coalesce_usecs_dqo(priv, block, +						       priv->rx_coalesce_usecs); +		}  	}  	gve_set_napi_enabled(priv); @@ -847,18 +1132,93 @@ static void gve_turnup(struct gve_priv *priv)  static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)  { -	struct gve_priv *priv = netdev_priv(dev); +	struct gve_notify_block *block; +	struct gve_tx_ring *tx = NULL; +	struct gve_priv *priv; +	u32 last_nic_done; +	u32 current_time; +	u32 ntfy_idx; + +	netdev_info(dev, "Timeout on tx queue, %d", txqueue); +	priv = netdev_priv(dev); +	if (txqueue > priv->tx_cfg.num_queues) +		goto reset; + +	ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue); +	if (ntfy_idx >= priv->num_ntfy_blks) +		goto reset; + +	block = &priv->ntfy_blocks[ntfy_idx]; +	tx = block->tx; + +	current_time = jiffies_to_msecs(jiffies); +	if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time) +		goto reset; + +	/* Check to see if there are missed completions, which will allow us to +	 * kick the queue. 
+	 */ +	last_nic_done = gve_tx_load_event_counter(priv, tx); +	if (last_nic_done - tx->done) { +		netdev_info(dev, "Kicking queue %d", txqueue); +		iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block)); +		napi_schedule(&block->napi); +		tx->last_kick_msec = current_time; +		goto out; +	} // Else reset. +reset:  	gve_schedule_reset(priv); + +out: +	if (tx) +		tx->queue_timeout++;  	priv->tx_timeo_cnt++;  } +static int gve_set_features(struct net_device *netdev, +			    netdev_features_t features) +{ +	const netdev_features_t orig_features = netdev->features; +	struct gve_priv *priv = netdev_priv(netdev); +	int err; + +	if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) { +		netdev->features ^= NETIF_F_LRO; +		if (netif_carrier_ok(netdev)) { +			/* To make this process as simple as possible we +			 * teardown the device, set the new configuration, +			 * and then bring the device up again. +			 */ +			err = gve_close(netdev); +			/* We have already tried to reset in close, just fail +			 * at this point. +			 */ +			if (err) +				goto err; + +			err = gve_open(netdev); +			if (err) +				goto err; +		} +	} + +	return 0; +err: +	/* Reverts the change on error. */ +	netdev->features = orig_features; +	netif_err(priv, drv, netdev, +		  "Set features failed! !!! DISABLING ALL QUEUES !!!\n"); +	return err; +} +  static const struct net_device_ops gve_netdev_ops = { -	.ndo_start_xmit		=	gve_tx, +	.ndo_start_xmit		=	gve_start_xmit,  	.ndo_open		=	gve_open,  	.ndo_stop		=	gve_close,  	.ndo_get_stats64	=	gve_get_stats,  	.ndo_tx_timeout         =       gve_tx_timeout, +	.ndo_set_features	=	gve_set_features,  };  static void gve_handle_status(struct gve_priv *priv, u32 status) @@ -867,6 +1227,10 @@ static void gve_handle_status(struct gve_priv *priv, u32 status)  		dev_info(&priv->pdev->dev, "Device requested reset.\n");  		gve_set_do_reset(priv);  	} +	if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) { +		priv->stats_report_trigger_cnt++; +		gve_set_do_report_stats(priv); +	}  }  static void gve_handle_reset(struct gve_priv *priv) @@ -885,16 +1249,110 @@ static void gve_handle_reset(struct gve_priv *priv)  	}  } -/* Handle NIC status register changes and reset requests */ +void gve_handle_report_stats(struct gve_priv *priv) +{ +	struct stats *stats = priv->stats_report->stats; +	int idx, stats_idx = 0; +	unsigned int start = 0; +	u64 tx_bytes; + +	if (!gve_get_report_stats(priv)) +		return; + +	be64_add_cpu(&priv->stats_report->written_count, 1); +	/* tx stats */ +	if (priv->tx) { +		for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { +			u32 last_completion = 0; +			u32 tx_frames = 0; + +			/* DQO doesn't currently support these metrics. 
*/ +			if (gve_is_gqi(priv)) { +				last_completion = priv->tx[idx].done; +				tx_frames = priv->tx[idx].req; +			} + +			do { +				start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss); +				tx_bytes = priv->tx[idx].bytes_done; +			} while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start)); +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(TX_WAKE_CNT), +				.value = cpu_to_be64(priv->tx[idx].wake_queue), +				.queue_id = cpu_to_be32(idx), +			}; +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(TX_STOP_CNT), +				.value = cpu_to_be64(priv->tx[idx].stop_queue), +				.queue_id = cpu_to_be32(idx), +			}; +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(TX_FRAMES_SENT), +				.value = cpu_to_be64(tx_frames), +				.queue_id = cpu_to_be32(idx), +			}; +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(TX_BYTES_SENT), +				.value = cpu_to_be64(tx_bytes), +				.queue_id = cpu_to_be32(idx), +			}; +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED), +				.value = cpu_to_be64(last_completion), +				.queue_id = cpu_to_be32(idx), +			}; +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(TX_TIMEOUT_CNT), +				.value = cpu_to_be64(priv->tx[idx].queue_timeout), +				.queue_id = cpu_to_be32(idx), +			}; +		} +	} +	/* rx stats */ +	if (priv->rx) { +		for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE), +				.value = cpu_to_be64(priv->rx[idx].desc.seqno), +				.queue_id = cpu_to_be32(idx), +			}; +			stats[stats_idx++] = (struct stats) { +				.stat_name = cpu_to_be32(RX_BUFFERS_POSTED), +				.value = cpu_to_be64(priv->rx[0].fill_cnt), +				.queue_id = cpu_to_be32(idx), +			}; +		} +	} +} + +static void gve_handle_link_status(struct gve_priv *priv, bool link_status) +{ +	if (!gve_get_napi_enabled(priv)) +		return; + +	if (link_status == netif_carrier_ok(priv->dev)) +		return; + +	if (link_status) { +		netdev_info(priv->dev, "Device link is up.\n"); +		netif_carrier_on(priv->dev); +	} else { +		netdev_info(priv->dev, "Device link is down.\n"); +		netif_carrier_off(priv->dev); +	} +} + +/* Handle NIC status register changes, reset requests and report stats */  static void gve_service_task(struct work_struct *work)  {  	struct gve_priv *priv = container_of(work, struct gve_priv,  					     service_task); +	u32 status = ioread32be(&priv->reg_bar0->device_status); -	gve_handle_status(priv, -			  ioread32be(&priv->reg_bar0->device_status)); +	gve_handle_status(priv, status);  	gve_handle_reset(priv); +	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);  }  static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) @@ -913,6 +1371,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)  	if (skip_describe_device)  		goto setup_device; +	priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED;  	/* Get the initial information we need from the device */  	err = gve_adminq_describe_device(priv);  	if (err) { @@ -920,14 +1379,6 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)  			"Could not get device information: err=%d\n", err);  		goto err;  	} -	if (priv->dev->max_mtu > PAGE_SIZE) { -		priv->dev->max_mtu = PAGE_SIZE; -		err = gve_adminq_set_mtu(priv, priv->dev->mtu); -		if (err) { -			netif_err(priv, drv, priv->dev, "Could not set mtu"); -			goto err; -		} -	}  	
priv->dev->mtu = priv->dev->max_mtu;  	num_ntfy = pci_msix_vec_count(priv->pdev);  	if (num_ntfy <= 0) { @@ -964,10 +1415,15 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)  						priv->rx_cfg.num_queues);  	} -	netif_info(priv, drv, priv->dev, "TX queues %d, RX queues %d\n", -		   priv->tx_cfg.num_queues, priv->rx_cfg.num_queues); -	netif_info(priv, drv, priv->dev, "Max TX queues %d, Max RX queues %d\n", -		   priv->tx_cfg.max_queues, priv->rx_cfg.max_queues); +	dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n", +		 priv->tx_cfg.num_queues, priv->rx_cfg.num_queues); +	dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n", +		 priv->tx_cfg.max_queues, priv->rx_cfg.max_queues); + +	if (!gve_is_gqi(priv)) { +		priv->tx_coalesce_usecs = GVE_TX_IRQ_RATELIMIT_US_DQO; +		priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO; +	}  setup_device:  	err = gve_setup_device_resources(priv); @@ -1047,6 +1503,10 @@ int gve_reset(struct gve_priv *priv, bool attempt_teardown)  	/* Set it all back up */  	err = gve_reset_recovery(priv, was_up);  	gve_clear_reset_in_progress(priv); +	priv->reset_cnt++; +	priv->interface_up_cnt = 0; +	priv->interface_down_cnt = 0; +	priv->stats_report_trigger_cnt = 0;  	return err;  } @@ -1078,7 +1538,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)  	err = pci_enable_device(pdev);  	if (err) -		return -ENXIO; +		return err;  	err = pci_request_regions(pdev, "gvnic-cfg");  	if (err) @@ -1086,19 +1546,12 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)  	pci_set_master(pdev); -	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); +	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));  	if (err) {  		dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);  		goto abort_with_pci_region;  	} -	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); -	if (err) { -		dev_err(&pdev->dev, -			"Failed to set consistent dma mask: err=%d\n", err); -		goto abort_with_pci_region; -	} -  	reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);  	if (!reg_bar) {  		dev_err(&pdev->dev, "Failed to map pci bar!\n"); @@ -1115,19 +1568,25 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)  	gve_write_version(®_bar->driver_version);  	/* Get max queues to alloc etherdev */ -	max_rx_queues = ioread32be(®_bar->max_tx_queues); -	max_tx_queues = ioread32be(®_bar->max_rx_queues); +	max_tx_queues = ioread32be(®_bar->max_tx_queues); +	max_rx_queues = ioread32be(®_bar->max_rx_queues);  	/* Alloc and setup the netdev and priv */  	dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);  	if (!dev) {  		dev_err(&pdev->dev, "could not allocate netdev\n"); +		err = -ENOMEM;  		goto abort_with_db_bar;  	}  	SET_NETDEV_DEV(dev, &pdev->dev);  	pci_set_drvdata(pdev, dev);  	dev->ethtool_ops = &gve_ethtool_ops;  	dev->netdev_ops = &gve_netdev_ops; -	/* advertise features */ + +	/* Set default and supported features. +	 * +	 * Features might be set in other locations as well (such as +	 * `gve_adminq_describe_device`). 
+	 */  	dev->hw_features = NETIF_F_HIGHDMA;  	dev->hw_features |= NETIF_F_SG;  	dev->hw_features |= NETIF_F_HW_CSUM; @@ -1149,6 +1608,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)  	priv->db_bar2 = db_bar;  	priv->service_task_flags = 0x0;  	priv->state_flags = 0x0; +	priv->ethtool_flags = 0x0;  	gve_set_probe_in_progress(priv);  	priv->gve_wq = alloc_ordered_workqueue("gve", 0); @@ -1158,6 +1618,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)  		goto abort_with_netdev;  	}  	INIT_WORK(&priv->service_task, gve_service_task); +	INIT_WORK(&priv->stats_report_task, gve_stats_report_task);  	priv->tx_cfg.max_queues = max_tx_queues;  	priv->rx_cfg.max_queues = max_rx_queues; @@ -1167,13 +1628,17 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)  	err = register_netdev(dev);  	if (err) -		goto abort_with_wq; +		goto abort_with_gve_init;  	dev_info(&pdev->dev, "GVE version %s\n", gve_version_str); +	dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);  	gve_clear_probe_in_progress(priv);  	queue_work(priv->gve_wq, &priv->service_task);  	return 0; +abort_with_gve_init: +	gve_teardown_priv_resources(priv); +  abort_with_wq:  	destroy_workqueue(priv->gve_wq); @@ -1191,7 +1656,7 @@ abort_with_pci_region:  abort_with_enabled:  	pci_disable_device(pdev); -	return -ENXIO; +	return err;  }  static void gve_remove(struct pci_dev *pdev) @@ -1211,6 +1676,58 @@ static void gve_remove(struct pci_dev *pdev)  	pci_disable_device(pdev);  } +static void gve_shutdown(struct pci_dev *pdev) +{ +	struct net_device *netdev = pci_get_drvdata(pdev); +	struct gve_priv *priv = netdev_priv(netdev); +	bool was_up = netif_carrier_ok(priv->dev); + +	rtnl_lock(); +	if (was_up && gve_close(priv->dev)) { +		/* If the dev was up, attempt to close, if close fails, reset */ +		gve_reset_and_teardown(priv, was_up); +	} else { +		/* If the dev wasn't up or close worked, finish tearing down */ +		gve_teardown_priv_resources(priv); +	} +	rtnl_unlock(); +} + +#ifdef CONFIG_PM +static int gve_suspend(struct pci_dev *pdev, pm_message_t state) +{ +	struct net_device *netdev = pci_get_drvdata(pdev); +	struct gve_priv *priv = netdev_priv(netdev); +	bool was_up = netif_carrier_ok(priv->dev); + +	priv->suspend_cnt++; +	rtnl_lock(); +	if (was_up && gve_close(priv->dev)) { +		/* If the dev was up, attempt to close, if close fails, reset */ +		gve_reset_and_teardown(priv, was_up); +	} else { +		/* If the dev wasn't up or close worked, finish tearing down */ +		gve_teardown_priv_resources(priv); +	} +	priv->up_before_suspend = was_up; +	rtnl_unlock(); +	return 0; +} + +static int gve_resume(struct pci_dev *pdev) +{ +	struct net_device *netdev = pci_get_drvdata(pdev); +	struct gve_priv *priv = netdev_priv(netdev); +	int err; + +	priv->resume_cnt++; +	rtnl_lock(); +	err = gve_reset_recovery(priv, priv->up_before_suspend); +	rtnl_unlock(); +	return err; +} +#endif /* CONFIG_PM */ +  static const struct pci_device_id gve_id_table[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) },  	{ } @@ -1221,6 +1738,11 @@ static struct pci_driver gvnic_driver = {  	.id_table	= gve_id_table,  	.probe		= gve_probe,  	.remove		= gve_remove, +	.shutdown	= gve_shutdown, +#ifdef CONFIG_PM +	.suspend        = gve_suspend, +	.resume         = gve_resume, +#endif  };  module_pci_driver(gvnic_driver); diff --git a/drivers/net/ethernet/google/gve/gve_register.h b/drivers/net/ethernet/google/gve/gve_register.h index 84ab8893aadd..fb655463c357 100644 
--- a/drivers/net/ethernet/google/gve/gve_register.h +++ b/drivers/net/ethernet/google/gve/gve_register.h @@ -23,5 +23,6 @@ struct gve_registers {  enum gve_device_status_flags {  	GVE_DEVICE_STATUS_RESET_MASK		= BIT(1),  	GVE_DEVICE_STATUS_LINK_STATUS_MASK	= BIT(2), +	GVE_DEVICE_STATUS_REPORT_STATS_MASK	= BIT(3),  };  #endif /* _GVE_REGISTER_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index 9f52e72ff641..021bbf308d68 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -1,27 +1,51 @@  // SPDX-License-Identifier: (GPL-2.0 OR MIT)  /* Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   */  #include "gve.h"  #include "gve_adminq.h" +#include "gve_utils.h"  #include <linux/etherdevice.h> -static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) +static void gve_rx_free_buffer(struct device *dev, +			       struct gve_rx_slot_page_info *page_info, +			       union gve_rx_data_slot *data_slot)  { -	struct gve_notify_block *block = -			&priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; +	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) & +				      GVE_DATA_SLOT_ADDR_PAGE_MASK); -	block->rx = NULL; +	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1); +	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE); +} + +static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx) +{ +	u32 slots = rx->mask + 1; +	int i; + +	if (rx->data.raw_addressing) { +		for (i = 0; i < slots; i++) +			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i], +					   &rx->data.data_ring[i]); +	} else { +		for (i = 0; i < slots; i++) +			page_ref_sub(rx->data.page_info[i].page, +				     rx->data.page_info[i].pagecnt_bias - 1); +		gve_unassign_qpl(priv, rx->data.qpl->id); +		rx->data.qpl = NULL; +	} +	kvfree(rx->data.page_info); +	rx->data.page_info = NULL;  }  static void gve_rx_free_ring(struct gve_priv *priv, int idx)  {  	struct gve_rx_ring *rx = &priv->rx[idx];  	struct device *dev = &priv->pdev->dev; +	u32 slots = rx->mask + 1;  	size_t bytes; -	u32 slots;  	gve_rx_remove_from_block(priv, idx); @@ -33,11 +57,8 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx)  			  rx->q_resources, rx->q_resources_bus);  	rx->q_resources = NULL; -	gve_unassign_qpl(priv, rx->data.qpl->id); -	rx->data.qpl = NULL; -	kvfree(rx->data.page_info); +	gve_rx_unfill_pages(priv, rx); -	slots = rx->mask + 1;  	bytes = sizeof(*rx->data.data_ring) * slots;  	dma_free_coherent(dev, bytes, rx->data.data_ring,  			  rx->data.data_bus); @@ -46,19 +67,39 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx)  }  static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info, -				struct gve_rx_data_slot *slot, -				dma_addr_t addr, struct page *page) +			     dma_addr_t addr, struct page *page, __be64 *slot_addr)  {  	page_info->page = page;  	page_info->page_offset = 0;  	page_info->page_address = page_address(page); -	slot->qpl_offset = cpu_to_be64(addr); +	*slot_addr = cpu_to_be64(addr); +	/* The page already has 1 ref */ +	page_ref_add(page, INT_MAX - 1); +	page_info->pagecnt_bias = INT_MAX; +} + +static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev, +			       struct gve_rx_slot_page_info *page_info, +			       union gve_rx_data_slot *data_slot) +{ +	struct page *page; +	dma_addr_t dma; +	int err; + +	err = gve_alloc_page(priv, dev, &page, &dma, 
DMA_FROM_DEVICE, +			     GFP_ATOMIC); +	if (err) +		return err; + +	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr); +	return 0;  }  static int gve_prefill_rx_pages(struct gve_rx_ring *rx)  {  	struct gve_priv *priv = rx->gve;  	u32 slots; +	int err;  	int i;  	/* Allocate one page per Rx queue slot. Each page is split into two @@ -71,27 +112,46 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx)  	if (!rx->data.page_info)  		return -ENOMEM; -	rx->data.qpl = gve_assign_rx_qpl(priv); - +	if (!rx->data.raw_addressing) { +		rx->data.qpl = gve_assign_rx_qpl(priv); +		if (!rx->data.qpl) { +			kvfree(rx->data.page_info); +			rx->data.page_info = NULL; +			return -ENOMEM; +		} +	}  	for (i = 0; i < slots; i++) { -		struct page *page = rx->data.qpl->pages[i]; -		dma_addr_t addr = i * PAGE_SIZE; +		if (!rx->data.raw_addressing) { +			struct page *page = rx->data.qpl->pages[i]; +			dma_addr_t addr = i * PAGE_SIZE; -		gve_setup_rx_buffer(&rx->data.page_info[i], -				    &rx->data.data_ring[i], addr, page); +			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page, +					    &rx->data.data_ring[i].qpl_offset); +			continue; +		} +		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i], +					  &rx->data.data_ring[i]); +		if (err) +			goto alloc_err;  	}  	return slots; +alloc_err: +	while (i--) +		gve_rx_free_buffer(&priv->pdev->dev, +				   &rx->data.page_info[i], +				   &rx->data.data_ring[i]); +	return err;  } -static void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) +static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)  { -	u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); -	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; -	struct gve_rx_ring *rx = &priv->rx[queue_idx]; - -	block->rx = rx; -	rx->ntfy_id = ntfy_idx; +	ctx->curr_frag_cnt = 0; +	ctx->total_expected_size = 0; +	ctx->expected_frag_cnt = 0; +	ctx->skb_head = NULL; +	ctx->skb_tail = NULL; +	ctx->reuse_frags = false;  }  static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) @@ -110,8 +170,9 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)  	rx->gve = priv;  	rx->q_num = idx; -	slots = priv->rx_pages_per_qpl; +	slots = priv->rx_data_slot_cnt;  	rx->mask = slots - 1; +	rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;  	/* alloc rx data ring */  	bytes = sizeof(*rx->data.data_ring) * slots; @@ -156,9 +217,15 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)  		err = -ENOMEM;  		goto abort_with_q_resources;  	} -	rx->mask = slots - 1;  	rx->cnt = 0; +	rx->db_threshold = priv->rx_desc_cnt / 2;  	rx->desc.seqno = 1; + +	/* Allocating half-page buffers allows page-flipping which is faster +	 * than copying or allocating new pages. 
+	 */ +	rx->packet_buffer_size = PAGE_SIZE / 2; +	gve_rx_ctx_clear(&rx->ctx);  	gve_rx_add_to_block(priv, idx);  	return 0; @@ -168,7 +235,7 @@ abort_with_q_resources:  			  rx->q_resources, rx->q_resources_bus);  	rx->q_resources = NULL;  abort_filled: -	kvfree(rx->data.page_info); +	gve_rx_unfill_pages(priv, rx);  abort_with_slots:  	bytes = sizeof(*rx->data.data_ring) * slots;  	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus); @@ -201,7 +268,7 @@ int gve_rx_alloc_rings(struct gve_priv *priv)  	return err;  } -void gve_rx_free_rings(struct gve_priv *priv) +void gve_rx_free_rings_gqi(struct gve_priv *priv)  {  	int i; @@ -225,144 +292,346 @@ static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)  	return PKT_HASH_TYPE_L2;  } -static struct sk_buff *gve_rx_copy(struct net_device *dev, -				   struct napi_struct *napi, -				   struct gve_rx_slot_page_info *page_info, -				   u16 len) +static u16 gve_rx_ctx_padding(struct gve_rx_ctx *ctx)  { -	struct sk_buff *skb = napi_alloc_skb(napi, len); -	void *va = page_info->page_address + GVE_RX_PAD + -		   page_info->page_offset; +	return (ctx->curr_frag_cnt == 0) ? GVE_RX_PAD : 0; +} -	if (unlikely(!skb)) -		return NULL; +static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi, +					struct gve_rx_slot_page_info *page_info, +					u16 packet_buffer_size, u16 len, +					struct gve_rx_ctx *ctx) +{ +	u32 offset = page_info->page_offset +  gve_rx_ctx_padding(ctx); +	struct sk_buff *skb; + +	if (!ctx->skb_head) +		ctx->skb_head = napi_get_frags(napi); -	__skb_put(skb, len); +	if (unlikely(!ctx->skb_head)) +		return NULL; -	skb_copy_to_linear_data(skb, va, len); +	skb = ctx->skb_head; +	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page_info->page, +			offset, len, packet_buffer_size); -	skb->protocol = eth_type_trans(skb, dev);  	return skb;  } -static struct sk_buff *gve_rx_add_frags(struct net_device *dev, -					struct napi_struct *napi, -					struct gve_rx_slot_page_info *page_info, -					u16 len) +static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) +{ +	const __be64 offset = cpu_to_be64(PAGE_SIZE / 2); + +	/* "flip" to other packet buffer on this page */ +	page_info->page_offset ^= PAGE_SIZE / 2; +	*(slot_addr) ^= offset; +} + +static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info) +{ +	int pagecount = page_count(page_info->page); + +	/* This page is not being used by any SKBs - reuse */ +	if (pagecount == page_info->pagecnt_bias) +		return 1; +	/* This page is still being used by an SKB - we can't reuse */ +	else if (pagecount > page_info->pagecnt_bias) +		return 0; +	WARN(pagecount < page_info->pagecnt_bias, +	     "Pagecount should never be less than the bias."); +	return -1; +} + +static struct sk_buff * +gve_rx_raw_addressing(struct device *dev, struct net_device *netdev, +		      struct gve_rx_slot_page_info *page_info, u16 len, +		      struct napi_struct *napi, +		      union gve_rx_data_slot *data_slot, +		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)  { -	struct sk_buff *skb = napi_get_frags(napi); +	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx); -	if (unlikely(!skb)) +	if (!skb)  		return NULL; -	skb_add_rx_frag(skb, 0, page_info->page, -			page_info->page_offset + -			GVE_RX_PAD, len, PAGE_SIZE / 2); +	/* Optimistically stop the kernel from freeing the page. +	 * We will check again in refill to determine if we need to alloc a +	 * new page. 
+	 */ +	gve_dec_pagecnt_bias(page_info);  	return skb;  } -static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, -			     struct gve_rx_data_slot *data_ring) +static struct sk_buff * +gve_rx_qpl(struct device *dev, struct net_device *netdev, +	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info, +	   u16 len, struct napi_struct *napi, +	   union gve_rx_data_slot *data_slot)  { -	u64 addr = be64_to_cpu(data_ring->qpl_offset); +	struct gve_rx_ctx *ctx = &rx->ctx; +	struct sk_buff *skb; -	page_info->page_offset ^= PAGE_SIZE / 2; -	addr ^= PAGE_SIZE / 2; -	data_ring->qpl_offset = cpu_to_be64(addr); +	/* if raw_addressing mode is not enabled gvnic can only receive into +	 * registered segments. If the buffer can't be recycled, our only +	 * choice is to copy the data out of it so that we can return it to the +	 * device. +	 */ +	if (ctx->reuse_frags) { +		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx); +		/* No point in recycling if we didn't get the skb */ +		if (skb) { +			/* Make sure that the page isn't freed. */ +			gve_dec_pagecnt_bias(page_info); +			gve_rx_flip_buff(page_info, &data_slot->qpl_offset); +		} +	} else { +		const u16 padding = gve_rx_ctx_padding(ctx); + +		skb = gve_rx_copy(netdev, napi, page_info, len, padding, ctx); +		if (skb) { +			u64_stats_update_begin(&rx->statss); +			rx->rx_frag_copy_cnt++; +			u64_stats_update_end(&rx->statss); +		} +	} +	return skb; +} + +#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x)) +static u16 gve_rx_get_fragment_size(struct gve_rx_ctx *ctx, struct gve_rx_desc *desc) +{ +	return be16_to_cpu(desc->len) - gve_rx_ctx_padding(ctx);  } -static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc, -		   netdev_features_t feat, u32 idx) +static bool gve_rx_ctx_init(struct gve_rx_ctx *ctx, struct gve_rx_ring *rx)  { +	bool qpl_mode = !rx->data.raw_addressing, packet_size_error = false; +	bool buffer_error = false, desc_error = false, seqno_error = false;  	struct gve_rx_slot_page_info *page_info;  	struct gve_priv *priv = rx->gve; -	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi; -	struct net_device *dev = priv->dev; -	struct sk_buff *skb; -	int pagecount; -	u16 len; +	u32 idx = rx->cnt & rx->mask; +	bool reuse_frags, can_flip; +	struct gve_rx_desc *desc; +	u16 packet_size = 0; +	u16 n_frags = 0; +	int recycle; -	/* drop this packet */ -	if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR)) -		return true; +	/** In QPL mode, we only flip buffers when all buffers containing the packet +	 * can be flipped. RDA can_flip decisions will be made later, per frag. 
+	 */ +	can_flip = qpl_mode; +	reuse_frags = can_flip; +	do { +		u16 frag_size; + +		n_frags++; +		desc = &rx->desc.desc_ring[idx]; +		desc_error = unlikely(desc->flags_seq & GVE_RXF_ERR) || desc_error; +		if (GVE_SEQNO(desc->flags_seq) != rx->desc.seqno) { +			seqno_error = true; +			netdev_warn(priv->dev, +				    "RX seqno error: want=%d, got=%d, dropping packet and scheduling reset.", +				    rx->desc.seqno, GVE_SEQNO(desc->flags_seq)); +		} +		frag_size = be16_to_cpu(desc->len); +		packet_size += frag_size; +		if (frag_size > rx->packet_buffer_size) { +			packet_size_error = true; +			netdev_warn(priv->dev, +				    "RX fragment error: packet_buffer_size=%d, frag_size=%d, dropping packet.", +				    rx->packet_buffer_size, be16_to_cpu(desc->len)); +		} +		page_info = &rx->data.page_info[idx]; +		if (can_flip) { +			recycle = gve_rx_can_recycle_buffer(page_info); +			reuse_frags = reuse_frags && recycle > 0; +			buffer_error = buffer_error || unlikely(recycle < 0); +		} +		idx = (idx + 1) & rx->mask; +		rx->desc.seqno = gve_next_seqno(rx->desc.seqno); +	} while (GVE_PKTCONT_BIT_IS_SET(desc->flags_seq)); -	len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD; -	page_info = &rx->data.page_info[idx]; -	dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx], -				PAGE_SIZE, DMA_FROM_DEVICE); +	prefetch(rx->desc.desc_ring + idx); -	/* gvnic can only receive into registered segments. If the buffer -	 * can't be recycled, our only choice is to copy the data out of -	 * it so that we can return it to the device. -	 */ +	ctx->curr_frag_cnt = 0; +	ctx->total_expected_size = packet_size - GVE_RX_PAD; +	ctx->expected_frag_cnt = n_frags; +	ctx->skb_head = NULL; +	ctx->reuse_frags = reuse_frags; -	if (PAGE_SIZE == 4096) { -		if (len <= priv->rx_copybreak) { -			/* Just copy small packets */ -			skb = gve_rx_copy(dev, napi, page_info, len); -			goto have_skb; -		} -		if (unlikely(!gve_can_recycle_pages(dev))) { -			skb = gve_rx_copy(dev, napi, page_info, len); -			goto have_skb; +	if (ctx->expected_frag_cnt > 1) { +		u64_stats_update_begin(&rx->statss); +		rx->rx_cont_packet_cnt++; +		u64_stats_update_end(&rx->statss); +	} +	if (ctx->total_expected_size > priv->rx_copybreak && !ctx->reuse_frags && qpl_mode) { +		u64_stats_update_begin(&rx->statss); +		rx->rx_copied_pkt++; +		u64_stats_update_end(&rx->statss); +	} + +	if (unlikely(buffer_error || seqno_error || packet_size_error)) { +		gve_schedule_reset(priv); +		return false; +	} + +	if (unlikely(desc_error)) { +		u64_stats_update_begin(&rx->statss); +		rx->rx_desc_err_dropped_pkt++; +		u64_stats_update_end(&rx->statss); +		return false; +	} +	return true; +} + +static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx, +				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi, +				  u16 len, union gve_rx_data_slot *data_slot) +{ +	struct net_device *netdev = priv->dev; +	struct gve_rx_ctx *ctx = &rx->ctx; +	struct sk_buff *skb = NULL; + +	if (len <= priv->rx_copybreak && ctx->expected_frag_cnt == 1) { +		/* Just copy small packets */ +		skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD, ctx); +		if (skb) { +			u64_stats_update_begin(&rx->statss); +			rx->rx_copied_pkt++; +			rx->rx_frag_copy_cnt++; +			rx->rx_copybreak_pkt++; +			u64_stats_update_end(&rx->statss);  		} -		pagecount = page_count(page_info->page); -		if (pagecount == 1) { -			/* No part of this page is used by any SKBs; we attach -			 * the page fragment to a new SKB and pass it up the -			 * stack. 
-			 */ -			skb = gve_rx_add_frags(dev, napi, page_info, len); -			if (!skb) -				return true; -			/* Make sure the kernel stack can't release the page */ -			get_page(page_info->page); -			/* "flip" to other packet buffer on this page */ -			gve_rx_flip_buff(page_info, &rx->data.data_ring[idx]); -		} else if (pagecount >= 2) { -			/* We have previously passed the other half of this -			 * page up the stack, but it has not yet been freed. -			 */ -			skb = gve_rx_copy(dev, napi, page_info, len); +	} else { +		if (rx->data.raw_addressing) { +			int recycle = gve_rx_can_recycle_buffer(page_info); + +			if (unlikely(recycle < 0)) { +				gve_schedule_reset(priv); +				return NULL; +			} +			page_info->can_flip = recycle; +			if (page_info->can_flip) { +				u64_stats_update_begin(&rx->statss); +				rx->rx_frag_flip_cnt++; +				u64_stats_update_end(&rx->statss); +			} +			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev, +						    page_info, len, napi, +						    data_slot, +						    rx->packet_buffer_size, ctx);  		} else { -			WARN(pagecount < 1, "Pagecount should never be < 1"); -			return false; +			if (ctx->reuse_frags) { +				u64_stats_update_begin(&rx->statss); +				rx->rx_frag_flip_cnt++; +				u64_stats_update_end(&rx->statss); +			} +			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx, +					 page_info, len, napi, data_slot);  		} -	} else { -		skb = gve_rx_copy(dev, napi, page_info, len);  	} +	return skb; +} -have_skb: -	/* We didn't manage to allocate an skb but we haven't had any -	 * reset worthy failures. -	 */ -	if (!skb) -		return true; +static bool gve_rx(struct gve_rx_ring *rx, netdev_features_t feat, +		   u64 *packet_size_bytes, u32 *work_done) +{ +	struct gve_rx_slot_page_info *page_info; +	struct gve_rx_ctx *ctx = &rx->ctx; +	union gve_rx_data_slot *data_slot; +	struct gve_priv *priv = rx->gve; +	struct gve_rx_desc *first_desc; +	struct sk_buff *skb = NULL; +	struct gve_rx_desc *desc; +	struct napi_struct *napi; +	dma_addr_t page_bus; +	u32 work_cnt = 0; +	void *va; +	u32 idx; +	u16 len; + +	idx = rx->cnt & rx->mask; +	first_desc = &rx->desc.desc_ring[idx]; +	desc = first_desc; +	napi = &priv->ntfy_blocks[rx->ntfy_id].napi; + +	if (unlikely(!gve_rx_ctx_init(ctx, rx))) +		goto skb_alloc_fail; + +	while (ctx->curr_frag_cnt < ctx->expected_frag_cnt) { +		/* Prefetch two packet buffers ahead, we will need it soon. */ +		page_info = &rx->data.page_info[(idx + 2) & rx->mask]; +		va = page_info->page_address + page_info->page_offset; + +		prefetch(page_info->page); /* Kernel page struct. */ +		prefetch(va);              /* Packet header. */ +		prefetch(va + 64);         /* Next cacheline too. */ + +		len = gve_rx_get_fragment_size(ctx, desc); + +		page_info = &rx->data.page_info[idx]; +		data_slot = &rx->data.data_ring[idx]; +		page_bus = rx->data.raw_addressing ? 
+			   be64_to_cpu(data_slot->addr) - page_info->page_offset : +			   rx->data.qpl->page_buses[idx]; +		dma_sync_single_for_cpu(&priv->pdev->dev, page_bus, PAGE_SIZE, DMA_FROM_DEVICE); + +		skb = gve_rx_skb(priv, rx, page_info, napi, len, data_slot); +		if (!skb) { +			u64_stats_update_begin(&rx->statss); +			rx->rx_skb_alloc_fail++; +			u64_stats_update_end(&rx->statss); +			goto skb_alloc_fail; +		} + +		ctx->curr_frag_cnt++; +		rx->cnt++; +		idx = rx->cnt & rx->mask; +		work_cnt++; +		desc = &rx->desc.desc_ring[idx]; +	}  	if (likely(feat & NETIF_F_RXCSUM)) {  		/* NIC passes up the partial sum */ -		if (rx_desc->csum) +		if (first_desc->csum)  			skb->ip_summed = CHECKSUM_COMPLETE;  		else  			skb->ip_summed = CHECKSUM_NONE; -		skb->csum = csum_unfold(rx_desc->csum); +		skb->csum = csum_unfold(first_desc->csum);  	}  	/* parse flags & pass relevant info up */  	if (likely(feat & NETIF_F_RXHASH) && -	    gve_needs_rss(rx_desc->flags_seq)) -		skb_set_hash(skb, be32_to_cpu(rx_desc->rss_hash), -			     gve_rss_type(rx_desc->flags_seq)); +	    gve_needs_rss(first_desc->flags_seq)) +		skb_set_hash(skb, be32_to_cpu(first_desc->rss_hash), +			     gve_rss_type(first_desc->flags_seq)); +	*packet_size_bytes = skb->len + (skb->protocol ? ETH_HLEN : 0); +	*work_done = work_cnt; +	skb_record_rx_queue(skb, rx->q_num);  	if (skb_is_nonlinear(skb))  		napi_gro_frags(napi);  	else  		napi_gro_receive(napi, skb); + +	gve_rx_ctx_clear(ctx);  	return true; + +skb_alloc_fail: +	if (napi->skb) +		napi_free_frags(napi); +	*packet_size_bytes = 0; +	*work_done = ctx->expected_frag_cnt; +	while (ctx->curr_frag_cnt < ctx->expected_frag_cnt) { +		rx->cnt++; +		ctx->curr_frag_cnt++; +	} +	gve_rx_ctx_clear(ctx); +	return false;  } -static bool gve_rx_work_pending(struct gve_rx_ring *rx) +bool gve_rx_work_pending(struct gve_rx_ring *rx)  {  	struct gve_rx_desc *desc;  	__be16 flags_seq; @@ -372,25 +641,82 @@ static bool gve_rx_work_pending(struct gve_rx_ring *rx)  	desc = rx->desc.desc_ring + next_idx;  	flags_seq = desc->flags_seq; -	/* Make sure we have synchronized the seq no with the device */ -	smp_rmb();  	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);  } -bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, -		       netdev_features_t feat) +static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)  { +	int refill_target = rx->mask + 1; +	u32 fill_cnt = rx->fill_cnt; + +	while (fill_cnt - rx->cnt < refill_target) { +		struct gve_rx_slot_page_info *page_info; +		u32 idx = fill_cnt & rx->mask; + +		page_info = &rx->data.page_info[idx]; +		if (page_info->can_flip) { +			/* The other half of the page is free because it was +			 * free when we processed the descriptor. Flip to it. +			 */ +			union gve_rx_data_slot *data_slot = +						&rx->data.data_ring[idx]; + +			gve_rx_flip_buff(page_info, &data_slot->addr); +			page_info->can_flip = 0; +		} else { +			/* It is possible that the networking stack has already +			 * finished processing all outstanding packets in the buffer +			 * and it can be reused. +			 * Flipping is unnecessary here - if the networking stack still +			 * owns half the page it is impossible to tell which half. Either +			 * the whole page is free or it needs to be replaced. 
+			 */ +			int recycle = gve_rx_can_recycle_buffer(page_info); + +			if (recycle < 0) { +				if (!rx->data.raw_addressing) +					gve_schedule_reset(priv); +				return false; +			} +			if (!recycle) { +				/* We can't reuse the buffer - alloc a new one*/ +				union gve_rx_data_slot *data_slot = +						&rx->data.data_ring[idx]; +				struct device *dev = &priv->pdev->dev; +				gve_rx_free_buffer(dev, page_info, data_slot); +				page_info->page = NULL; +				if (gve_rx_alloc_buffer(priv, dev, page_info, +							data_slot)) { +					u64_stats_update_begin(&rx->statss); +					rx->rx_buf_alloc_fail++; +					u64_stats_update_end(&rx->statss); +					break; +				} +			} +		} +		fill_cnt++; +	} +	rx->fill_cnt = fill_cnt; +	return true; +} + +static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget, +			     netdev_features_t feat) +{ +	u32 work_done = 0, total_packet_cnt = 0, ok_packet_cnt = 0;  	struct gve_priv *priv = rx->gve; +	u32 idx = rx->cnt & rx->mask;  	struct gve_rx_desc *desc; -	u32 cnt = rx->cnt; -	u32 idx = cnt & rx->mask; -	u32 work_done = 0;  	u64 bytes = 0; -	desc = rx->desc.desc_ring + idx; +	desc = &rx->desc.desc_ring[idx];  	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&  	       work_done < budget) { +		u64 packet_size_bytes = 0; +		u32 work_cnt = 0; +		bool dropped; +  		netif_info(priv, rx_status, priv->dev,  			   "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",  			   rx->q_num, idx, desc, desc->flags_seq); @@ -398,35 +724,57 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,  			   "[%d] seqno=%d rx->desc.seqno=%d\n",  			   rx->q_num, GVE_SEQNO(desc->flags_seq),  			   rx->desc.seqno); -		bytes += be16_to_cpu(desc->len) - GVE_RX_PAD; -		if (!gve_rx(rx, desc, feat, idx)) -			gve_schedule_reset(priv); -		cnt++; -		idx = cnt & rx->mask; -		desc = rx->desc.desc_ring + idx; -		rx->desc.seqno = gve_next_seqno(rx->desc.seqno); -		work_done++; + +		dropped = !gve_rx(rx, feat, &packet_size_bytes, &work_cnt); +		if (!dropped) { +			bytes += packet_size_bytes; +			ok_packet_cnt++; +		} +		total_packet_cnt++; +		idx = rx->cnt & rx->mask; +		desc = &rx->desc.desc_ring[idx]; +		work_done += work_cnt;  	} -	if (!work_done) -		return false; +	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold) +		return 0; -	u64_stats_update_begin(&rx->statss); -	rx->rpackets += work_done; -	rx->rbytes += bytes; -	u64_stats_update_end(&rx->statss); -	rx->cnt = cnt; -	rx->fill_cnt += work_done; +	if (work_done) { +		u64_stats_update_begin(&rx->statss); +		rx->rpackets += ok_packet_cnt; +		rx->rbytes += bytes; +		u64_stats_update_end(&rx->statss); +	} + +	/* restock ring slots */ +	if (!rx->data.raw_addressing) { +		/* In QPL mode buffs are refilled as the desc are processed */ +		rx->fill_cnt += work_done; +	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { +		/* In raw addressing mode buffs are only refilled if the avail +		 * falls below a threshold. +		 */ +		if (!gve_rx_refill_buffers(priv, rx)) +			return 0; + +		/* If we were not able to completely refill buffers, we'll want +		 * to schedule this queue for work again to refill buffers. 
+		 */ +		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) { +			gve_rx_write_doorbell(priv, rx); +			return budget; +		} +	}  	gve_rx_write_doorbell(priv, rx); -	return gve_rx_work_pending(rx); +	return total_packet_cnt;  } -bool gve_rx_poll(struct gve_notify_block *block, int budget) +int gve_rx_poll(struct gve_notify_block *block, int budget)  {  	struct gve_rx_ring *rx = block->rx;  	netdev_features_t feat; -	bool repoll = false; +	int work_done = 0;  	feat = block->napi.dev->features; @@ -435,8 +783,7 @@ bool gve_rx_poll(struct gve_notify_block *block, int budget)  		budget = INT_MAX;  	if (budget > 0) -		repoll |= gve_clean_rx_done(rx, budget, feat); -	else -		repoll |= gve_rx_work_pending(rx); -	return repoll; +		work_done = gve_clean_rx_done(rx, budget, feat); + +	return work_done;  } diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c new file mode 100644 index 000000000000..2e6461b0ea8b --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_dqo.h" +#include "gve_adminq.h" +#include "gve_utils.h" +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <net/ip6_checksum.h> +#include <net/ipv6.h> +#include <net/tcp.h> + +static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) +{ +	return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; +} + +static void gve_free_page_dqo(struct gve_priv *priv, +			      struct gve_rx_buf_state_dqo *bs) +{ +	page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); +	gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, +		      DMA_FROM_DEVICE); +	bs->page_info.page = NULL; +} + +static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) +{ +	struct gve_rx_buf_state_dqo *buf_state; +	s16 buffer_id; + +	buffer_id = rx->dqo.free_buf_states; +	if (unlikely(buffer_id == -1)) +		return NULL; + +	buf_state = &rx->dqo.buf_states[buffer_id]; + +	/* Remove buf_state from free list */ +	rx->dqo.free_buf_states = buf_state->next; + +	/* Point buf_state to itself to mark it as allocated */ +	buf_state->next = buffer_id; + +	return buf_state; +} + +static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, +				       struct gve_rx_buf_state_dqo *buf_state) +{ +	s16 buffer_id = buf_state - rx->dqo.buf_states; + +	return buf_state->next == buffer_id; +} + +static void gve_free_buf_state(struct gve_rx_ring *rx, +			       struct gve_rx_buf_state_dqo *buf_state) +{ +	s16 buffer_id = buf_state - rx->dqo.buf_states; + +	buf_state->next = rx->dqo.free_buf_states; +	rx->dqo.free_buf_states = buffer_id; +} + +static struct gve_rx_buf_state_dqo * +gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list) +{ +	struct gve_rx_buf_state_dqo *buf_state; +	s16 buffer_id; + +	buffer_id = list->head; +	if (unlikely(buffer_id == -1)) +		return NULL; + +	buf_state = &rx->dqo.buf_states[buffer_id]; + +	/* Remove buf_state from list */ +	list->head = buf_state->next; +	if (buf_state->next == -1) +		list->tail = -1; + +	/* Point buf_state to itself to mark it as allocated */ +	buf_state->next = buffer_id; + +	return buf_state; +} + +static void gve_enqueue_buf_state(struct gve_rx_ring *rx, +				  struct gve_index_list *list, +				  struct gve_rx_buf_state_dqo *buf_state) +{ +	s16 buffer_id = buf_state - 
rx->dqo.buf_states; + +	buf_state->next = -1; + +	if (list->head == -1) { +		list->head = buffer_id; +		list->tail = buffer_id; +	} else { +		int tail = list->tail; + +		rx->dqo.buf_states[tail].next = buffer_id; +		list->tail = buffer_id; +	} +} + +static struct gve_rx_buf_state_dqo * +gve_get_recycled_buf_state(struct gve_rx_ring *rx) +{ +	struct gve_rx_buf_state_dqo *buf_state; +	int i; + +	/* Recycled buf states are immediately usable. */ +	buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); +	if (likely(buf_state)) +		return buf_state; + +	if (unlikely(rx->dqo.used_buf_states.head == -1)) +		return NULL; + +	/* Used buf states are only usable when ref count reaches 0, which means +	 * no SKBs refer to them. +	 * +	 * Search a limited number before giving up. +	 */ +	for (i = 0; i < 5; i++) { +		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); +		if (gve_buf_ref_cnt(buf_state) == 0) +			return buf_state; + +		gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); +	} + +	/* If there are no free buf states discard an entry from +	 * `used_buf_states` so it can be used. +	 */ +	if (unlikely(rx->dqo.free_buf_states == -1)) { +		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); +		if (gve_buf_ref_cnt(buf_state) == 0) +			return buf_state; + +		gve_free_page_dqo(rx->gve, buf_state); +		gve_free_buf_state(rx, buf_state); +	} + +	return NULL; +} + +static int gve_alloc_page_dqo(struct gve_priv *priv, +			      struct gve_rx_buf_state_dqo *buf_state) +{ +	int err; + +	err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page, +			     &buf_state->addr, DMA_FROM_DEVICE, GFP_ATOMIC); +	if (err) +		return err; + +	buf_state->page_info.page_offset = 0; +	buf_state->page_info.page_address = +		page_address(buf_state->page_info.page); +	buf_state->last_single_ref_offset = 0; + +	/* The page already has 1 ref. 
*/ +	page_ref_add(buf_state->page_info.page, INT_MAX - 1); +	buf_state->page_info.pagecnt_bias = INT_MAX; + +	return 0; +} + +static void gve_rx_free_ring_dqo(struct gve_priv *priv, int idx) +{ +	struct gve_rx_ring *rx = &priv->rx[idx]; +	struct device *hdev = &priv->pdev->dev; +	size_t completion_queue_slots; +	size_t buffer_queue_slots; +	size_t size; +	int i; + +	completion_queue_slots = rx->dqo.complq.mask + 1; +	buffer_queue_slots = rx->dqo.bufq.mask + 1; + +	gve_rx_remove_from_block(priv, idx); + +	if (rx->q_resources) { +		dma_free_coherent(hdev, sizeof(*rx->q_resources), +				  rx->q_resources, rx->q_resources_bus); +		rx->q_resources = NULL; +	} + +	for (i = 0; i < rx->dqo.num_buf_states; i++) { +		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; + +		if (bs->page_info.page) +			gve_free_page_dqo(priv, bs); +	} + +	if (rx->dqo.bufq.desc_ring) { +		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots; +		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring, +				  rx->dqo.bufq.bus); +		rx->dqo.bufq.desc_ring = NULL; +	} + +	if (rx->dqo.complq.desc_ring) { +		size = sizeof(rx->dqo.complq.desc_ring[0]) * +			completion_queue_slots; +		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring, +				  rx->dqo.complq.bus); +		rx->dqo.complq.desc_ring = NULL; +	} + +	kvfree(rx->dqo.buf_states); +	rx->dqo.buf_states = NULL; + +	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); +} + +static int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) +{ +	struct gve_rx_ring *rx = &priv->rx[idx]; +	struct device *hdev = &priv->pdev->dev; +	size_t size; +	int i; + +	const u32 buffer_queue_slots = +		priv->options_dqo_rda.rx_buff_ring_entries; +	const u32 completion_queue_slots = priv->rx_desc_cnt; + +	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n"); + +	memset(rx, 0, sizeof(*rx)); +	rx->gve = priv; +	rx->q_num = idx; +	rx->dqo.bufq.mask = buffer_queue_slots - 1; +	rx->dqo.complq.num_free_slots = completion_queue_slots; +	rx->dqo.complq.mask = completion_queue_slots - 1; +	rx->ctx.skb_head = NULL; +	rx->ctx.skb_tail = NULL; + +	rx->dqo.num_buf_states = min_t(s16, S16_MAX, buffer_queue_slots * 4); +	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, +				      sizeof(rx->dqo.buf_states[0]), +				      GFP_KERNEL); +	if (!rx->dqo.buf_states) +		return -ENOMEM; + +	/* Set up linked list of buffer IDs */ +	for (i = 0; i < rx->dqo.num_buf_states - 1; i++) +		rx->dqo.buf_states[i].next = i + 1; + +	rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1; +	rx->dqo.recycled_buf_states.head = -1; +	rx->dqo.recycled_buf_states.tail = -1; +	rx->dqo.used_buf_states.head = -1; +	rx->dqo.used_buf_states.tail = -1; + +	/* Allocate RX completion queue */ +	size = sizeof(rx->dqo.complq.desc_ring[0]) * +		completion_queue_slots; +	rx->dqo.complq.desc_ring = +		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL); +	if (!rx->dqo.complq.desc_ring) +		goto err; + +	/* Allocate RX buffer queue */ +	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots; +	rx->dqo.bufq.desc_ring = +		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL); +	if (!rx->dqo.bufq.desc_ring) +		goto err; + +	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources), +					     &rx->q_resources_bus, GFP_KERNEL); +	if (!rx->q_resources) +		goto err; + +	gve_rx_add_to_block(priv, idx); + +	return 0; + +err: +	gve_rx_free_ring_dqo(priv, idx); +	return -ENOMEM; +} + +void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx) +{ +	const struct gve_rx_ring 
*rx = &priv->rx[queue_idx]; +	u64 index = be32_to_cpu(rx->q_resources->db_index); + +	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]); +} + +int gve_rx_alloc_rings_dqo(struct gve_priv *priv) +{ +	int err = 0; +	int i; + +	for (i = 0; i < priv->rx_cfg.num_queues; i++) { +		err = gve_rx_alloc_ring_dqo(priv, i); +		if (err) { +			netif_err(priv, drv, priv->dev, +				  "Failed to alloc rx ring=%d: err=%d\n", +				  i, err); +			goto err; +		} +	} + +	return 0; + +err: +	for (i--; i >= 0; i--) +		gve_rx_free_ring_dqo(priv, i); + +	return err; +} + +void gve_rx_free_rings_dqo(struct gve_priv *priv) +{ +	int i; + +	for (i = 0; i < priv->rx_cfg.num_queues; i++) +		gve_rx_free_ring_dqo(priv, i); +} + +void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) +{ +	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; +	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; +	struct gve_priv *priv = rx->gve; +	u32 num_avail_slots; +	u32 num_full_slots; +	u32 num_posted = 0; + +	num_full_slots = (bufq->tail - bufq->head) & bufq->mask; +	num_avail_slots = bufq->mask - num_full_slots; + +	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots); +	while (num_posted < num_avail_slots) { +		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail]; +		struct gve_rx_buf_state_dqo *buf_state; + +		buf_state = gve_get_recycled_buf_state(rx); +		if (unlikely(!buf_state)) { +			buf_state = gve_alloc_buf_state(rx); +			if (unlikely(!buf_state)) +				break; + +			if (unlikely(gve_alloc_page_dqo(priv, buf_state))) { +				u64_stats_update_begin(&rx->statss); +				rx->rx_buf_alloc_fail++; +				u64_stats_update_end(&rx->statss); +				gve_free_buf_state(rx, buf_state); +				break; +			} +		} + +		desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); +		desc->buf_addr = cpu_to_le64(buf_state->addr + +					     buf_state->page_info.page_offset); + +		bufq->tail = (bufq->tail + 1) & bufq->mask; +		complq->num_free_slots--; +		num_posted++; + +		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) +			gve_rx_write_doorbell_dqo(priv, rx->q_num); +	} + +	rx->fill_cnt += num_posted; +} + +static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, +				struct gve_rx_buf_state_dqo *buf_state) +{ +	const int data_buffer_size = priv->data_buffer_size_dqo; +	int pagecount; + +	/* Can't reuse if we only fit one buffer per page */ +	if (data_buffer_size * 2 > PAGE_SIZE) +		goto mark_used; + +	pagecount = gve_buf_ref_cnt(buf_state); + +	/* Record the offset when we have a single remaining reference. +	 * +	 * When this happens, we know all of the other offsets of the page are +	 * usable. +	 */ +	if (pagecount == 1) { +		buf_state->last_single_ref_offset = +			buf_state->page_info.page_offset; +	} + +	/* Use the next buffer sized chunk in the page. */ +	buf_state->page_info.page_offset += data_buffer_size; +	buf_state->page_info.page_offset &= (PAGE_SIZE - 1); + +	/* If we wrap around to the same offset without ever dropping to 1 +	 * reference, then we don't know if this offset was ever freed. +	 */ +	if (buf_state->page_info.page_offset == +	    buf_state->last_single_ref_offset) { +		goto mark_used; +	} + +	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); +	return; + +mark_used: +	gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); +} + +static void gve_rx_skb_csum(struct sk_buff *skb, +			    const struct gve_rx_compl_desc_dqo *desc, +			    struct gve_ptype ptype) +{ +	skb->ip_summed = CHECKSUM_NONE; + +	/* HW did not identify and process L3 and L4 headers. 
*/ +	if (unlikely(!desc->l3_l4_processed)) +		return; + +	if (ptype.l3_type == GVE_L3_TYPE_IPV4) { +		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err)) +			return; +	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) { +		/* Checksum should be skipped if this flag is set. */ +		if (unlikely(desc->ipv6_ex_add)) +			return; +	} + +	if (unlikely(desc->csum_l4_err)) +		return; + +	switch (ptype.l4_type) { +	case GVE_L4_TYPE_TCP: +	case GVE_L4_TYPE_UDP: +	case GVE_L4_TYPE_ICMP: +	case GVE_L4_TYPE_SCTP: +		skb->ip_summed = CHECKSUM_UNNECESSARY; +		break; +	default: +		break; +	} +} + +static void gve_rx_skb_hash(struct sk_buff *skb, +			    const struct gve_rx_compl_desc_dqo *compl_desc, +			    struct gve_ptype ptype) +{ +	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2; + +	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN) +		hash_type = PKT_HASH_TYPE_L4; +	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN) +		hash_type = PKT_HASH_TYPE_L3; + +	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type); +} + +static void gve_rx_free_skb(struct gve_rx_ring *rx) +{ +	if (!rx->ctx.skb_head) +		return; + +	dev_kfree_skb_any(rx->ctx.skb_head); +	rx->ctx.skb_head = NULL; +	rx->ctx.skb_tail = NULL; +} + +/* Chains multi skbs for single rx packet. + * Returns 0 if buffer is appended, -1 otherwise. + */ +static int gve_rx_append_frags(struct napi_struct *napi, +			       struct gve_rx_buf_state_dqo *buf_state, +			       u16 buf_len, struct gve_rx_ring *rx, +			       struct gve_priv *priv) +{ +	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags; + +	if (unlikely(num_frags == MAX_SKB_FRAGS)) { +		struct sk_buff *skb; + +		skb = napi_alloc_skb(napi, 0); +		if (!skb) +			return -1; + +		skb_shinfo(rx->ctx.skb_tail)->frag_list = skb; +		rx->ctx.skb_tail = skb; +		num_frags = 0; +	} +	if (rx->ctx.skb_tail != rx->ctx.skb_head) { +		rx->ctx.skb_head->len += buf_len; +		rx->ctx.skb_head->data_len += buf_len; +		rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo; +	} + +	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, +			buf_state->page_info.page, +			buf_state->page_info.page_offset, +			buf_len, priv->data_buffer_size_dqo); +	gve_dec_pagecnt_bias(&buf_state->page_info); + +	return 0; +} + +/* Returns 0 if descriptor is completed successfully. + * Returns -EINVAL if descriptor is invalid. + * Returns -ENOMEM if data cannot be copied to skb. + */ +static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, +		      const struct gve_rx_compl_desc_dqo *compl_desc, +		      int queue_idx) +{ +	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id); +	const bool eop = compl_desc->end_of_packet != 0; +	struct gve_rx_buf_state_dqo *buf_state; +	struct gve_priv *priv = rx->gve; +	u16 buf_len; + +	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) { +		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n", +				    priv->dev->name, buffer_id); +		return -EINVAL; +	} +	buf_state = &rx->dqo.buf_states[buffer_id]; +	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) { +		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n", +				    priv->dev->name, buffer_id); +		return -EINVAL; +	} + +	if (unlikely(compl_desc->rx_error)) { +		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, +				      buf_state); +		return -EINVAL; +	} + +	buf_len = compl_desc->packet_len; + +	/* Page might have not been used for awhile and was likely last written +	 * by a different thread. +	 */ +	prefetch(buf_state->page_info.page); + +	/* Sync the portion of dma buffer for CPU to read. 
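+	 * Only buf_len bytes at this buffer's page_offset are synced, not the whole page.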
*/ +	dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr, +				      buf_state->page_info.page_offset, +				      buf_len, DMA_FROM_DEVICE); + +	/* Append to current skb if one exists. */ +	if (rx->ctx.skb_head) { +		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx, +						 priv)) != 0) { +			goto error; +		} + +		gve_try_recycle_buf(priv, rx, buf_state); +		return 0; +	} + +	if (eop && buf_len <= priv->rx_copybreak) { +		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi, +					       &buf_state->page_info, buf_len, 0, NULL); +		if (unlikely(!rx->ctx.skb_head)) +			goto error; +		rx->ctx.skb_tail = rx->ctx.skb_head; + +		u64_stats_update_begin(&rx->statss); +		rx->rx_copied_pkt++; +		rx->rx_copybreak_pkt++; +		u64_stats_update_end(&rx->statss); + +		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, +				      buf_state); +		return 0; +	} + +	rx->ctx.skb_head = napi_get_frags(napi); +	if (unlikely(!rx->ctx.skb_head)) +		goto error; +	rx->ctx.skb_tail = rx->ctx.skb_head; + +	skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page, +			buf_state->page_info.page_offset, buf_len, +			priv->data_buffer_size_dqo); +	gve_dec_pagecnt_bias(&buf_state->page_info); + +	gve_try_recycle_buf(priv, rx, buf_state); +	return 0; + +error: +	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); +	return -ENOMEM; +} + +static int gve_rx_complete_rsc(struct sk_buff *skb, +			       const struct gve_rx_compl_desc_dqo *desc, +			       struct gve_ptype ptype) +{ +	struct skb_shared_info *shinfo = skb_shinfo(skb); + +	/* Only TCP is supported right now. */ +	if (ptype.l4_type != GVE_L4_TYPE_TCP) +		return -EINVAL; + +	switch (ptype.l3_type) { +	case GVE_L3_TYPE_IPV4: +		shinfo->gso_type = SKB_GSO_TCPV4; +		break; +	case GVE_L3_TYPE_IPV6: +		shinfo->gso_type = SKB_GSO_TCPV6; +		break; +	default: +		return -EINVAL; +	} + +	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len); +	return 0; +} + +/* Returns 0 if skb is completed successfully, -1 otherwise. */ +static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi, +			       const struct gve_rx_compl_desc_dqo *desc, +			       netdev_features_t feat) +{ +	struct gve_ptype ptype = +		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type]; +	int err; + +	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num); + +	if (feat & NETIF_F_RXHASH) +		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype); + +	if (feat & NETIF_F_RXCSUM) +		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype); + +	/* RSC packets must set gso_size otherwise the TCP stack will complain +	 * that packets are larger than MTU. +	 */ +	if (desc->rsc) { +		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype); +		if (err < 0) +			return err; +	} + +	if (skb_headlen(rx->ctx.skb_head) == 0) +		napi_gro_frags(napi); +	else +		napi_gro_receive(napi, rx->ctx.skb_head); + +	return 0; +} + +int gve_rx_poll_dqo(struct gve_notify_block *block, int budget) +{ +	struct napi_struct *napi = &block->napi; +	netdev_features_t feat = napi->dev->features; + +	struct gve_rx_ring *rx = block->rx; +	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; + +	u32 work_done = 0; +	u64 bytes = 0; +	int err; + +	while (work_done < budget) { +		struct gve_rx_compl_desc_dqo *compl_desc = +			&complq->desc_ring[complq->head]; +		u32 pkt_bytes; + +		/* No more new packets */ +		if (compl_desc->generation == complq->cur_gen_bit) +			break; + +		/* Prefetch the next two descriptors. 
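+		 * The index is masked so the prefetch stays inside the ring when head is about to wrap.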
*/ +		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]); +		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]); + +		/* Do not read data until we own the descriptor */ +		dma_rmb(); + +		err = gve_rx_dqo(napi, rx, compl_desc, rx->q_num); +		if (err < 0) { +			gve_rx_free_skb(rx); +			u64_stats_update_begin(&rx->statss); +			if (err == -ENOMEM) +				rx->rx_skb_alloc_fail++; +			else if (err == -EINVAL) +				rx->rx_desc_err_dropped_pkt++; +			u64_stats_update_end(&rx->statss); +		} + +		complq->head = (complq->head + 1) & complq->mask; +		complq->num_free_slots++; + +		/* When the ring wraps, the generation bit is flipped. */ +		complq->cur_gen_bit ^= (complq->head == 0); + +		/* Receiving a completion means we have space to post another +		 * buffer on the buffer queue. +		 */ +		{ +			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; + +			bufq->head = (bufq->head + 1) & bufq->mask; +		} + +		/* Free running counter of completed descriptors */ +		rx->cnt++; + +		if (!rx->ctx.skb_head) +			continue; + +		if (!compl_desc->end_of_packet) +			continue; + +		work_done++; +		pkt_bytes = rx->ctx.skb_head->len; +		/* The ethernet header (first ETH_HLEN bytes) is snipped off +		 * by eth_type_trans. +		 */ +		if (skb_headlen(rx->ctx.skb_head)) +			pkt_bytes += ETH_HLEN; + +		/* gve_rx_complete_skb() will consume skb if successful */ +		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) { +			gve_rx_free_skb(rx); +			u64_stats_update_begin(&rx->statss); +			rx->rx_desc_err_dropped_pkt++; +			u64_stats_update_end(&rx->statss); +			continue; +		} + +		bytes += pkt_bytes; +		rx->ctx.skb_head = NULL; +		rx->ctx.skb_tail = NULL; +	} + +	gve_rx_post_buffers_dqo(rx); + +	u64_stats_update_begin(&rx->statss); +	rx->rpackets += work_done; +	rx->rbytes += bytes; +	u64_stats_update_end(&rx->statss); + +	return work_done; +} diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index d0244feb0301..4888bf05fbed 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -1,11 +1,12 @@  // SPDX-License-Identifier: (GPL-2.0 OR MIT)  /* Google virtual Ethernet (gve) driver   * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc.   
*/  #include "gve.h"  #include "gve_adminq.h" +#include "gve_utils.h"  #include <linux/ip.h>  #include <linux/tcp.h>  #include <linux/vmalloc.h> @@ -131,14 +132,6 @@ static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)  	atomic_add(bytes, &fifo->available);  } -static void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) -{ -	struct gve_notify_block *block = -			&priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; - -	block->tx = NULL; -} -  static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,  			     u32 to_do, bool try_to_wake); @@ -151,16 +144,18 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx)  	gve_tx_remove_from_block(priv, idx);  	slots = tx->mask + 1; -	gve_clean_tx_done(priv, tx, tx->req, false); +	gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);  	netdev_tx_reset_queue(tx->netdev_txq);  	dma_free_coherent(hdev, sizeof(*tx->q_resources),  			  tx->q_resources, tx->q_resources_bus);  	tx->q_resources = NULL; -	gve_tx_fifo_release(priv, &tx->tx_fifo); -	gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); -	tx->tx_fifo.qpl = NULL; +	if (!tx->raw_addressing) { +		gve_tx_fifo_release(priv, &tx->tx_fifo); +		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); +		tx->tx_fifo.qpl = NULL; +	}  	bytes = sizeof(*tx->desc) * slots;  	dma_free_coherent(hdev, bytes, tx->desc, tx->bus); @@ -172,16 +167,6 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx)  	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);  } -static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) -{ -	int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); -	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; -	struct gve_tx_ring *tx = &priv->tx[queue_idx]; - -	block->tx = tx; -	tx->ntfy_id = ntfy_idx; -} -  static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)  {  	struct gve_tx_ring *tx = &priv->tx[idx]; @@ -191,6 +176,7 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)  	/* Make sure everything is zeroed to start */  	memset(tx, 0, sizeof(*tx)); +	spin_lock_init(&tx->clean_lock);  	tx->q_num = idx;  	tx->mask = slots - 1; @@ -206,11 +192,16 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)  	if (!tx->desc)  		goto abort_with_info; -	tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); - -	/* map Tx FIFO */ -	if (gve_tx_fifo_init(priv, &tx->tx_fifo)) -		goto abort_with_desc; +	tx->raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT; +	tx->dev = &priv->pdev->dev; +	if (!tx->raw_addressing) { +		tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); +		if (!tx->tx_fifo.qpl) +			goto abort_with_desc; +		/* map Tx FIFO */ +		if (gve_tx_fifo_init(priv, &tx->tx_fifo)) +			goto abort_with_qpl; +	}  	tx->q_resources =  		dma_alloc_coherent(hdev, @@ -228,7 +219,11 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)  	return 0;  abort_with_fifo: -	gve_tx_fifo_release(priv, &tx->tx_fifo); +	if (!tx->raw_addressing) +		gve_tx_fifo_release(priv, &tx->tx_fifo); +abort_with_qpl: +	if (!tx->raw_addressing) +		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);  abort_with_desc:  	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);  	tx->desc = NULL; @@ -262,7 +257,7 @@ int gve_tx_alloc_rings(struct gve_priv *priv)  	return err;  } -void gve_tx_free_rings(struct gve_priv *priv) +void gve_tx_free_rings_gqi(struct gve_priv *priv)  {  	int i; @@ -301,53 +296,81 @@ static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,  	return bytes;  } -/* The most descriptors we could need are 3 - 1 for the 
headers, 1 for - * the beginning of the payload at the end of the FIFO, and 1 if the - * payload wraps to the beginning of the FIFO. +/* The most descriptors we could need is MAX_SKB_FRAGS + 4 : + * 1 for each skb frag + * 1 for the skb linear portion + * 1 for when tcp hdr needs to be in separate descriptor + * 1 if the payload wraps to the beginning of the FIFO + * 1 for metadata descriptor   */ -#define MAX_TX_DESC_NEEDED	3 +#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4) +static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info) +{ +	if (info->skb) { +		dma_unmap_single(dev, dma_unmap_addr(info, dma), +				 dma_unmap_len(info, len), +				 DMA_TO_DEVICE); +		dma_unmap_len_set(info, len, 0); +	} else { +		dma_unmap_page(dev, dma_unmap_addr(info, dma), +			       dma_unmap_len(info, len), +			       DMA_TO_DEVICE); +		dma_unmap_len_set(info, len, 0); +	} +}  /* Check if sufficient resources (descriptor ring space, FIFO space) are   * available to transmit the given number of bytes.   */  static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)  { -	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && -		gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required)); +	bool can_alloc = true; + +	if (!tx->raw_addressing) +		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required); + +	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);  } +static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED); +  /* Stops the queue if the skb cannot be transmitted. */ -static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb) +static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx, +			     struct sk_buff *skb)  { -	int bytes_required; +	int bytes_required = 0; +	u32 nic_done; +	u32 to_do; +	int ret; + +	if (!tx->raw_addressing) +		bytes_required = gve_skb_fifo_bytes_required(tx, skb); -	bytes_required = gve_skb_fifo_bytes_required(tx, skb);  	if (likely(gve_can_tx(tx, bytes_required)))  		return 0; -	/* No space, so stop the queue */ -	tx->stop_queue++; -	netif_tx_stop_queue(tx->netdev_txq); -	smp_mb();	/* sync with restarting queue in gve_clean_tx_done() */ - -	/* Now check for resources again, in case gve_clean_tx_done() freed -	 * resources after we checked and we stopped the queue after -	 * gve_clean_tx_done() checked. -	 * -	 * gve_maybe_stop_tx()			gve_clean_tx_done() -	 *   nsegs/can_alloc test failed -	 *					  gve_tx_free_fifo() -	 *					  if (tx queue stopped) -	 *					    netif_tx_queue_wake() -	 *   netif_tx_stop_queue() -	 *   Need to check again for space here! 
-	 */ -	if (likely(!gve_can_tx(tx, bytes_required))) -		return -EBUSY; +	ret = -EBUSY; +	spin_lock(&tx->clean_lock); +	nic_done = gve_tx_load_event_counter(priv, tx); +	to_do = nic_done - tx->done; -	netif_tx_start_queue(tx->netdev_txq); -	tx->wake_queue++; -	return 0; +	/* Only try to clean if there is hope for TX */ +	if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) { +		if (to_do > 0) { +			to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT); +			gve_clean_tx_done(priv, tx, to_do, false); +		} +		if (likely(gve_can_tx(tx, bytes_required))) +			ret = 0; +	} +	if (ret) { +		/* No space, so stop the queue */ +		tx->stop_queue++; +		netif_tx_stop_queue(tx->netdev_txq); +	} +	spin_unlock(&tx->clean_lock); + +	return ret;  }  static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc, @@ -375,6 +398,19 @@ static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,  	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);  } +static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc, +				 struct sk_buff *skb) +{ +	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt)); + +	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; +	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT | +				   GVE_MTD_PATH_HASH_L4; +	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash); +	mtd_desc->mtd.reserved0 = 0; +	mtd_desc->mtd.reserved1 = 0; +} +  static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,  				 struct sk_buff *skb, bool is_gso,  				 u16 len, u64 addr) @@ -395,21 +431,18 @@ static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,  {  	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;  	u64 first_page = iov_offset / PAGE_SIZE; -	dma_addr_t dma;  	u64 page; -	for (page = first_page; page <= last_page; page++) { -		dma = page_buses[page]; -		dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE); -	} +	for (page = first_page; page <= last_page; page++) +		dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE);  } -static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, -			  struct device *dev) +static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)  {  	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;  	union gve_tx_desc *pkt_desc, *seg_desc;  	struct gve_tx_buffer_state *info; +	int mtd_desc_nr = !!skb->l4_hash;  	bool is_gso = skb_is_gso(skb);  	u32 idx = tx->req & tx->mask;  	int payload_iov = 2; @@ -441,19 +474,24 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,  					   &info->iov[payload_iov]);  	gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, -			     1 + payload_nfrags, hlen, +			     1 + mtd_desc_nr + payload_nfrags, hlen,  			     info->iov[hdr_nfrags - 1].iov_offset);  	skb_copy_bits(skb, 0,  		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,  		      hlen); -	gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, +	gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,  				info->iov[hdr_nfrags - 1].iov_offset,  				info->iov[hdr_nfrags - 1].iov_len);  	copy_offset = hlen; +	if (mtd_desc_nr) { +		next_idx = (tx->req + 1) & tx->mask; +		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb); +	} +  	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { -		next_idx = (tx->req + 1 + i - payload_iov) & tx->mask; +		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;  		seg_desc = &tx->desc[next_idx];  		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, @@ -463,13 
+501,109 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,  		skb_copy_bits(skb, copy_offset,  			      tx->tx_fifo.base + info->iov[i].iov_offset,  			      info->iov[i].iov_len); -		gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, +		gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,  					info->iov[i].iov_offset,  					info->iov[i].iov_len);  		copy_offset += info->iov[i].iov_len;  	} -	return 1 + payload_nfrags; +	return 1 + mtd_desc_nr + payload_nfrags; +} + +static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx, +				  struct sk_buff *skb) +{ +	const struct skb_shared_info *shinfo = skb_shinfo(skb); +	int hlen, num_descriptors, l4_hdr_offset; +	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc; +	struct gve_tx_buffer_state *info; +	int mtd_desc_nr = !!skb->l4_hash; +	bool is_gso = skb_is_gso(skb); +	u32 idx = tx->req & tx->mask; +	u64 addr; +	u32 len; +	int i; + +	info = &tx->info[idx]; +	pkt_desc = &tx->desc[idx]; + +	l4_hdr_offset = skb_checksum_start_offset(skb); +	/* If the skb is gso, then we want only up to the tcp header in the first segment +	 * to efficiently replicate on each segment otherwise we want the linear portion +	 * of the skb (which will contain the checksum because skb->csum_start and +	 * skb->csum_offset are given relative to skb->head) in the first segment. +	 */ +	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb); +	len = skb_headlen(skb); + +	info->skb =  skb; + +	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); +	if (unlikely(dma_mapping_error(tx->dev, addr))) { +		tx->dma_mapping_error++; +		goto drop; +	} +	dma_unmap_len_set(info, len, len); +	dma_unmap_addr_set(info, dma, addr); + +	num_descriptors = 1 + shinfo->nr_frags; +	if (hlen < len) +		num_descriptors++; +	if (mtd_desc_nr) +		num_descriptors++; + +	gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, +			     num_descriptors, hlen, addr); + +	if (mtd_desc_nr) { +		idx = (idx + 1) & tx->mask; +		mtd_desc = &tx->desc[idx]; +		gve_tx_fill_mtd_desc(mtd_desc, skb); +	} + +	if (hlen < len) { +		/* For gso the rest of the linear portion of the skb needs to +		 * be in its own descriptor. 
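+		 * Advance past the hlen bytes already covered by the packet descriptor and describe the remainder here.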
+		 */ +		len -= hlen; +		addr += hlen; +		idx = (idx + 1) & tx->mask; +		seg_desc = &tx->desc[idx]; +		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr); +	} + +	for (i = 0; i < shinfo->nr_frags; i++) { +		const skb_frag_t *frag = &shinfo->frags[i]; + +		idx = (idx + 1) & tx->mask; +		seg_desc = &tx->desc[idx]; +		len = skb_frag_size(frag); +		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); +		if (unlikely(dma_mapping_error(tx->dev, addr))) { +			tx->dma_mapping_error++; +			goto unmap_drop; +		} +		tx->info[idx].skb = NULL; +		dma_unmap_len_set(&tx->info[idx], len, len); +		dma_unmap_addr_set(&tx->info[idx], dma, addr); + +		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr); +	} + +	return num_descriptors; + +unmap_drop: +	i += num_descriptors - shinfo->nr_frags; +	while (i--) { +		/* Skip metadata descriptor, if set */ +		if (i == 1 && mtd_desc_nr == 1) +			continue; +		idx--; +		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]); +	} +drop: +	tx->dropped_pkt++; +	return 0;  }  netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) @@ -478,10 +612,10 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)  	struct gve_tx_ring *tx;  	int nsegs; -	WARN(skb_get_queue_mapping(skb) > priv->tx_cfg.num_queues, +	WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues,  	     "skb queue index out of range");  	tx = &priv->tx[skb_get_queue_mapping(skb)]; -	if (unlikely(gve_maybe_stop_tx(tx, skb))) { +	if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) {  		/* We need to ring the txq doorbell -- we have stopped the Tx  		 * queue for want of resources, but prior calls to gve_tx()  		 * may have added descriptors without ringing the doorbell. @@ -490,17 +624,26 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)  		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);  		return NETDEV_TX_BUSY;  	} -	nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev); - -	netdev_tx_sent_queue(tx->netdev_txq, skb->len); -	skb_tx_timestamp(skb); - -	/* give packets to NIC */ -	tx->req += nsegs; +	if (tx->raw_addressing) +		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb); +	else +		nsegs = gve_tx_add_skb_copy(priv, tx, skb); + +	/* If the packet is getting sent, we need to update the skb */ +	if (nsegs) { +		netdev_tx_sent_queue(tx->netdev_txq, skb->len); +		skb_tx_timestamp(skb); +		tx->req += nsegs; +	} else { +		dev_kfree_skb_any(skb); +	}  	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())  		return NETDEV_TX_OK; +	/* Give packets to NIC. Even if this packet failed to send the doorbell +	 * might need to be rung because of xmit_more. 
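+	 * Earlier skbs in an xmit_more batch may still be waiting on this doorbell write.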
+	 */  	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);  	return NETDEV_TX_OK;  } @@ -525,24 +668,29 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,  		info = &tx->info[idx];  		skb = info->skb; +		/* Unmap the buffer */ +		if (tx->raw_addressing) +			gve_tx_unmap_buf(tx->dev, info); +		tx->done++;  		/* Mark as free */  		if (skb) {  			info->skb = NULL;  			bytes += skb->len;  			pkts++;  			dev_consume_skb_any(skb); +			if (tx->raw_addressing) +				continue;  			/* FIFO free */  			for (i = 0; i < ARRAY_SIZE(info->iov); i++) { -				space_freed += info->iov[i].iov_len + -					       info->iov[i].iov_padding; +				space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;  				info->iov[i].iov_len = 0;  				info->iov[i].iov_padding = 0;  			}  		} -		tx->done++;  	} -	gve_tx_free_fifo(&tx->tx_fifo, space_freed); +	if (!tx->raw_addressing) +		gve_tx_free_fifo(&tx->tx_fifo, space_freed);  	u64_stats_update_begin(&tx->statss);  	tx->bytes_done += bytes;  	tx->pkt_done += pkts; @@ -563,19 +711,19 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,  	return pkts;  } -__be32 gve_tx_load_event_counter(struct gve_priv *priv, -				 struct gve_tx_ring *tx) +u32 gve_tx_load_event_counter(struct gve_priv *priv, +			      struct gve_tx_ring *tx)  { -	u32 counter_index = be32_to_cpu((tx->q_resources->counter_index)); +	u32 counter_index = be32_to_cpu(tx->q_resources->counter_index); +	__be32 counter = READ_ONCE(priv->counter_array[counter_index]); -	return READ_ONCE(priv->counter_array[counter_index]); +	return be32_to_cpu(counter);  }  bool gve_tx_poll(struct gve_notify_block *block, int budget)  {  	struct gve_priv *priv = block->priv;  	struct gve_tx_ring *tx = block->tx; -	bool repoll = false;  	u32 nic_done;  	u32 to_do; @@ -583,17 +731,23 @@ bool gve_tx_poll(struct gve_notify_block *block, int budget)  	if (budget == 0)  		budget = INT_MAX; +	/* In TX path, it may try to clean completed pkts in order to xmit, +	 * to avoid cleaning conflict, use spin_lock(), it yields better +	 * concurrency between xmit/clean than netif's lock. +	 */ +	spin_lock(&tx->clean_lock);  	/* Find out how much work there is to be done */ -	tx->last_nic_done = gve_tx_load_event_counter(priv, tx); -	nic_done = be32_to_cpu(tx->last_nic_done); -	if (budget > 0) { -		/* Do as much work as we have that the budget will -		 * allow -		 */ -		to_do = min_t(u32, (nic_done - tx->done), budget); -		gve_clean_tx_done(priv, tx, to_do, true); -	} +	nic_done = gve_tx_load_event_counter(priv, tx); +	to_do = min_t(u32, (nic_done - tx->done), budget); +	gve_clean_tx_done(priv, tx, to_do, true); +	spin_unlock(&tx->clean_lock);  	/* If we still have work we want to repoll */ -	repoll |= (nic_done != tx->done); -	return repoll; +	return nic_done != tx->done; +} + +bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx) +{ +	u32 nic_done = gve_tx_load_event_counter(priv, tx); + +	return nic_done != tx->done;  } diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c new file mode 100644 index 000000000000..588d64819ed5 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -0,0 +1,1022 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. 
+ */ + +#include "gve.h" +#include "gve_adminq.h" +#include "gve_utils.h" +#include "gve_dqo.h" +#include <linux/tcp.h> +#include <linux/slab.h> +#include <linux/skbuff.h> + +/* Returns true if a gve_tx_pending_packet_dqo object is available. */ +static bool gve_has_pending_packet(struct gve_tx_ring *tx) +{ +	/* Check TX path's list. */ +	if (tx->dqo_tx.free_pending_packets != -1) +		return true; + +	/* Check completion handler's list. */ +	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1) +		return true; + +	return false; +} + +static struct gve_tx_pending_packet_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ +	struct gve_tx_pending_packet_dqo *pending_packet; +	s16 index; + +	index = tx->dqo_tx.free_pending_packets; + +	/* No pending_packets available, try to steal the list from the +	 * completion handler. +	 */ +	if (unlikely(index == -1)) { +		tx->dqo_tx.free_pending_packets = +			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1); +		index = tx->dqo_tx.free_pending_packets; + +		if (unlikely(index == -1)) +			return NULL; +	} + +	pending_packet = &tx->dqo.pending_packets[index]; + +	/* Remove pending_packet from free list */ +	tx->dqo_tx.free_pending_packets = pending_packet->next; +	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + +	return pending_packet; +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, +			struct gve_tx_pending_packet_dqo *pending_packet) +{ +	s16 index = pending_packet - tx->dqo.pending_packets; + +	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED; +	while (true) { +		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets); + +		pending_packet->next = old_head; +		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets, +				   old_head, index) == old_head) { +			break; +		} +	} +} + +/* gve_tx_free_desc - Cleans up all pending tx requests and buffers. 
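+ * Unmaps every buffer still owned by a pending packet and frees any skb attached to it.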
+ */ +static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx) +{ +	int i; + +	for (i = 0; i < tx->dqo.num_pending_packets; i++) { +		struct gve_tx_pending_packet_dqo *cur_state = +			&tx->dqo.pending_packets[i]; +		int j; + +		for (j = 0; j < cur_state->num_bufs; j++) { +			if (j == 0) { +				dma_unmap_single(tx->dev, +					dma_unmap_addr(cur_state, dma[j]), +					dma_unmap_len(cur_state, len[j]), +					DMA_TO_DEVICE); +			} else { +				dma_unmap_page(tx->dev, +					dma_unmap_addr(cur_state, dma[j]), +					dma_unmap_len(cur_state, len[j]), +					DMA_TO_DEVICE); +			} +		} +		if (cur_state->skb) { +			dev_consume_skb_any(cur_state->skb); +			cur_state->skb = NULL; +		} +	} +} + +static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx) +{ +	struct gve_tx_ring *tx = &priv->tx[idx]; +	struct device *hdev = &priv->pdev->dev; +	size_t bytes; + +	gve_tx_remove_from_block(priv, idx); + +	if (tx->q_resources) { +		dma_free_coherent(hdev, sizeof(*tx->q_resources), +				  tx->q_resources, tx->q_resources_bus); +		tx->q_resources = NULL; +	} + +	if (tx->dqo.compl_ring) { +		bytes = sizeof(tx->dqo.compl_ring[0]) * +			(tx->dqo.complq_mask + 1); +		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring, +				  tx->complq_bus_dqo); +		tx->dqo.compl_ring = NULL; +	} + +	if (tx->dqo.tx_ring) { +		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); +		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus); +		tx->dqo.tx_ring = NULL; +	} + +	kvfree(tx->dqo.pending_packets); +	tx->dqo.pending_packets = NULL; + +	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); +} + +static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx) +{ +	struct gve_tx_ring *tx = &priv->tx[idx]; +	struct device *hdev = &priv->pdev->dev; +	int num_pending_packets; +	size_t bytes; +	int i; + +	memset(tx, 0, sizeof(*tx)); +	tx->q_num = idx; +	tx->dev = &priv->pdev->dev; +	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx); +	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0); + +	/* Queue sizes must be a power of 2 */ +	tx->mask = priv->tx_desc_cnt - 1; +	tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1; + +	/* The max number of pending packets determines the maximum number of +	 * descriptors which maybe written to the completion queue. +	 * +	 * We must set the number small enough to make sure we never overrun the +	 * completion queue. +	 */ +	num_pending_packets = tx->dqo.complq_mask + 1; + +	/* Reserve space for descriptor completions, which will be reported at +	 * most every GVE_TX_MIN_RE_INTERVAL packets. +	 */ +	num_pending_packets -= +		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL; + +	/* Each packet may have at most 2 buffer completions if it receives both +	 * a miss and reinjection completion. 
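+	 * Halving the count leaves room for both, so the completion ring cannot overflow.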
+	 */ +	num_pending_packets /= 2; + +	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX); +	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets, +					   sizeof(tx->dqo.pending_packets[0]), +					   GFP_KERNEL); +	if (!tx->dqo.pending_packets) +		goto err; + +	/* Set up linked list of pending packets */ +	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++) +		tx->dqo.pending_packets[i].next = i + 1; + +	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1; +	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1); +	tx->dqo_compl.miss_completions.head = -1; +	tx->dqo_compl.miss_completions.tail = -1; +	tx->dqo_compl.timed_out_completions.head = -1; +	tx->dqo_compl.timed_out_completions.tail = -1; + +	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); +	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL); +	if (!tx->dqo.tx_ring) +		goto err; + +	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1); +	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes, +						&tx->complq_bus_dqo, +						GFP_KERNEL); +	if (!tx->dqo.compl_ring) +		goto err; + +	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources), +					     &tx->q_resources_bus, GFP_KERNEL); +	if (!tx->q_resources) +		goto err; + +	gve_tx_add_to_block(priv, idx); + +	return 0; + +err: +	gve_tx_free_ring_dqo(priv, idx); +	return -ENOMEM; +} + +int gve_tx_alloc_rings_dqo(struct gve_priv *priv) +{ +	int err = 0; +	int i; + +	for (i = 0; i < priv->tx_cfg.num_queues; i++) { +		err = gve_tx_alloc_ring_dqo(priv, i); +		if (err) { +			netif_err(priv, drv, priv->dev, +				  "Failed to alloc tx ring=%d: err=%d\n", +				  i, err); +			goto err; +		} +	} + +	return 0; + +err: +	for (i--; i >= 0; i--) +		gve_tx_free_ring_dqo(priv, i); + +	return err; +} + +void gve_tx_free_rings_dqo(struct gve_priv *priv) +{ +	int i; + +	for (i = 0; i < priv->tx_cfg.num_queues; i++) { +		struct gve_tx_ring *tx = &priv->tx[i]; + +		gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL); +		netdev_tx_reset_queue(tx->netdev_txq); +		gve_tx_clean_pending_packets(tx); + +		gve_tx_free_ring_dqo(priv, i); +	} +} + +/* Returns the number of slots available in the ring */ +static u32 num_avail_tx_slots(const struct gve_tx_ring *tx) +{ +	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask; + +	return tx->mask - num_used; +} + +/* Stops the queue if available descriptors is less than 'count'. + * Return: 0 if stop is not required. + */ +static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count) +{ +	if (likely(gve_has_pending_packet(tx) && +		   num_avail_tx_slots(tx) >= count)) +		return 0; + +	/* Update cached TX head pointer */ +	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + +	if (likely(gve_has_pending_packet(tx) && +		   num_avail_tx_slots(tx) >= count)) +		return 0; + +	/* No space, so stop the queue */ +	tx->stop_queue++; +	netif_tx_stop_queue(tx->netdev_txq); + +	/* Sync with restarting queue in `gve_tx_poll_dqo()` */ +	mb(); + +	/* After stopping queue, check if we can transmit again in order to +	 * avoid TOCTOU bug. 
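+	 * The completion path may have freed space between the check above and the stop.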
+	 */ +	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + +	if (likely(!gve_has_pending_packet(tx) || +		   num_avail_tx_slots(tx) < count)) +		return -EBUSY; + +	netif_tx_start_queue(tx->netdev_txq); +	tx->wake_queue++; +	return 0; +} + +static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb, +					struct gve_tx_metadata_dqo *metadata) +{ +	memset(metadata, 0, sizeof(*metadata)); +	metadata->version = GVE_TX_METADATA_VERSION_DQO; + +	if (skb->l4_hash) { +		u16 path_hash = skb->hash ^ (skb->hash >> 16); + +		path_hash &= (1 << 15) - 1; +		if (unlikely(path_hash == 0)) +			path_hash = ~path_hash; + +		metadata->path_hash = path_hash; +	} +} + +static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx, +				     struct sk_buff *skb, u32 len, u64 addr, +				     s16 compl_tag, bool eop, bool is_gso) +{ +	const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL; + +	while (len > 0) { +		struct gve_tx_pkt_desc_dqo *desc = +			&tx->dqo.tx_ring[*desc_idx].pkt; +		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO); +		bool cur_eop = eop && cur_len == len; + +		*desc = (struct gve_tx_pkt_desc_dqo){ +			.buf_addr = cpu_to_le64(addr), +			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO, +			.end_of_packet = cur_eop, +			.checksum_offload_enable = checksum_offload_en, +			.compl_tag = cpu_to_le16(compl_tag), +			.buf_size = cur_len, +		}; + +		addr += cur_len; +		len -= cur_len; +		*desc_idx = (*desc_idx + 1) & tx->mask; +	} +} + +/* Validates and prepares `skb` for TSO. + * + * Returns header length, or < 0 if invalid. + */ +static int gve_prep_tso(struct sk_buff *skb) +{ +	struct tcphdr *tcp; +	int header_len; +	u32 paylen; +	int err; + +	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length +	 * of the TSO to be <= 262143. +	 * +	 * However, we don't validate these because: +	 * - Hypervisor enforces a limit of 9K MTU +	 * - Kernel will not produce a TSO larger than 64k +	 */ + +	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) +		return -1; + +	/* Needed because we will modify header. */ +	err = skb_cow_head(skb, 0); +	if (err < 0) +		return err; + +	tcp = tcp_hdr(skb); + +	/* Remove payload length from checksum. */ +	paylen = skb->len - skb_transport_offset(skb); + +	switch (skb_shinfo(skb)->gso_type) { +	case SKB_GSO_TCPV4: +	case SKB_GSO_TCPV6: +		csum_replace_by_diff(&tcp->check, +				     (__force __wsum)htonl(paylen)); + +		/* Compute length of segmentation header. 
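+		 * This covers everything up to and including the TCP header, which is replicated on each segment.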
*/ +		header_len = skb_tcp_all_headers(skb); +		break; +	default: +		return -EINVAL; +	} + +	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) +		return -EINVAL; + +	return header_len; +} + +static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, +				     const struct sk_buff *skb, +				     const struct gve_tx_metadata_dqo *metadata, +				     int header_len) +{ +	*desc = (struct gve_tx_tso_context_desc_dqo){ +		.header_len = header_len, +		.cmd_dtype = { +			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, +			.tso = 1, +		}, +		.flex0 = metadata->bytes[0], +		.flex5 = metadata->bytes[5], +		.flex6 = metadata->bytes[6], +		.flex7 = metadata->bytes[7], +		.flex8 = metadata->bytes[8], +		.flex9 = metadata->bytes[9], +		.flex10 = metadata->bytes[10], +		.flex11 = metadata->bytes[11], +	}; +	desc->tso_total_len = skb->len - header_len; +	desc->mss = skb_shinfo(skb)->gso_size; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, +			     const struct gve_tx_metadata_dqo *metadata) +{ +	*desc = (struct gve_tx_general_context_desc_dqo){ +		.flex0 = metadata->bytes[0], +		.flex1 = metadata->bytes[1], +		.flex2 = metadata->bytes[2], +		.flex3 = metadata->bytes[3], +		.flex4 = metadata->bytes[4], +		.flex5 = metadata->bytes[5], +		.flex6 = metadata->bytes[6], +		.flex7 = metadata->bytes[7], +		.flex8 = metadata->bytes[8], +		.flex9 = metadata->bytes[9], +		.flex10 = metadata->bytes[10], +		.flex11 = metadata->bytes[11], +		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, +	}; +} + +/* Returns 0 on success, or < 0 on error. + * + * Before this function is called, the caller must ensure + * gve_has_pending_packet(tx) returns true. + */ +static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, +				      struct sk_buff *skb) +{ +	const struct skb_shared_info *shinfo = skb_shinfo(skb); +	const bool is_gso = skb_is_gso(skb); +	u32 desc_idx = tx->dqo_tx.tail; + +	struct gve_tx_pending_packet_dqo *pkt; +	struct gve_tx_metadata_dqo metadata; +	s16 completion_tag; +	int i; + +	pkt = gve_alloc_pending_packet(tx); +	pkt->skb = skb; +	pkt->num_bufs = 0; +	completion_tag = pkt - tx->dqo.pending_packets; + +	gve_extract_tx_metadata_dqo(skb, &metadata); +	if (is_gso) { +		int header_len = gve_prep_tso(skb); + +		if (unlikely(header_len < 0)) +			goto err; + +		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx, +					 skb, &metadata, header_len); +		desc_idx = (desc_idx + 1) & tx->mask; +	} + +	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx, +				     &metadata); +	desc_idx = (desc_idx + 1) & tx->mask; + +	/* Note: HW requires that the size of a non-TSO packet be within the +	 * range of [17, 9728]. +	 * +	 * We don't double check because +	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU. +	 * - Hypervisor won't allow MTU larger than 9216. 
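+	 * Together these bounds make skipping the explicit length check safe.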
+	 */ + +	/* Map the linear portion of skb */ +	{ +		u32 len = skb_headlen(skb); +		dma_addr_t addr; + +		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); +		if (unlikely(dma_mapping_error(tx->dev, addr))) +			goto err; + +		dma_unmap_len_set(pkt, len[pkt->num_bufs], len); +		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); +		++pkt->num_bufs; + +		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, +					 completion_tag, +					 /*eop=*/shinfo->nr_frags == 0, is_gso); +	} + +	for (i = 0; i < shinfo->nr_frags; i++) { +		const skb_frag_t *frag = &shinfo->frags[i]; +		bool is_eop = i == (shinfo->nr_frags - 1); +		u32 len = skb_frag_size(frag); +		dma_addr_t addr; + +		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); +		if (unlikely(dma_mapping_error(tx->dev, addr))) +			goto err; + +		dma_unmap_len_set(pkt, len[pkt->num_bufs], len); +		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); +		++pkt->num_bufs; + +		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, +					 completion_tag, is_eop, is_gso); +	} + +	/* Commit the changes to our state */ +	tx->dqo_tx.tail = desc_idx; + +	/* Request a descriptor completion on the last descriptor of the +	 * packet if we are allowed to by the HW enforced interval. +	 */ +	{ +		u32 last_desc_idx = (desc_idx - 1) & tx->mask; +		u32 last_report_event_interval = +			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask; + +		if (unlikely(last_report_event_interval >= +			     GVE_TX_MIN_RE_INTERVAL)) { +			tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true; +			tx->dqo_tx.last_re_idx = last_desc_idx; +		} +	} + +	return 0; + +err: +	for (i = 0; i < pkt->num_bufs; i++) { +		if (i == 0) { +			dma_unmap_single(tx->dev, +					 dma_unmap_addr(pkt, dma[i]), +					 dma_unmap_len(pkt, len[i]), +					 DMA_TO_DEVICE); +		} else { +			dma_unmap_page(tx->dev, +				       dma_unmap_addr(pkt, dma[i]), +				       dma_unmap_len(pkt, len[i]), +				       DMA_TO_DEVICE); +		} +	} + +	pkt->skb = NULL; +	pkt->num_bufs = 0; +	gve_free_pending_packet(tx, pkt); + +	return -1; +} + +static int gve_num_descs_per_buf(size_t size) +{ +	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO); +} + +static int gve_num_buffer_descs_needed(const struct sk_buff *skb) +{ +	const struct skb_shared_info *shinfo = skb_shinfo(skb); +	int num_descs; +	int i; + +	num_descs = gve_num_descs_per_buf(skb_headlen(skb)); + +	for (i = 0; i < shinfo->nr_frags; i++) { +		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]); + +		num_descs += gve_num_descs_per_buf(frag_size); +	} + +	return num_descs; +} + +/* Returns true if HW is capable of sending TSO represented by `skb`. + * + * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. + * - The header is counted as one buffer for every single segment. + * - A buffer which is split between two segments is counted for both. + * - If a buffer contains both header and payload, it is counted as two buffers. 
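+ * For example, a frag that straddles two segments counts once toward each segment's budget.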
+ */ +static bool gve_can_send_tso(const struct sk_buff *skb) +{ +	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; +	const struct skb_shared_info *shinfo = skb_shinfo(skb); +	const int header_len = skb_tcp_all_headers(skb); +	const int gso_size = shinfo->gso_size; +	int cur_seg_num_bufs; +	int cur_seg_size; +	int i; + +	cur_seg_size = skb_headlen(skb) - header_len; +	cur_seg_num_bufs = cur_seg_size > 0; + +	for (i = 0; i < shinfo->nr_frags; i++) { +		if (cur_seg_size >= gso_size) { +			cur_seg_size %= gso_size; +			cur_seg_num_bufs = cur_seg_size > 0; +		} + +		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) +			return false; + +		cur_seg_size += skb_frag_size(&shinfo->frags[i]); +	} + +	return true; +} + +/* Attempt to transmit specified SKB. + * + * Returns 0 if the SKB was transmitted or dropped. + * Returns -1 if there is not currently enough space to transmit the SKB. + */ +static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, +			  struct sk_buff *skb) +{ +	int num_buffer_descs; +	int total_num_descs; + +	if (skb_is_gso(skb)) { +		/* If TSO doesn't meet HW requirements, attempt to linearize the +		 * packet. +		 */ +		if (unlikely(!gve_can_send_tso(skb) && +			     skb_linearize(skb) < 0)) { +			net_err_ratelimited("%s: Failed to transmit TSO packet\n", +					    priv->dev->name); +			goto drop; +		} + +		num_buffer_descs = gve_num_buffer_descs_needed(skb); +	} else { +		num_buffer_descs = gve_num_buffer_descs_needed(skb); + +		if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { +			if (unlikely(skb_linearize(skb) < 0)) +				goto drop; + +			num_buffer_descs = 1; +		} +	} + +	/* Metadata + (optional TSO) + data descriptors. */ +	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs; +	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs + +			GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) { +		return -1; +	} + +	if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0)) +		goto drop; + +	netdev_tx_sent_queue(tx->netdev_txq, skb->len); +	skb_tx_timestamp(skb); +	return 0; + +drop: +	tx->dropped_pkt++; +	dev_kfree_skb_any(skb); +	return 0; +} + +/* Transmit a given skb and ring the doorbell. */ +netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev) +{ +	struct gve_priv *priv = netdev_priv(dev); +	struct gve_tx_ring *tx; + +	tx = &priv->tx[skb_get_queue_mapping(skb)]; +	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { +		/* We need to ring the txq doorbell -- we have stopped the Tx +		 * queue for want of resources, but prior calls to gve_tx() +		 * may have added descriptors without ringing the doorbell. 
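+		 * Ring it here so those earlier descriptors are not left unfetched while the queue is stopped.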
+		 */ +		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); +		return NETDEV_TX_BUSY; +	} + +	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) +		return NETDEV_TX_OK; + +	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); +	return NETDEV_TX_OK; +} + +static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list, +			struct gve_tx_pending_packet_dqo *pending_packet) +{ +	s16 old_tail, index; + +	index = pending_packet - tx->dqo.pending_packets; +	old_tail = list->tail; +	list->tail = index; +	if (old_tail == -1) +		list->head = index; +	else +		tx->dqo.pending_packets[old_tail].next = index; + +	pending_packet->next = -1; +	pending_packet->prev = old_tail; +} + +static void remove_from_list(struct gve_tx_ring *tx, +			     struct gve_index_list *list, +			     struct gve_tx_pending_packet_dqo *pkt) +{ +	s16 prev_index, next_index; + +	prev_index = pkt->prev; +	next_index = pkt->next; + +	if (prev_index == -1) { +		/* Node is head */ +		list->head = next_index; +	} else { +		tx->dqo.pending_packets[prev_index].next = next_index; +	} +	if (next_index == -1) { +		/* Node is tail */ +		list->tail = prev_index; +	} else { +		tx->dqo.pending_packets[next_index].prev = prev_index; +	} +} + +static void gve_unmap_packet(struct device *dev, +			     struct gve_tx_pending_packet_dqo *pkt) +{ +	int i; + +	/* SKB linear portion is guaranteed to be mapped */ +	dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), +			 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); +	for (i = 1; i < pkt->num_bufs; i++) { +		dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]), +			       dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE); +	} +	pkt->num_bufs = 0; +} + +/* Completion types and expected behavior: + * No Miss compl + Packet compl = Packet completed normally. + * Miss compl + Re-inject compl = Packet completed normally. + * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. + * Miss compl + Packet compl = Skipped i.e. packet not completed. + */ +static void gve_handle_packet_completion(struct gve_priv *priv, +					 struct gve_tx_ring *tx, bool is_napi, +					 u16 compl_tag, u64 *bytes, u64 *pkts, +					 bool is_reinjection) +{ +	struct gve_tx_pending_packet_dqo *pending_packet; + +	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { +		net_err_ratelimited("%s: Invalid TX completion tag: %d\n", +				    priv->dev->name, (int)compl_tag); +		return; +	} + +	pending_packet = &tx->dqo.pending_packets[compl_tag]; + +	if (unlikely(is_reinjection)) { +		if (unlikely(pending_packet->state == +			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) { +			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", +					    priv->dev->name, (int)compl_tag); +			/* Packet was already completed as a result of timeout, +			 * so just remove from list and free pending packet. +			 */ +			remove_from_list(tx, +					 &tx->dqo_compl.timed_out_completions, +					 pending_packet); +			gve_free_pending_packet(tx, pending_packet); +			return; +		} +		if (unlikely(pending_packet->state != +			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { +			/* No outstanding miss completion but packet allocated +			 * implies packet receives a re-injection completion +			 * without a prior miss completion. Return without +			 * completing the packet. 
+			 */ +			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n", +					    priv->dev->name, (int)compl_tag); +			return; +		} +		remove_from_list(tx, &tx->dqo_compl.miss_completions, +				 pending_packet); +	} else { +		/* Packet is allocated but not a pending data completion. */ +		if (unlikely(pending_packet->state != +			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) { +			net_err_ratelimited("%s: No pending data completion: %d\n", +					    priv->dev->name, (int)compl_tag); +			return; +		} +	} +	gve_unmap_packet(tx->dev, pending_packet); + +	*bytes += pending_packet->skb->len; +	(*pkts)++; +	napi_consume_skb(pending_packet->skb, is_napi); +	pending_packet->skb = NULL; +	gve_free_pending_packet(tx, pending_packet); +} + +static void gve_handle_miss_completion(struct gve_priv *priv, +				       struct gve_tx_ring *tx, u16 compl_tag, +				       u64 *bytes, u64 *pkts) +{ +	struct gve_tx_pending_packet_dqo *pending_packet; + +	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { +		net_err_ratelimited("%s: Invalid TX completion tag: %d\n", +				    priv->dev->name, (int)compl_tag); +		return; +	} + +	pending_packet = &tx->dqo.pending_packets[compl_tag]; +	if (unlikely(pending_packet->state != +				GVE_PACKET_STATE_PENDING_DATA_COMPL)) { +		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n", +				    priv->dev->name, (int)pending_packet->state, +				    (int)compl_tag); +		return; +	} + +	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL; +	/* jiffies can wraparound but time comparisons can handle overflows. */ +	pending_packet->timeout_jiffies = +			jiffies + +			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT * +					 MSEC_PER_SEC); +	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); + +	*bytes += pending_packet->skb->len; +	(*pkts)++; +} + +static void remove_miss_completions(struct gve_priv *priv, +				    struct gve_tx_ring *tx) +{ +	struct gve_tx_pending_packet_dqo *pending_packet; +	s16 next_index; + +	next_index = tx->dqo_compl.miss_completions.head; +	while (next_index != -1) { +		pending_packet = &tx->dqo.pending_packets[next_index]; +		next_index = pending_packet->next; +		/* Break early because packets should timeout in order. */ +		if (time_is_after_jiffies(pending_packet->timeout_jiffies)) +			break; + +		remove_from_list(tx, &tx->dqo_compl.miss_completions, +				 pending_packet); +		/* Unmap buffers and free skb but do not unallocate packet i.e. +		 * the completion tag is not freed to ensure that the driver +		 * can take appropriate action if a corresponding valid +		 * completion is received later. +		 */ +		gve_unmap_packet(tx->dev, pending_packet); +		/* This indicates the packet was dropped. */ +		dev_kfree_skb_any(pending_packet->skb); +		pending_packet->skb = NULL; +		tx->dropped_pkt++; +		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n", +				    priv->dev->name, +				    (int)(pending_packet - tx->dqo.pending_packets)); + +		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; +		pending_packet->timeout_jiffies = +				jiffies + +				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT * +						 MSEC_PER_SEC); +		/* Maintain pending packet in another list so the packet can be +		 * unallocated at a later time. 
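+		 * remove_timed_out_completions() will release it once its deallocation timeout expires.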
+		 */ +		add_to_list(tx, &tx->dqo_compl.timed_out_completions, +			    pending_packet); +	} +} + +static void remove_timed_out_completions(struct gve_priv *priv, +					 struct gve_tx_ring *tx) +{ +	struct gve_tx_pending_packet_dqo *pending_packet; +	s16 next_index; + +	next_index = tx->dqo_compl.timed_out_completions.head; +	while (next_index != -1) { +		pending_packet = &tx->dqo.pending_packets[next_index]; +		next_index = pending_packet->next; +		/* Break early because packets should timeout in order. */ +		if (time_is_after_jiffies(pending_packet->timeout_jiffies)) +			break; + +		remove_from_list(tx, &tx->dqo_compl.timed_out_completions, +				 pending_packet); +		gve_free_pending_packet(tx, pending_packet); +	} +} + +int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, +			  struct napi_struct *napi) +{ +	u64 reinject_compl_bytes = 0; +	u64 reinject_compl_pkts = 0; +	int num_descs_cleaned = 0; +	u64 miss_compl_bytes = 0; +	u64 miss_compl_pkts = 0; +	u64 pkt_compl_bytes = 0; +	u64 pkt_compl_pkts = 0; + +	/* Limit in order to avoid blocking for too long */ +	while (!napi || pkt_compl_pkts < napi->weight) { +		struct gve_tx_compl_desc *compl_desc = +			&tx->dqo.compl_ring[tx->dqo_compl.head]; +		u16 type; + +		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) +			break; + +		/* Prefetch the next descriptor. */ +		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & +				tx->dqo.complq_mask]); + +		/* Do not read data until we own the descriptor */ +		dma_rmb(); +		type = compl_desc->type; + +		if (type == GVE_COMPL_TYPE_DQO_DESC) { +			/* This is the last descriptor fetched by HW plus one */ +			u16 tx_head = le16_to_cpu(compl_desc->tx_head); + +			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); +		} else if (type == GVE_COMPL_TYPE_DQO_PKT) { +			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + +			gve_handle_packet_completion(priv, tx, !!napi, +						     compl_tag, +						     &pkt_compl_bytes, +						     &pkt_compl_pkts, +						     /*is_reinjection=*/false); +		} else if (type == GVE_COMPL_TYPE_DQO_MISS) { +			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + +			gve_handle_miss_completion(priv, tx, compl_tag, +						   &miss_compl_bytes, +						   &miss_compl_pkts); +		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { +			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + +			gve_handle_packet_completion(priv, tx, !!napi, +						     compl_tag, +						     &reinject_compl_bytes, +						     &reinject_compl_pkts, +						     /*is_reinjection=*/true); +		} + +		tx->dqo_compl.head = +			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask; +		/* Flip the generation bit when we wrap around */ +		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; +		num_descs_cleaned++; +	} + +	netdev_tx_completed_queue(tx->netdev_txq, +				  pkt_compl_pkts + miss_compl_pkts, +				  pkt_compl_bytes + miss_compl_bytes); + +	remove_miss_completions(priv, tx); +	remove_timed_out_completions(priv, tx); + +	u64_stats_update_begin(&tx->statss); +	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; +	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; +	u64_stats_update_end(&tx->statss); +	return num_descs_cleaned; +} + +bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) +{ +	struct gve_tx_compl_desc *compl_desc; +	struct gve_tx_ring *tx = block->tx; +	struct gve_priv *priv = block->priv; + +	if (do_clean) { +		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, +							      &block->napi); + +		/* Sync with queue 
being stopped in `gve_maybe_stop_tx_dqo()` */ +		mb(); + +		if (netif_tx_queue_stopped(tx->netdev_txq) && +		    num_descs_cleaned > 0) { +			tx->wake_queue++; +			netif_tx_wake_queue(tx->netdev_txq); +		} +	} + +	/* Return true if we still have work. */ +	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; +	return compl_desc->generation != tx->dqo_compl.cur_gen_bit; +} diff --git a/drivers/net/ethernet/google/gve/gve_utils.c b/drivers/net/ethernet/google/gve/gve_utils.c new file mode 100644 index 000000000000..d57508bc4307 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_utils.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include "gve_utils.h" + +void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ +	struct gve_notify_block *block = +			&priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; + +	block->tx = NULL; +} + +void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) +{ +	unsigned int active_cpus = min_t(int, priv->num_ntfy_blks / 2, +					 num_online_cpus()); +	int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); +	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; +	struct gve_tx_ring *tx = &priv->tx[queue_idx]; + +	block->tx = tx; +	tx->ntfy_id = ntfy_idx; +	netif_set_xps_queue(priv->dev, get_cpu_mask(ntfy_idx % active_cpus), +			    queue_idx); +} + +void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ +	struct gve_notify_block *block = +			&priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; + +	block->rx = NULL; +} + +void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) +{ +	u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); +	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; +	struct gve_rx_ring *rx = &priv->rx[queue_idx]; + +	block->rx = rx; +	rx->ntfy_id = ntfy_idx; +} + +struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, +			    struct gve_rx_slot_page_info *page_info, u16 len, +			    u16 padding, struct gve_rx_ctx *ctx) +{ +	void *va = page_info->page_address + padding + page_info->page_offset; +	int skb_linear_offset = 0; +	bool set_protocol = false; +	struct sk_buff *skb; + +	if (ctx) { +		if (!ctx->skb_head) +			ctx->skb_head = napi_alloc_skb(napi, ctx->total_expected_size); + +		if (unlikely(!ctx->skb_head)) +			return NULL; +		skb = ctx->skb_head; +		skb_linear_offset = skb->len; +		set_protocol = ctx->curr_frag_cnt == ctx->expected_frag_cnt - 1; +	} else { +		skb = napi_alloc_skb(napi, len); + +		if (unlikely(!skb)) +			return NULL; +		set_protocol = true; +	} +	__skb_put(skb, len); +	skb_copy_to_linear_data_offset(skb, skb_linear_offset, va, len); + +	if (set_protocol) +		skb->protocol = eth_type_trans(skb, dev); + +	return skb; +} + +void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info) +{ +	page_info->pagecnt_bias--; +	if (page_info->pagecnt_bias == 0) { +		int pagecount = page_count(page_info->page); + +		/* If we have run out of bias - set it back up to INT_MAX +		 * minus the existing refs. +		 */ +		page_info->pagecnt_bias = INT_MAX - pagecount; + +		/* Set pagecount back up to max. 
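+		 * page_ref_add() puts back the references that the refreshed bias now accounts for.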
*/ +		page_ref_add(page_info->page, INT_MAX - pagecount); +	} +} diff --git a/drivers/net/ethernet/google/gve/gve_utils.h b/drivers/net/ethernet/google/gve/gve_utils.h new file mode 100644 index 000000000000..6d98e69fd3b8 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_utils.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#ifndef _GVE_UTILS_H +#define _GVE_UTILS_H + +#include <linux/etherdevice.h> + +#include "gve.h" + +void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx); +void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx); + +void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx); +void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx); + +struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, +			    struct gve_rx_slot_page_info *page_info, u16 len, +			    u16 pad, struct gve_rx_ctx *ctx); + +/* Decrement pagecnt_bias. Set it back to INT_MAX if it reached zero. */ +void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info); + +#endif /* _GVE_UTILS_H */ +  | 
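A minimal user-space sketch of the pagecnt_bias scheme used by gve_dec_pagecnt_bias() above. This is an illustration only, not kernel code: fake_page and slot_page_info are stand-ins for struct page and struct gve_rx_slot_page_info. The point is that the hot path only decrements a driver-local bias, and the shared page refcount is touched just once each time the bias runs out.

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for struct page's shared _refcount. */
struct fake_page {
	atomic_int refcount;
};

/* Stand-in for gve_rx_slot_page_info: the page plus the driver's private bias. */
struct slot_page_info {
	struct fake_page *page;
	int pagecnt_bias;	/* references the driver still holds locally */
};

/* Same flow as gve_dec_pagecnt_bias(): consume one local reference and,
 * only when the bias reaches zero, top the shared refcount back up and
 * re-arm the bias to match what was added.
 */
static void dec_pagecnt_bias(struct slot_page_info *info)
{
	info->pagecnt_bias--;
	if (info->pagecnt_bias == 0) {
		int pagecount = atomic_load(&info->page->refcount);

		info->pagecnt_bias = INT_MAX - pagecount;
		atomic_fetch_add(&info->page->refcount, INT_MAX - pagecount);
	}
}

int main(void)
{
	struct fake_page page;
	struct slot_page_info info = { .page = &page, .pagecnt_bias = 1 };

	atomic_init(&page.refcount, 2);	/* e.g. the driver's ref plus one in-flight skb */

	dec_pagecnt_bias(&info);	/* exhausts the bias, so it gets re-armed */
	printf("bias=%d refcount=%d\n", info.pagecnt_bias,
	       atomic_load(&page.refcount));
	return 0;
}

As in the driver, the common case then costs only a local decrement; the atomic refcount update is amortized over roughly INT_MAX buffer uses.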
