aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/hyperv/netvsc_drv.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/hyperv/netvsc_drv.c')
-rw-r--r--drivers/net/hyperv/netvsc_drv.c523
1 files changed, 309 insertions, 214 deletions
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 2c0a24c606fc..89eb4f179a3c 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/atomic.h>
+#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
@@ -37,9 +38,6 @@
#include "hyperv_net.h"
#define RING_SIZE_MIN 64
-#define RETRY_US_LO 5000
-#define RETRY_US_HI 10000
-#define RETRY_MAX 2000 /* >10 sec */
#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)
@@ -244,56 +242,6 @@ static inline void *init_ppi_data(struct rndis_message *msg,
return ppi + 1;
}
-/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
- * packets. We can use ethtool to change UDP hash level when necessary.
- */
-static inline u32 netvsc_get_hash(
- struct sk_buff *skb,
- const struct net_device_context *ndc)
-{
- struct flow_keys flow;
- u32 hash, pkt_proto = 0;
- static u32 hashrnd __read_mostly;
-
- net_get_random_once(&hashrnd, sizeof(hashrnd));
-
- if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
- return 0;
-
- switch (flow.basic.ip_proto) {
- case IPPROTO_TCP:
- if (flow.basic.n_proto == htons(ETH_P_IP))
- pkt_proto = HV_TCP4_L4HASH;
- else if (flow.basic.n_proto == htons(ETH_P_IPV6))
- pkt_proto = HV_TCP6_L4HASH;
-
- break;
-
- case IPPROTO_UDP:
- if (flow.basic.n_proto == htons(ETH_P_IP))
- pkt_proto = HV_UDP4_L4HASH;
- else if (flow.basic.n_proto == htons(ETH_P_IPV6))
- pkt_proto = HV_UDP6_L4HASH;
-
- break;
- }
-
- if (pkt_proto & ndc->l4_hash) {
- return skb_get_hash(skb);
- } else {
- if (flow.basic.n_proto == htons(ETH_P_IP))
- hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
- else if (flow.basic.n_proto == htons(ETH_P_IPV6))
- hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
- else
- return 0;
-
- __skb_set_sw_hash(skb, hash, false);
- }
-
- return hash;
-}
-
static inline int netvsc_get_tx_queue(struct net_device *ndev,
struct sk_buff *skb, int old_idx)
{
@@ -367,38 +315,35 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
}
rcu_read_unlock();
- while (unlikely(txq >= ndev->real_num_tx_queues))
+ while (txq >= ndev->real_num_tx_queues)
txq -= ndev->real_num_tx_queues;
return txq;
}
-static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
+static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len,
struct hv_page_buffer *pb)
{
int j = 0;
- /* Deal with compound pages by ignoring unused part
- * of the page.
- */
- page += (offset >> PAGE_SHIFT);
- offset &= ~PAGE_MASK;
+ hvpfn += offset >> HV_HYP_PAGE_SHIFT;
+ offset = offset & ~HV_HYP_PAGE_MASK;
while (len > 0) {
unsigned long bytes;
- bytes = PAGE_SIZE - offset;
+ bytes = HV_HYP_PAGE_SIZE - offset;
if (bytes > len)
bytes = len;
- pb[j].pfn = page_to_pfn(page);
+ pb[j].pfn = hvpfn;
pb[j].offset = offset;
pb[j].len = bytes;
offset += bytes;
len -= bytes;
- if (offset == PAGE_SIZE && len) {
- page++;
+ if (offset == HV_HYP_PAGE_SIZE && len) {
+ hvpfn++;
offset = 0;
j++;
}
@@ -421,23 +366,26 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
* 2. skb linear data
* 3. skb fragment data
*/
- slots_used += fill_pg_buf(virt_to_page(hdr),
- offset_in_page(hdr),
- len, &pb[slots_used]);
+ slots_used += fill_pg_buf(virt_to_hvpfn(hdr),
+ offset_in_hvpage(hdr),
+ len,
+ &pb[slots_used]);
packet->rmsg_size = len;
packet->rmsg_pgcnt = slots_used;
- slots_used += fill_pg_buf(virt_to_page(data),
- offset_in_page(data),
- skb_headlen(skb), &pb[slots_used]);
+ slots_used += fill_pg_buf(virt_to_hvpfn(data),
+ offset_in_hvpage(data),
+ skb_headlen(skb),
+ &pb[slots_used]);
for (i = 0; i < frags; i++) {
skb_frag_t *frag = skb_shinfo(skb)->frags + i;
- slots_used += fill_pg_buf(skb_frag_page(frag),
- skb_frag_off(frag),
- skb_frag_size(frag), &pb[slots_used]);
+ slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)),
+ skb_frag_off(frag),
+ skb_frag_size(frag),
+ &pb[slots_used]);
}
return slots_used;
}
@@ -453,8 +401,8 @@ static int count_skb_frag_slots(struct sk_buff *skb)
unsigned long offset = skb_frag_off(frag);
/* Skip unused frames from start of page */
- offset &= ~PAGE_MASK;
- pages += PFN_UP(offset + size);
+ offset &= ~HV_HYP_PAGE_MASK;
+ pages += HVPFN_UP(offset + size);
}
return pages;
}
@@ -462,12 +410,12 @@ static int count_skb_frag_slots(struct sk_buff *skb)
static int netvsc_get_slots(struct sk_buff *skb)
{
char *data = skb->data;
- unsigned int offset = offset_in_page(data);
+ unsigned int offset = offset_in_hvpage(data);
unsigned int len = skb_headlen(skb);
int slots;
int frag_slots;
- slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ slots = DIV_ROUND_UP(offset + len, HV_HYP_PAGE_SIZE);
frag_slots = count_skb_frag_slots(skb);
return slots + frag_slots;
}
@@ -502,7 +450,7 @@ static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
int rc;
skb->dev = vf_netdev;
- skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+ skb_record_rx_queue(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
rc = dev_queue_xmit(skb);
if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
@@ -532,12 +480,14 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
u32 hash;
struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];
- /* if VF is present and up then redirect packets
- * already called with rcu_read_lock_bh
+ /* If VF is present and up then redirect packets to it.
+ * Skip the VF if it is marked down or has no carrier.
+ * If netpoll is in uses, then VF can not be used either.
*/
vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
if (vf_netdev && netif_running(vf_netdev) &&
- !netpoll_tx_running(net))
+ netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net) &&
+ net_device_ctx->data_path_is_vf)
return netvsc_vf_xmit(net, vf_netdev, skb);
/* We will atmost need two pages to describe the rndis
@@ -605,6 +555,29 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
*hash_info = hash;
}
+ /* When using AF_PACKET we need to drop VLAN header from
+ * the frame and update the SKB to allow the HOST OS
+ * to transmit the 802.1Q packet
+ */
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ u16 vlan_tci;
+
+ skb_reset_mac_header(skb);
+ if (eth_type_vlan(eth_hdr(skb)->h_proto)) {
+ if (unlikely(__skb_vlan_pop(skb, &vlan_tci) != 0)) {
+ ++net_device_ctx->eth_stats.vlan_error;
+ goto drop;
+ }
+
+ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci);
+ /* Update the NDIS header pkt lengths */
+ packet->total_data_buflen -= VLAN_HLEN;
+ packet->total_bytes -= VLAN_HLEN;
+ rndis_msg->msg_len = packet->total_data_buflen;
+ rndis_msg->msg.pkt.data_len = packet->total_data_buflen;
+ }
+ }
+
if (skb_vlan_tag_present(skb)) {
struct ndis_pkt_8021q_info *vlan;
@@ -638,10 +611,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
} else {
lso_info->lso_v2_transmit.ip_version =
NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
- ipv6_hdr(skb)->payload_len = 0;
- tcp_hdr(skb)->check =
- ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
+ tcp_v6_gso_csum_prep(skb);
}
lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
@@ -710,7 +680,8 @@ no_memory:
goto drop;
}
-static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+static netdev_tx_t netvsc_start_xmit(struct sk_buff *skb,
+ struct net_device *ndev)
{
return netvsc_xmit(skb, ndev, false);
}
@@ -719,19 +690,44 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *ndev)
* netvsc_linkstatus_callback - Link up/down notification
*/
void netvsc_linkstatus_callback(struct net_device *net,
- struct rndis_message *resp)
+ struct rndis_message *resp,
+ void *data, u32 data_buflen)
{
struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
struct net_device_context *ndev_ctx = netdev_priv(net);
struct netvsc_reconfig *event;
unsigned long flags;
+ /* Ensure the packet is big enough to access its fields */
+ if (resp->msg_len - RNDIS_HEADER_SIZE < sizeof(struct rndis_indicate_status)) {
+ netdev_err(net, "invalid rndis_indicate_status packet, len: %u\n",
+ resp->msg_len);
+ return;
+ }
+
+ /* Copy the RNDIS indicate status into nvchan->recv_buf */
+ memcpy(indicate, data + RNDIS_HEADER_SIZE, sizeof(*indicate));
+
/* Update the physical link speed when changing to another vSwitch */
if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
u32 speed;
- speed = *(u32 *)((void *)indicate
- + indicate->status_buf_offset) / 10000;
+ /* Validate status_buf_offset and status_buflen.
+ *
+ * Certain (pre-Fe) implementations of Hyper-V's vSwitch didn't account
+ * for the status buffer field in resp->msg_len; perform the validation
+ * using data_buflen (>= resp->msg_len).
+ */
+ if (indicate->status_buflen < sizeof(speed) ||
+ indicate->status_buf_offset < sizeof(*indicate) ||
+ data_buflen - RNDIS_HEADER_SIZE < indicate->status_buf_offset ||
+ data_buflen - RNDIS_HEADER_SIZE - indicate->status_buf_offset
+ < indicate->status_buflen) {
+ netdev_err(net, "invalid rndis_indicate_status packet\n");
+ return;
+ }
+
+ speed = *(u32 *)(data + RNDIS_HEADER_SIZE + indicate->status_buf_offset) / 10000;
ndev_ctx->speed = speed;
return;
}
@@ -757,7 +753,8 @@ void netvsc_linkstatus_callback(struct net_device *net,
schedule_delayed_work(&ndev_ctx->dwork, 0);
}
-static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
+/* This function should only be called after skb_record_rx_queue() */
+void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
{
int rc;
@@ -786,10 +783,11 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
struct xdp_buff *xdp)
{
struct napi_struct *napi = &nvchan->napi;
- const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan;
+ const struct ndis_pkt_8021q_info *vlan = &nvchan->rsc.vlan;
const struct ndis_tcp_ip_checksum_info *csum_info =
- nvchan->rsc.csum_info;
- const u32 *hash_info = nvchan->rsc.hash_info;
+ &nvchan->rsc.csum_info;
+ const u32 *hash_info = &nvchan->rsc.hash_info;
+ u8 ppi_flags = nvchan->rsc.ppi_flags;
struct sk_buff *skb;
void *xbuf = xdp->data_hard_start;
int i;
@@ -797,7 +795,7 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
if (xbuf) {
unsigned int hdroom = xdp->data - xdp->data_hard_start;
unsigned int xlen = xdp->data_end - xdp->data;
- unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen);
+ unsigned int frag_size = xdp->frame_sz;
skb = build_skb(xbuf, frag_size);
@@ -833,22 +831,28 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
* We compute it here if the flags are set, because on Linux, the IP
* checksum is always checked.
*/
- if (csum_info && csum_info->receive.ip_checksum_value_invalid &&
+ if ((ppi_flags & NVSC_RSC_CSUM_INFO) && csum_info->receive.ip_checksum_value_invalid &&
csum_info->receive.ip_checksum_succeeded &&
- skb->protocol == htons(ETH_P_IP))
+ skb->protocol == htons(ETH_P_IP)) {
+ /* Check that there is enough space to hold the IP header. */
+ if (skb_headlen(skb) < sizeof(struct iphdr)) {
+ kfree_skb(skb);
+ return NULL;
+ }
netvsc_comp_ipcsum(skb);
+ }
/* Do L4 checksum offload if enabled and present. */
- if (csum_info && (net->features & NETIF_F_RXCSUM)) {
+ if ((ppi_flags & NVSC_RSC_CSUM_INFO) && (net->features & NETIF_F_RXCSUM)) {
if (csum_info->receive.tcp_checksum_succeeded ||
csum_info->receive.udp_checksum_succeeded)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- if (hash_info && (net->features & NETIF_F_RXHASH))
+ if ((ppi_flags & NVSC_RSC_HASH_INFO) && (net->features & NETIF_F_RXHASH))
skb_set_hash(skb, *hash_info, PKT_HASH_TYPE_L4);
- if (vlan) {
+ if (ppi_flags & NVSC_RSC_VLAN) {
u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT) |
(vlan->cfi ? VLAN_CFI_MASK : 0);
@@ -871,7 +875,7 @@ int netvsc_recv_callback(struct net_device *net,
struct vmbus_channel *channel = nvchan->channel;
u16 q_idx = channel->offermsg.offer.sub_channel_index;
struct sk_buff *skb;
- struct netvsc_stats *rx_stats = &nvchan->rx_stats;
+ struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
struct xdp_buff xdp;
u32 act;
@@ -880,6 +884,9 @@ int netvsc_recv_callback(struct net_device *net,
act = netvsc_run_xdp(net, nvchan, &xdp);
+ if (act == XDP_REDIRECT)
+ return NVSP_STAT_SUCCESS;
+
if (act != XDP_PASS && act != XDP_TX) {
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->xdp_drop++;
@@ -904,6 +911,9 @@ int netvsc_recv_callback(struct net_device *net,
* statistics will not work correctly.
*/
u64_stats_update_begin(&rx_stats->syncp);
+ if (act == XDP_TX)
+ rx_stats->xdp_tx++;
+
rx_stats->packets++;
rx_stats->bytes += nvchan->rsc.pktlen;
@@ -925,8 +935,8 @@ int netvsc_recv_callback(struct net_device *net,
static void netvsc_get_drvinfo(struct net_device *net,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
- strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
}
static void netvsc_get_channels(struct net_device *net,
@@ -1143,23 +1153,6 @@ out:
return ret;
}
-static bool
-netvsc_validate_ethtool_ss_cmd(const struct ethtool_link_ksettings *cmd)
-{
- struct ethtool_link_ksettings diff1 = *cmd;
- struct ethtool_link_ksettings diff2 = {};
-
- diff1.base.speed = 0;
- diff1.base.duplex = 0;
- /* advertising and cmd are usually set */
- ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
- diff1.base.cmd = 0;
- /* We set port to PORT_OTHER */
- diff2.base.port = PORT_OTHER;
-
- return !memcmp(&diff1, &diff2, sizeof(diff1));
-}
-
static void netvsc_init_settings(struct net_device *dev)
{
struct net_device_context *ndc = netdev_priv(dev);
@@ -1176,6 +1169,12 @@ static int netvsc_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd)
{
struct net_device_context *ndc = netdev_priv(dev);
+ struct net_device *vf_netdev;
+
+ vf_netdev = rtnl_dereference(ndc->vf_netdev);
+
+ if (vf_netdev)
+ return __ethtool_get_link_ksettings(vf_netdev, cmd);
cmd->base.speed = ndc->speed;
cmd->base.duplex = ndc->duplex;
@@ -1188,18 +1187,18 @@ static int netvsc_set_link_ksettings(struct net_device *dev,
const struct ethtool_link_ksettings *cmd)
{
struct net_device_context *ndc = netdev_priv(dev);
- u32 speed;
+ struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
- speed = cmd->base.speed;
- if (!ethtool_validate_speed(speed) ||
- !ethtool_validate_duplex(cmd->base.duplex) ||
- !netvsc_validate_ethtool_ss_cmd(cmd))
- return -EINVAL;
+ if (vf_netdev) {
+ if (!vf_netdev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
- ndc->speed = speed;
- ndc->duplex = cmd->base.duplex;
+ return vf_netdev->ethtool_ops->set_link_ksettings(vf_netdev,
+ cmd);
+ }
- return 0;
+ return ethtool_virtdev_set_link_ksettings(dev, cmd,
+ &ndc->speed, &ndc->duplex);
}
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
@@ -1310,28 +1309,29 @@ static void netvsc_get_pcpu_stats(struct net_device *net,
/* fetch percpu stats of netvsc */
for (i = 0; i < nvdev->num_chn; i++) {
const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
- const struct netvsc_stats *stats;
+ const struct netvsc_stats_tx *tx_stats;
+ const struct netvsc_stats_rx *rx_stats;
struct netvsc_ethtool_pcpu_stats *this_tot =
&pcpu_tot[nvchan->channel->target_cpu];
u64 packets, bytes;
unsigned int start;
- stats = &nvchan->tx_stats;
+ tx_stats = &nvchan->tx_stats;
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
- packets = stats->packets;
- bytes = stats->bytes;
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+ packets = tx_stats->packets;
+ bytes = tx_stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
this_tot->tx_bytes += bytes;
this_tot->tx_packets += packets;
- stats = &nvchan->rx_stats;
+ rx_stats = &nvchan->rx_stats;
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
- packets = stats->packets;
- bytes = stats->bytes;
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+ packets = rx_stats->packets;
+ bytes = rx_stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
this_tot->rx_bytes += bytes;
this_tot->rx_packets += packets;
@@ -1363,27 +1363,28 @@ static void netvsc_get_stats64(struct net_device *net,
for (i = 0; i < nvdev->num_chn; i++) {
const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
- const struct netvsc_stats *stats;
+ const struct netvsc_stats_tx *tx_stats;
+ const struct netvsc_stats_rx *rx_stats;
u64 packets, bytes, multicast;
unsigned int start;
- stats = &nvchan->tx_stats;
+ tx_stats = &nvchan->tx_stats;
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
- packets = stats->packets;
- bytes = stats->bytes;
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+ packets = tx_stats->packets;
+ bytes = tx_stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
t->tx_bytes += bytes;
t->tx_packets += packets;
- stats = &nvchan->rx_stats;
+ rx_stats = &nvchan->rx_stats;
do {
- start = u64_stats_fetch_begin_irq(&stats->syncp);
- packets = stats->packets;
- bytes = stats->bytes;
- multicast = stats->multicast + stats->broadcast;
- } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+ packets = rx_stats->packets;
+ bytes = rx_stats->bytes;
+ multicast = rx_stats->multicast + rx_stats->broadcast;
+ } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
t->rx_bytes += bytes;
t->rx_packets += packets;
@@ -1440,6 +1441,7 @@ static const struct {
{ "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
+ { "vlan_error", offsetof(struct netvsc_ethtool_stats, vlan_error) },
}, pcpu_stats[] = {
{ "cpu%u_rx_packets",
offsetof(struct netvsc_ethtool_pcpu_stats, rx_packets) },
@@ -1471,8 +1473,8 @@ static const struct {
/* statistics per queue (rx/tx packets/bytes) */
#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))
-/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */
-#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5)
+/* 8 statistics per queue (rx/tx packets/bytes, XDP actions) */
+#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 8)
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
@@ -1499,12 +1501,16 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
const void *nds = &ndc->eth_stats;
- const struct netvsc_stats *qstats;
+ const struct netvsc_stats_tx *tx_stats;
+ const struct netvsc_stats_rx *rx_stats;
struct netvsc_vf_pcpu_stats sum;
struct netvsc_ethtool_pcpu_stats *pcpu_sum;
unsigned int start;
u64 packets, bytes;
u64 xdp_drop;
+ u64 xdp_redirect;
+ u64 xdp_tx;
+ u64 xdp_xmit;
int i, j, cpu;
if (!nvdev)
@@ -1518,31 +1524,40 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
for (j = 0; j < nvdev->num_chn; j++) {
- qstats = &nvdev->chan_table[j].tx_stats;
+ tx_stats = &nvdev->chan_table[j].tx_stats;
do {
- start = u64_stats_fetch_begin_irq(&qstats->syncp);
- packets = qstats->packets;
- bytes = qstats->bytes;
- } while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
+ start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+ packets = tx_stats->packets;
+ bytes = tx_stats->bytes;
+ xdp_xmit = tx_stats->xdp_xmit;
+ } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
data[i++] = packets;
data[i++] = bytes;
+ data[i++] = xdp_xmit;
- qstats = &nvdev->chan_table[j].rx_stats;
+ rx_stats = &nvdev->chan_table[j].rx_stats;
do {
- start = u64_stats_fetch_begin_irq(&qstats->syncp);
- packets = qstats->packets;
- bytes = qstats->bytes;
- xdp_drop = qstats->xdp_drop;
- } while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
+ start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+ packets = rx_stats->packets;
+ bytes = rx_stats->bytes;
+ xdp_drop = rx_stats->xdp_drop;
+ xdp_redirect = rx_stats->xdp_redirect;
+ xdp_tx = rx_stats->xdp_tx;
+ } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
data[i++] = packets;
data[i++] = bytes;
data[i++] = xdp_drop;
+ data[i++] = xdp_redirect;
+ data[i++] = xdp_tx;
}
pcpu_sum = kvmalloc_array(num_possible_cpus(),
sizeof(struct netvsc_ethtool_pcpu_stats),
GFP_KERNEL);
+ if (!pcpu_sum)
+ return;
+
netvsc_get_pcpu_stats(dev, pcpu_sum);
for_each_present_cpu(cpu) {
struct netvsc_ethtool_pcpu_stats *this_sum = &pcpu_sum[cpu];
@@ -1566,34 +1581,26 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
switch (stringset) {
case ETH_SS_STATS:
- for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
- memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
- p += ETH_GSTRING_LEN;
- }
+ for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++)
+ ethtool_sprintf(&p, netvsc_stats[i].name);
- for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
- memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
- p += ETH_GSTRING_LEN;
- }
+ for (i = 0; i < ARRAY_SIZE(vf_stats); i++)
+ ethtool_sprintf(&p, vf_stats[i].name);
for (i = 0; i < nvdev->num_chn; i++) {
- sprintf(p, "tx_queue_%u_packets", i);
- p += ETH_GSTRING_LEN;
- sprintf(p, "tx_queue_%u_bytes", i);
- p += ETH_GSTRING_LEN;
- sprintf(p, "rx_queue_%u_packets", i);
- p += ETH_GSTRING_LEN;
- sprintf(p, "rx_queue_%u_bytes", i);
- p += ETH_GSTRING_LEN;
- sprintf(p, "rx_queue_%u_xdp_drop", i);
- p += ETH_GSTRING_LEN;
+ ethtool_sprintf(&p, "tx_queue_%u_packets", i);
+ ethtool_sprintf(&p, "tx_queue_%u_bytes", i);
+ ethtool_sprintf(&p, "tx_queue_%u_xdp_xmit", i);
+ ethtool_sprintf(&p, "rx_queue_%u_packets", i);
+ ethtool_sprintf(&p, "rx_queue_%u_bytes", i);
+ ethtool_sprintf(&p, "rx_queue_%u_xdp_drop", i);
+ ethtool_sprintf(&p, "rx_queue_%u_xdp_redirect", i);
+ ethtool_sprintf(&p, "rx_queue_%u_xdp_tx", i);
}
for_each_present_cpu(cpu) {
- for (i = 0; i < ARRAY_SIZE(pcpu_stats); i++) {
- sprintf(p, pcpu_stats[i].name, cpu);
- p += ETH_GSTRING_LEN;
- }
+ for (i = 0; i < ARRAY_SIZE(pcpu_stats); i++)
+ ethtool_sprintf(&p, pcpu_stats[i].name, cpu);
}
break;
@@ -1825,7 +1832,9 @@ static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
}
static void netvsc_get_ringparam(struct net_device *ndev,
- struct ethtool_ringparam *ring)
+ struct ethtool_ringparam *ring,
+ struct kernel_ethtool_ringparam *kernel_ring,
+ struct netlink_ext_ack *extack)
{
struct net_device_context *ndevctx = netdev_priv(ndev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
@@ -1837,7 +1846,9 @@ static void netvsc_get_ringparam(struct net_device *ndev,
}
static int netvsc_set_ringparam(struct net_device *ndev,
- struct ethtool_ringparam *ring)
+ struct ethtool_ringparam *ring,
+ struct kernel_ethtool_ringparam *kernel_ring,
+ struct netlink_ext_ack *extack)
{
struct net_device_context *ndevctx = netdev_priv(ndev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
@@ -1947,6 +1958,23 @@ syncvf:
return ret;
}
+static int netvsc_get_regs_len(struct net_device *netdev)
+{
+ return VRSS_SEND_TAB_SIZE * sizeof(u32);
+}
+
+static void netvsc_get_regs(struct net_device *netdev,
+ struct ethtool_regs *regs, void *p)
+{
+ struct net_device_context *ndc = netdev_priv(netdev);
+ u32 *regs_buff = p;
+
+ /* increase the version, if buffer format is changed. */
+ regs->version = 1;
+
+ memcpy(regs_buff, ndc->tx_table, VRSS_SEND_TAB_SIZE * sizeof(u32));
+}
+
static u32 netvsc_get_msglevel(struct net_device *ndev)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
@@ -1963,6 +1991,8 @@ static void netvsc_set_msglevel(struct net_device *ndev, u32 val)
static const struct ethtool_ops ethtool_ops = {
.get_drvinfo = netvsc_get_drvinfo,
+ .get_regs_len = netvsc_get_regs_len,
+ .get_regs = netvsc_get_regs,
.get_msglevel = netvsc_get_msglevel,
.set_msglevel = netvsc_set_msglevel,
.get_link = ethtool_op_get_link,
@@ -1998,6 +2028,7 @@ static const struct net_device_ops device_ops = {
.ndo_select_queue = netvsc_select_queue,
.ndo_get_stats64 = netvsc_get_stats64,
.ndo_bpf = netvsc_bpf,
+ .ndo_xdp_xmit = netvsc_ndoxdp_xmit,
};
/*
@@ -2011,11 +2042,11 @@ static void netvsc_link_change(struct work_struct *w)
container_of(w, struct net_device_context, dwork.work);
struct hv_device *device_obj = ndev_ctx->device_ctx;
struct net_device *net = hv_get_drvdata(device_obj);
+ unsigned long flags, next_reconfig, delay;
+ struct netvsc_reconfig *event = NULL;
struct netvsc_device *net_device;
struct rndis_device *rdev;
- struct netvsc_reconfig *event = NULL;
- bool notify = false, reschedule = false;
- unsigned long flags, next_reconfig, delay;
+ bool reschedule = false;
/* if changes are happening, comeback later */
if (!rtnl_trylock()) {
@@ -2064,7 +2095,7 @@ static void netvsc_link_change(struct work_struct *w)
netif_carrier_on(net);
netvsc_tx_enable(net_device, net);
} else {
- notify = true;
+ __netdev_notify_peers(net);
}
kfree(event);
break;
@@ -2093,9 +2124,6 @@ static void netvsc_link_change(struct work_struct *w)
rtnl_unlock();
- if (notify)
- netdev_notify_peers(net);
-
/* link_watch only sends one notification with current state per
* second, handle next reconfig event in 2 seconds.
*/
@@ -2249,6 +2277,7 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
{
struct device *parent = vf_netdev->dev.parent;
struct net_device_context *ndev_ctx;
+ struct net_device *ndev;
struct pci_dev *pdev;
u32 serial;
@@ -2271,8 +2300,29 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
if (!ndev_ctx->vf_alloc)
continue;
- if (ndev_ctx->vf_serial == serial)
- return hv_get_drvdata(ndev_ctx->device_ctx);
+ if (ndev_ctx->vf_serial != serial)
+ continue;
+
+ ndev = hv_get_drvdata(ndev_ctx->device_ctx);
+ if (ndev->addr_len != vf_netdev->addr_len ||
+ memcmp(ndev->perm_addr, vf_netdev->perm_addr,
+ ndev->addr_len) != 0)
+ continue;
+
+ return ndev;
+
+ }
+
+ /* Fallback path to check synthetic vf with
+ * help of mac addr
+ */
+ list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
+ ndev = hv_get_drvdata(ndev_ctx->device_ctx);
+ if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) {
+ netdev_notice(vf_netdev,
+ "falling back to mac addr based matching\n");
+ return ndev;
+ }
}
netdev_notice(vf_netdev,
@@ -2326,6 +2376,9 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
dev_hold(vf_netdev);
rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
+ if (ndev->needed_headroom < vf_netdev->needed_headroom)
+ ndev->needed_headroom = vf_netdev->needed_headroom;
+
vf_netdev->wanted_features = ndev->features;
netdev_update_features(vf_netdev);
@@ -2335,13 +2388,26 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
return NOTIFY_OK;
}
-/* VF up/down change detected, schedule to change data path */
-static int netvsc_vf_changed(struct net_device *vf_netdev)
+/* Change the data path when VF UP/DOWN/CHANGE are detected.
+ *
+ * Typically a UP or DOWN event is followed by a CHANGE event, so
+ * net_device_ctx->data_path_is_vf is used to cache the current data path
+ * to avoid the duplicate call of netvsc_switch_datapath() and the duplicate
+ * message.
+ *
+ * During hibernation, if a VF NIC driver (e.g. mlx5) preserves the network
+ * interface, there is only the CHANGE event and no UP or DOWN event.
+ */
+static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event)
{
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
struct net_device *ndev;
- bool vf_is_up = netif_running(vf_netdev);
+ bool vf_is_up = false;
+ int ret;
+
+ if (event != NETDEV_GOING_DOWN)
+ vf_is_up = netif_running(vf_netdev);
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
@@ -2352,9 +2418,25 @@ static int netvsc_vf_changed(struct net_device *vf_netdev)
if (!netvsc_dev)
return NOTIFY_DONE;
- netvsc_switch_datapath(ndev, vf_is_up);
- netdev_info(ndev, "Data path switched %s VF: %s\n",
- vf_is_up ? "to" : "from", vf_netdev->name);
+ if (net_device_ctx->data_path_is_vf == vf_is_up)
+ return NOTIFY_OK;
+
+ if (vf_is_up && !net_device_ctx->vf_alloc) {
+ netdev_info(ndev, "Waiting for the VF association from host\n");
+ wait_for_completion(&net_device_ctx->vf_add);
+ }
+
+ ret = netvsc_switch_datapath(ndev, vf_is_up);
+
+ if (ret) {
+ netdev_err(ndev,
+ "Data path failed to switch %s VF: %s, err: %d\n",
+ vf_is_up ? "to" : "from", vf_netdev->name, ret);
+ return NOTIFY_DONE;
+ } else {
+ netdev_info(ndev, "Data path switched %s VF: %s\n",
+ vf_is_up ? "to" : "from", vf_netdev->name);
+ }
return NOTIFY_OK;
}
@@ -2375,11 +2457,14 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
netvsc_vf_setxdp(vf_netdev, NULL);
+ reinit_completion(&net_device_ctx->vf_add);
netdev_rx_handler_unregister(vf_netdev);
netdev_upper_dev_unlink(vf_netdev, ndev);
RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
dev_put(vf_netdev);
+ ndev->needed_headroom = RNDIS_AND_PPI_SIZE;
+
return NOTIFY_OK;
}
@@ -2412,6 +2497,7 @@ static int netvsc_probe(struct hv_device *dev,
INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
+ init_completion(&net_device_ctx->vf_add);
spin_lock_init(&net_device_ctx->lock);
INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
@@ -2424,6 +2510,7 @@ static int netvsc_probe(struct hv_device *dev,
net->netdev_ops = &device_ops;
net->ethtool_ops = &ethtool_ops;
SET_NETDEV_DEV(net, &dev->device);
+ dma_set_min_align_mask(&dev->device, HV_HYP_PAGE_SIZE - 1);
/* We always need headroom for rndis header */
net->needed_headroom = RNDIS_AND_PPI_SIZE;
@@ -2449,7 +2536,7 @@ static int netvsc_probe(struct hv_device *dev,
goto rndis_failed;
}
- memcpy(net->dev_addr, device_info->mac_adr, ETH_ALEN);
+ eth_hw_addr_set(net, device_info->mac_adr);
/* We must get rtnl lock before scheduling nvdev->subchan_work,
* otherwise netvsc_subchan_work() can get rtnl lock first and wait
@@ -2470,6 +2557,8 @@ static int netvsc_probe(struct hv_device *dev,
NETIF_F_HW_VLAN_CTAG_RX;
net->vlan_features = net->features;
+ netdev_lockdep_set_classes(net);
+
/* MTU range: 68 - 1500 or 65521 */
net->min_mtu = NETVSC_MTU_MIN;
if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
@@ -2554,8 +2643,8 @@ static int netvsc_remove(struct hv_device *dev)
static int netvsc_suspend(struct hv_device *dev)
{
struct net_device_context *ndev_ctx;
- struct net_device *vf_netdev, *net;
struct netvsc_device *nvdev;
+ struct net_device *net;
int ret;
net = hv_get_drvdata(dev);
@@ -2571,13 +2660,12 @@ static int netvsc_suspend(struct hv_device *dev)
goto out;
}
- vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
- if (vf_netdev)
- netvsc_unregister_vf(vf_netdev);
-
/* Save the current config info */
ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
-
+ if (!ndev_ctx->saved_netvsc_dev_info) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = netvsc_detach(net, nvdev);
out:
rtnl_unlock();
@@ -2595,6 +2683,12 @@ static int netvsc_resume(struct hv_device *dev)
rtnl_lock();
net_device_ctx = netdev_priv(net);
+
+ /* Reset the data path to the netvsc NIC before re-opening the vmbus
+ * channel. Later netvsc_netdev_event() will switch the data path to
+ * the VF upon the UP or CHANGE event.
+ */
+ net_device_ctx->data_path_is_vf = false;
device_info = net_device_ctx->saved_netvsc_dev_info;
ret = netvsc_attach(net, device_info);
@@ -2651,8 +2745,7 @@ static int netvsc_netdev_event(struct notifier_block *this,
return NOTIFY_DONE;
/* Avoid Bonding master dev with same MAC registering as VF */
- if ((event_dev->priv_flags & IFF_BONDING) &&
- (event_dev->flags & IFF_MASTER))
+ if (netif_is_bond_master(event_dev))
return NOTIFY_DONE;
switch (event) {
@@ -2662,7 +2755,9 @@ static int netvsc_netdev_event(struct notifier_block *this,
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
case NETDEV_DOWN:
- return netvsc_vf_changed(event_dev);
+ case NETDEV_CHANGE:
+ case NETDEV_GOING_DOWN:
+ return netvsc_vf_changed(event_dev, event);
default:
return NOTIFY_DONE;
}