aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c1
-rw-r--r--net/8021q/vlan.h12
-rw-r--r--net/8021q/vlan_core.c29
-rw-r--r--net/8021q/vlan_dev.c2
-rw-r--r--net/bridge/br_netfilter_hooks.c15
-rw-r--r--net/bridge/br_private.h2
-rw-r--r--net/bridge/br_vlan.c6
-rw-r--r--net/core/datagram.c43
-rw-r--r--net/core/dev.c8
-rw-r--r--net/core/dev_addr_lists.c97
-rw-r--r--net/core/flow_dissector.c3
-rw-r--r--net/core/rtnetlink.c24
-rw-r--r--net/core/skbuff.c45
-rw-r--r--net/core/sock.c8
-rw-r--r--net/dccp/ipv4.c13
-rw-r--r--net/dccp/ipv6.c13
-rw-r--r--net/dccp/proto.c2
-rw-r--r--net/ipv4/af_inet.c4
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fou.c68
-rw-r--r--net/ipv4/gre_demux.c9
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/inet_hashtables.c34
-rw-r--r--net/ipv4/ip_gre.c50
-rw-r--r--net/ipv4/ip_input.c73
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ipip.c14
-rw-r--r--net/ipv4/metrics.c26
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c31
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp_bbr.c15
-rw-r--r--net/ipv4/tcp_input.c7
-rw-r--r--net/ipv4/tcp_ipv4.c27
-rw-r--r--net/ipv4/tcp_output.c23
-rw-r--r--net/ipv4/tunnel4.c18
-rw-r--r--net/ipv4/udp.c190
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/udp_offload.c109
-rw-r--r--net/ipv4/udp_tunnel.c1
-rw-r--r--net/ipv4/udplite.c4
-rw-r--r--net/ipv4/xfrm4_protocol.c18
-rw-r--r--net/ipv6/anycast.c6
-rw-r--r--net/ipv6/datagram.c10
-rw-r--r--net/ipv6/fou6.c74
-rw-r--r--net/ipv6/icmp.c4
-rw-r--r--net/ipv6/inet6_hashtables.c14
-rw-r--r--net/ipv6/ip6_gre.c18
-rw-r--r--net/ipv6/ip6_input.c63
-rw-r--r--net/ipv6/ip6_offload.c13
-rw-r--r--net/ipv6/ipv6_sockglue.c2
-rw-r--r--net/ipv6/raw.c5
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/tcp_ipv6.c13
-rw-r--r--net/ipv6/tunnel6.c12
-rw-r--r--net/ipv6/udp.c241
-rw-r--r--net/ipv6/udp_impl.h4
-rw-r--r--net/ipv6/udp_offload.c6
-rw-r--r--net/ipv6/udplite.c5
-rw-r--r--net/ipv6/xfrm6_protocol.c18
-rw-r--r--net/iucv/af_iucv.c41
-rw-r--r--net/netfilter/nfnetlink_queue.c5
-rw-r--r--net/openvswitch/actions.c13
-rw-r--r--net/openvswitch/flow.c6
-rw-r--r--net/openvswitch/flow.h2
-rw-r--r--net/openvswitch/flow_netlink.c22
-rw-r--r--net/openvswitch/vport-netdev.c1
-rw-r--r--net/sched/act_vlan.c2
-rw-r--r--net/sched/cls_api.c256
-rw-r--r--net/sched/sch_api.c78
-rw-r--r--net/sched/sch_fq.c21
-rw-r--r--net/sched/sch_mq.c9
-rw-r--r--net/sched/sch_prio.c47
-rw-r--r--net/sched/sch_red.c30
-rw-r--r--net/sctp/input.c5
-rw-r--r--net/sctp/ipv6.c7
-rw-r--r--net/sctp/stream_interleave.c34
-rw-r--r--net/tipc/link.c59
-rw-r--r--net/tipc/msg.h1
80 files changed, 1631 insertions, 603 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5e9950453955..1b7a375c6616 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -330,6 +330,7 @@ static void vlan_transfer_features(struct net_device *dev,
vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
+ vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev);
netdev_update_features(vlandev);
}
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 44df1c3df02d..c46daf09a501 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -92,6 +92,18 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
return NULL;
}
+static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
+{
+ netdev_features_t ret;
+
+ ret = real_dev->hw_enc_features &
+ (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL);
+
+ if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK))
+ return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
+ return 0;
+}
+
#define vlan_group_for_each_dev(grp, i, dev) \
for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \
if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 4f60e86f4b8d..57425049faf2 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -57,7 +57,7 @@ bool vlan_do_receive(struct sk_buff **skbp)
}
skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
@@ -223,6 +223,33 @@ static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vi
return -ENODEV;
}
+int vlan_for_each(struct net_device *dev,
+ int (*action)(struct net_device *dev, int vid, void *arg),
+ void *arg)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+ struct net_device *vdev;
+ int ret;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return 0;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto,
+ vid_info->vid);
+ ret = action(vdev, vid_info->vid, arg);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(vlan_for_each);
+
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
{
struct net_device *real_dev = vlan_info->real_dev;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index ff720f1ebf73..b2d9c8f27cd7 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -562,6 +562,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL |
NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
NETIF_F_ALL_FCOE;
@@ -572,6 +573,7 @@ static int vlan_dev_init(struct net_device *dev)
netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
+ dev->hw_enc_features = vlan_tnl_features(real_dev);
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index b1b5e8516724..c9383c470a83 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -671,10 +671,8 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff
return 0;
}
- if (data->vlan_tci) {
- skb->vlan_tci = data->vlan_tci;
- skb->vlan_proto = data->vlan_proto;
- }
+ if (data->vlan_proto)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
__skb_push(skb, data->encap_size);
@@ -740,8 +738,13 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
data = this_cpu_ptr(&brnf_frag_data_storage);
- data->vlan_tci = skb->vlan_tci;
- data->vlan_proto = skb->vlan_proto;
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_proto = 0;
+ }
+
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2920e06a5403..67105c66584a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -905,7 +905,7 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
int err = 0;
if (skb_vlan_tag_present(skb)) {
- *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK;
+ *vid = skb_vlan_tag_get_id(skb);
} else {
*vid = 0;
err = -EINVAL;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8c9297a01947..a7e869da21bf 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -420,7 +420,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
}
if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
if (p && (p->flags & BR_VLAN_TUNNEL) &&
br_handle_egress_vlan_tunnel(skb, v)) {
@@ -493,8 +493,8 @@ static bool __allowed_ingress(const struct net_bridge *br,
__vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid);
else
/* Priority-tagged Frame.
- * At this point, We know that skb->vlan_tci had
- * VLAN_TAG_PRESENT bit and its VID field was 0x000.
+ * At this point, we know that skb->vlan_tci VID
+ * field was 0.
* We update only VID field and preserve PCP field.
*/
skb->vlan_tci |= pvid;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 57f3a6fcfc1e..07983b90d2bd 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -728,49 +728,6 @@ fault:
return -EFAULT;
}
-__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
-{
- __sum16 sum;
-
- sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
- if (!skb_shared(skb))
- skb->csum_valid = !sum;
- return sum;
-}
-EXPORT_SYMBOL(__skb_checksum_complete_head);
-
-__sum16 __skb_checksum_complete(struct sk_buff *skb)
-{
- __wsum csum;
- __sum16 sum;
-
- csum = skb_checksum(skb, 0, skb->len, 0);
-
- /* skb->csum holds pseudo checksum */
- sum = csum_fold(csum_add(skb->csum, csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
-
- if (!skb_shared(skb)) {
- /* Save full packet checksum */
- skb->csum = csum;
- skb->ip_summed = CHECKSUM_COMPLETE;
- skb->csum_complete_sw = 1;
- skb->csum_valid = !sum;
- }
-
- return sum;
-}
-EXPORT_SYMBOL(__skb_checksum_complete);
-
/**
* skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
* @skb: skbuff
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ffcbdd55fa9..bf7e0a471186 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4889,7 +4889,7 @@ skip_classify:
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
@@ -5386,7 +5386,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
}
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
+ if (skb_vlan_tag_present(p))
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
@@ -5652,7 +5654,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb->dev = napi->dev;
skb->skb_iif = 0;
skb->encapsulation = 0;
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index d884d8f5f0e5..81a8cd4ea3bd 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
EXPORT_SYMBOL(__hw_addr_sync_dev);
/**
+ * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking
+ * into account references
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address or reference on it should be added
+ * @unsync: function to call if address or some reference on it should removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address or references on it
+ * add/remove notifications. The unsync function may be NULL in which case
+ * the addresses or references on it requiring removal will simply be
+ * removed without any notification to the device. That is responsibility of
+ * the driver to identify and distribute address or references on it between
+ * internal address tables.
+ **/
+int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *,
+ const unsigned char *, int),
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err, ref_cnt;
+
+ /* first go through and flush out any unsynced/stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address is not used */
+ if ((ha->sync_cnt << 1) <= ha->refcount)
+ continue;
+
+ /* if fails defer unsyncing address */
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ if (unsync && unsync(dev, ha->addr, ref_cnt))
+ continue;
+
+ ha->refcount = (ref_cnt << 1) + 1;
+ ha->sync_cnt = ref_cnt;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync updated/new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address added or reused */
+ if ((ha->sync_cnt << 1) >= ha->refcount)
+ continue;
+
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ err = sync(dev, ha->addr, ref_cnt);
+ if (err)
+ return err;
+
+ ha->refcount = ref_cnt << 1;
+ ha->sync_cnt = ref_cnt;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_ref_sync_dev);
+
+/**
+ * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on
+ * it from device
+ * @list: address list to remove synchronized addresses (references on it) from
+ * @dev: device to sync
+ * @unsync: function to call if address and references on it should be removed
+ *
+ * Remove all addresses that were added to the device by
+ * __hw_addr_ref_sync_dev(). This function is intended to be called from the
+ * ndo_stop or ndo_open functions on devices that require explicit address (or
+ * references on it) add/remove notifications. If the unsync function pointer
+ * is NULL then this function can be used to just reset the sync_cnt for the
+ * addresses in the list.
+ **/
+void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr, ha->sync_cnt))
+ continue;
+
+ ha->refcount -= ha->sync_cnt - 1;
+ ha->sync_cnt = 0;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_ref_unsync_dev);
+
+/**
* __hw_addr_unsync_dev - Remove synchronized addresses from device
* @list: address list to remove synchronized addresses from
* @dev: device to sync
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 588f475019d4..2e8d91e54179 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -952,8 +952,7 @@ proto_again:
if (!vlan) {
key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
- key_vlan->vlan_priority =
- (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
+ key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
} else {
key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
VLAN_VID_MASK;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33d9227a8b80..86f2d9cbdae3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2885,9 +2885,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
}
EXPORT_SYMBOL(rtnl_configure_link);
-struct net_device *rtnl_create_link(struct net *net,
- const char *ifname, unsigned char name_assign_type,
- const struct rtnl_link_ops *ops, struct nlattr *tb[])
+struct net_device *rtnl_create_link(struct net *net, const char *ifname,
+ unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
struct net_device *dev;
unsigned int num_tx_queues = 1;
@@ -2903,11 +2905,15 @@ struct net_device *rtnl_create_link(struct net *net,
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
- if (num_tx_queues < 1 || num_tx_queues > 4096)
+ if (num_tx_queues < 1 || num_tx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
return ERR_PTR(-EINVAL);
+ }
- if (num_rx_queues < 1 || num_rx_queues > 4096)
+ if (num_rx_queues < 1 || num_rx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
return ERR_PTR(-EINVAL);
+ }
dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
ops->setup, num_tx_queues, num_rx_queues);
@@ -3048,7 +3054,7 @@ replay:
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
err = nla_parse_nested(attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
- ops->policy, NULL);
+ ops->policy, extack);
if (err < 0)
return err;
data = attr;
@@ -3070,7 +3076,7 @@ replay:
m_ops->slave_maxtype,
linkinfo[IFLA_INFO_SLAVE_DATA],
m_ops->slave_policy,
- NULL);
+ extack);
if (err < 0)
return err;
slave_data = slave_attr;
@@ -3134,6 +3140,7 @@ replay:
goto replay;
}
#endif
+ NL_SET_ERR_MSG(extack, "Unknown device type");
return -EOPNOTSUPP;
}
@@ -3154,6 +3161,7 @@ replay:
link_net = get_net_ns_by_id(dest_net, id);
if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
err = -EINVAL;
goto out;
}
@@ -3163,7 +3171,7 @@ replay:
}
dev = rtnl_create_link(link_net ? : dest_net, ifname,
- name_assign_type, ops, tb);
+ name_assign_type, ops, tb, extack);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b4ee5c8b928f..396fcb3baad0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2645,6 +2645,49 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+ __sum16 sum;
+
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+ if (!skb_shared(skb))
+ skb->csum_valid = !sum;
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ if (!skb_shared(skb)) {
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+ }
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
{
net_warn_ratelimited(
@@ -5123,7 +5166,7 @@ int skb_vlan_pop(struct sk_buff *skb)
int err;
if (likely(skb_vlan_tag_present(skb))) {
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
} else {
if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
diff --git a/net/core/sock.c b/net/core/sock.c
index 080a880a1761..6d7e189e3cd9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
lock_sock(sk);
sk->sk_bound_dev_if = index;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
release_sock(sk);
@@ -950,10 +952,12 @@ set_rcvbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
- else
+ } else if (val != sk->sk_mark) {
sk->sk_mark = val;
+ sk_dst_reset(sk);
+ }
break;
case SO_RXQ_OVFL:
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8e08cea6f178..26a21d97b6b0 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err);
* check at all. A more general error queue to queue errors for later handling
* is probably better.
*/
-static void dccp_v4_err(struct sk_buff *skb, u32 info)
+static int dccp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (struct iphdr *)skb->data;
const u8 offset = iph->ihl << 2;
@@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == DCCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV)
- return dccp_req_err(sk, seq);
+ if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+ dccp_req_err(sk, seq);
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6344f1b18a6a..d5740bad5b18 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
}
-static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == DCCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV)
- return dccp_req_err(sk, seq);
+ if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+ dccp_req_err(sk, seq);
+ return 0;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
@@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 43733accf58e..658cd32bb7b3 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -948,6 +948,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
goto out;
+ sk->sk_max_ack_backlog = backlog;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -960,7 +961,6 @@ int inet_dccp_listen(struct socket *sock, int backlog)
if (err)
goto out;
}
- sk->sk_max_ack_backlog = backlog;
err = 0;
out:
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1fbe2f815474..326c422c22f8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
+ sk->sk_max_ack_backlog = backlog;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog)
goto out;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
- sk->sk_max_ack_backlog = backlog;
err = 0;
out:
@@ -1964,6 +1964,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ raw_init();
+
ping_init();
/*
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5c3937ca6ec..5022bc63863a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
if (!fi)
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
- cfg->fc_mx_len);
+ cfg->fc_mx_len, extack);
if (unlikely(IS_ERR(fi->fib_metrics))) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 500a59906b87..0d0ad19ecb87 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -3,6 +3,7 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -1003,15 +1004,82 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+ const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t optlen;
+ int ret;
+
+ if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+ goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+ case 6:
+ ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+ goto out;
+#endif
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+ ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
static const struct ip_tunnel_encap_ops fou_iptun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou_build_header,
+ .err_handler = gue_err,
};
static const struct ip_tunnel_encap_ops gue_iptun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue_build_header,
+ .err_handler = gue_err,
};
static int ip_tunnel_encap_add_fou_ops(void)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 7efe740c06eb..a4bf22ee3aed 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -151,20 +151,25 @@ drop:
return NET_RX_DROP;
}
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
{
const struct gre_protocol *proto;
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+ int err = 0;
if (ver >= GREPROTO_MAX)
- return;
+ return -EINVAL;
rcu_read_lock();
proto = rcu_dereference(gre_proto[ver]);
if (proto && proto->err_handler)
proto->err_handler(skb, info);
+ else
+ err = -EPROTONOSUPPORT;
rcu_read_unlock();
+
+ return err;
}
static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d832beed6e3a..065997f414e6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1079,7 +1079,7 @@ error:
goto drop;
}
-void icmp_err(struct sk_buff *skb, u32 info)
+int icmp_err(struct sk_buff *skb, u32 info)
{
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
@@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
*/
if (icmph->type != ICMP_ECHOREPLY) {
ping_err(skb, offset, info);
- return;
+ return 0;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+ return 0;
}
/*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 15e7f7915a21..6ea523d71947 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
+ int l3mdev;
+ l3mdev = inet_sk_bound_l3mdev(sk);
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +221,8 @@ other_parity_scan:
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, false))
goto success;
goto next_port;
@@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL;
kuid_t uid = sock_i_uid(sk);
+ int l3mdev;
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
head = inet_csk_find_open_port(sk, &tb, &port);
@@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb)
goto fail_unlock;
tb_found:
@@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 411dd7a90046..13890d5bfc34 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
- const unsigned short snum)
+ const unsigned short snum,
+ int l3mdev)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
+ tb->l3mdev = l3mdev;
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
@@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
+ int l3mdev;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
@@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
return -ENOENT;
}
if (tb->port != port) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
* that the listener socket's icsk_bind_hash is the same
@@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
- tb->port == port)
+ tb->l3mdev == l3mdev && tb->port == port)
break;
}
if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
- sock_net(sk), head, port);
+ sock_net(sk), head, port,
+ l3mdev);
if (!tb) {
spin_unlock(&head->lock);
return -ENOMEM;
@@ -229,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
{
int score = -1;
struct inet_sock *inet = inet_sk(sk);
+ bool dev_match;
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
@@ -239,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+ dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score += 4;
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
@@ -675,6 +679,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 remaining, offset;
int ret, i, low, high;
static u32 hint;
+ int l3mdev;
if (port) {
head = &hinfo->bhash[inet_bhashfn(net, port,
@@ -693,6 +698,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return ret;
}
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
inet_get_local_port_range(net, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
@@ -719,7 +726,8 @@ other_parity_scan:
* the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
@@ -732,7 +740,7 @@ other_parity_scan:
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb) {
spin_unlock_bh(&head->lock);
return -ENOMEM;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..76a9a5f7a40e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;
-static void ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
unsigned int data_len = 0;
struct ip_tunnel *t;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return -ENOENT;
+
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return 0;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
+ return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return 0;
data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
@@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
break;
}
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else if (tpi->proto == htons(ETH_P_ERSPAN) ||
- tpi->proto == htons(ETH_P_ERSPAN2))
- itn = net_generic(net, erspan_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
- iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
- iph->daddr, iph->saddr, tpi->key);
-
- if (!t)
- return;
-
#if IS_ENABLED(CONFIG_IPV6)
if (tpi->proto == htons(ETH_P_IPV6) &&
!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
type, data_len))
- return;
+ return 0;
#endif
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return;
+ return 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ return 0;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static void gre_err(struct sk_buff *skb, u32 info)
@@ -1601,7 +1603,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
memset(&tb, 0, sizeof(tb));
dev = rtnl_create_link(net, name, name_assign_type,
- &ipgre_tap_ops, tb);
+ &ipgre_tap_ops, tb, NULL);
if (IS_ERR(dev))
return dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 35a786c0aaa0..72250b4e466d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
- __skb_pull(skb, skb_network_header_len(skb));
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol;
- const struct net_protocol *ipprot;
- int raw;
+ const struct net_protocol *ipprot;
+ int raw, ret;
- resubmit:
- raw = raw_local_deliver(skb, protocol);
+resubmit:
+ raw = raw_local_deliver(skb, protocol);
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ return;
}
- ret = ipprot->handler(skb);
- if (ret < 0) {
- protocol = -ret;
- goto resubmit;
+ nf_reset(skb);
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
}
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ kfree_skb(skb);
} else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- kfree_skb(skb);
- } else {
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
- consume_skb(skb);
- }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
}
}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index dde671e97829..f45b96d715f0 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
}
skb_clear_hash_if_not_l4(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb_set_queue_mapping(skb, 0);
skb_scrub_packet(skb, xnet);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e65287c27e3d..57c5dd283a2c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
struct ip_tunnel *t;
int err = 0;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!t) {
+ err = -ENOENT;
+ goto out;
+ }
+
switch (type) {
case ICMP_DEST_UNREACH:
switch (code) {
@@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
goto out;
}
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
- if (!t) {
- err = -ENOENT;
- goto out;
- }
-
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 6d218f5a2e71..ca9a5fefdefa 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -6,7 +6,8 @@
#include <net/tcp.h>
static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len, u32 *metrics)
+ int fc_mx_len, u32 *metrics,
+ struct netlink_ext_ack *extack)
{
bool ecn_ca = false;
struct nlattr *nla;
@@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
if (!type)
continue;
- if (type > RTAX_MAX)
+ if (type > RTAX_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid metric type");
return -EINVAL;
+ }
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
+ if (val == TCP_CA_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
return -EINVAL;
+ }
} else {
- if (nla_len(nla) != sizeof(u32))
+ if (nla_len(nla) != sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Invalid attribute in metrics");
return -EINVAL;
+ }
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
val = 65535 - 15;
if (type == RTAX_HOPLIMIT && val > 255)
val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+ NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
return -EINVAL;
+ }
metrics[type - 1] = val;
}
@@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
}
struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len)
+ int fc_mx_len,
+ struct netlink_ext_ack *extack)
{
struct dst_metrics *fib_metrics;
int err;
@@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
if (unlikely(!fib_metrics))
return ERR_PTR(-ENOMEM);
- err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+ err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+ extack);
if (!err) {
refcount_set(&fib_metrics->refcnt, 1);
} else {
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..92d249e053be 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,6 +29,7 @@
#include <net/protocol.h>
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8ca3eb06ba04..fb1f02015a15 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
!(inet->inet_daddr && inet->inet_daddr != raddr) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
@@ -805,7 +804,7 @@ out:
return copied;
}
-static int raw_init(struct sock *sk)
+static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
@@ -970,7 +969,7 @@ struct proto raw_prot = {
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
- .init = raw_init,
+ .init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
@@ -1133,4 +1132,28 @@ void __init raw_proc_exit(void)
{
unregister_pernet_subsys(&raw_net_ops);
}
+
+static void raw_sysctl_init_net(struct net *net)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_raw_l3mdev_accept = 1;
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{
+ raw_sysctl_init_net(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {
+ .init = raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{
+ raw_sysctl_init_net(&init_net);
+ if (register_pernet_subsys(&raw_sysctl_ops))
+ panic("RAW: failed to init sysctl parameters.\n");
+}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 891ed2f91467..ba0fc4b18465 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "raw_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_raw_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "tcp_ecn",
.data = &init_net.ipv4.sysctl_tcp_ecn,
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 9277abdd822a..0f497fc49c3f 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
-/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
static const int bbr_pacing_margin_percent = 1;
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
@@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2868ef28ce52..edaaebfbcd46 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2457,8 +2457,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
- !(flag & FLAG_LOST_RETRANS)) {
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
+ FLAG_RETRANS_DATA_ACKED) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
@@ -3610,7 +3610,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+ FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index de47038afdf0..0952d4b772e7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
*
*/
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq,
- type == ICMP_PARAMETERPROB ||
- type == ICMP_TIME_EXCEEDED ||
- (type == ICMP_DEST_UNREACH &&
- (code == ICMP_NET_UNREACH ||
- code == ICMP_HOST_UNREACH)));
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
skb = tcp_rtx_queue_head(sk);
- BUG_ON(!skb);
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
@@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -2573,8 +2574,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of four TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+ /* Default TSQ limit of 16 TSO segments */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9c34b97d365d..d40d4cc53319 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1907,10 +1907,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited, u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 age, send_win, cong_win, limit, in_flight;
+ u32 send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
+ s64 delta;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
@@ -1919,9 +1920,12 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
goto send_now;
/* Avoid bursty behavior by allowing defer
- * only if the last write was recent.
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
*/
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1944,6 +1948,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
+ /* If this packet won't get more data, do not wait. */
+ if (TCP_SKB_CB(skb)->eor)
+ goto send_now;
+
win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1968,9 +1976,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
+ delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
- if (age < (tp->srtt_us >> 4))
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
goto send_now;
/* Ok, it looks like it is advisable to defer. */
@@ -2212,8 +2220,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(unsigned long, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ limit = min_t(unsigned long, limit,
+ sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..33bf8e9c8663 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -149,34 +149,40 @@ drop:
}
#endif
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1976fddb9e00..6f8890c5bc7e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -115,6 +116,7 @@
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -371,6 +373,7 @@ static int compute_score(struct sock *sk, struct net *net,
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -398,15 +401,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score += 4;
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -585,6 +584,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+void udp_encap_enable(void)
+{
+ static_branch_enable(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, u32 info);
+
+ if (!iptun_encaps[i])
+ continue;
+ handler = rcu_dereference(iptun_encaps[i]->err_handler);
+ if (handler && !handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+ const struct iphdr *iph,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sk_buff *skb, u32 info)
+{
+ int network_offset, transport_offset;
+ struct sock *sk;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv4 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, iph->ihl << 2);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+ iph->saddr, uh->dest, skb->dev->ifindex, 0,
+ udptable, NULL);
+ if (sk) {
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ struct udp_sock *up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+ if (!sk)
+ sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -596,13 +678,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
* to find the appropriate port.
*/
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
@@ -612,8 +695,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex,
inet_sdif(skb), udptable, NULL);
if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return; /* No socket for error */
+ /* No socket for error: try tunnels before discarding */
+ sk = ERR_PTR(-ENOENT);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
+ info);
+ if (!sk)
+ return 0;
+ }
+
+ if (IS_ERR(sk)) {
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
err = 0;
@@ -656,6 +752,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
+ if (tunnel) {
+ /* ...not for tunnels though: we don't have a sending socket */
+ goto out;
+ }
if (!inet->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -665,12 +765,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- return;
+ return 0;
}
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, &udp_table);
}
/*
@@ -1713,6 +1813,10 @@ try_again:
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
+
+ if (udp_sk(sk)->gro_enabled)
+ udp_cmsg_recv(msg, sk, skb);
+
if (inet->cmsg_flags)
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
@@ -1889,13 +1993,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
- static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
/* returns:
* -1: error
* 0: success
@@ -1904,7 +2001,7 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -2007,6 +2104,27 @@ drop:
return -1;
}
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udp_queue_rcv_one_skb(sk, skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, true);
+ for (skb = segs; skb; skb = next) {
+ next = skb->next;
+ __skb_pull(skb, skb_transport_offset(skb));
+ ret = udp_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+ }
+ return 0;
+}
+
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -2398,11 +2516,15 @@ void udp_destroy_sock(struct sock *sk)
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (up->encap_enabled)
+ static_branch_disable(&udp_encap_needed_key);
}
}
@@ -2447,7 +2569,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
- udp_encap_enable();
+ lock_sock(sk);
+ udp_tunnel_encap_enable(sk->sk_socket);
+ release_sock(sk);
break;
default:
err = -ENOPROTOOPT;
@@ -2469,6 +2593,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->gso_size = val;
break;
+ case UDP_GRO:
+ lock_sock(sk);
+ if (valbool)
+ udp_tunnel_encap_enable(sk->sk_socket);
+ up->gro_enabled = valbool;
+ release_sock(sk);
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..322672655419 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -7,7 +7,7 @@
#include <net/inet_common.h>
int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
int udp_v4_get_port(struct sock *sk, unsigned short snum);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..0646d61f4fa8 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,6 +343,54 @@ out:
return segs;
}
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *pp = NULL;
+ struct udphdr *uh2;
+ struct sk_buff *p;
+
+ /* requires non zero csum, for symmetry with GSO */
+ if (!uh->check) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ /* pull encapsulating udp header */
+ skb_gro_pull(skb, sizeof(struct udphdr));
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = udp_hdr(p);
+
+ /* Match ports only, as csum is always non zero */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Terminate the flow on len mismatch or if it grow "too much".
+ * Under small packet flood GRO count could elsewhere grow a lot
+ * leading to execessive truesize values
+ */
+ if (!skb_gro_receive(p, skb) &&
+ NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+ pp = p;
+ else if (uh->len != uh2->len)
+ pp = p;
+
+ return pp;
+ }
+
+ /* mismatch, but we never need to flush */
+ return NULL;
+}
+
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup)
{
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
int flush = 1;
struct sock *sk;
+ rcu_read_lock();
+ sk = (*lookup)(skb, uh->source, uh->dest);
+ if (!sk)
+ goto out_unlock;
+
+ if (udp_sk(sk)->gro_enabled) {
+ pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+ rcu_read_unlock();
+ return pp;
+ }
+
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
- !NAPI_GRO_CB(skb)->csum_valid))
- goto out;
+ !NAPI_GRO_CB(skb)->csum_valid) ||
+ !udp_sk(sk)->gro_receive)
+ goto out_unlock;
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
- rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
-
- if (sk && udp_sk(sk)->gro_receive)
- goto unflush;
- goto out_unlock;
-
-unflush:
flush = 0;
list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ unflush:
out_unlock:
rcu_read_unlock();
-out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -427,6 +478,19 @@ flush:
return NULL;
}
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)uh - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+ return 0;
+}
+
int udp_gro_complete(struct sk_buff *skb, int nhoff,
udp_lookup_t lookup)
{
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
- /* Set encapsulation before calling into inner gro_complete() functions
- * to make them set up the inner offsets.
- */
- skb->encapsulation = 1;
-
rcu_read_lock();
sk = (*lookup)(skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_complete)
+ if (sk && udp_sk(sk)->gro_enabled) {
+ err = udp_gro_complete_segment(skb);
+ } else if (sk && udp_sk(sk)->gro_complete) {
+ skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+ : SKB_GSO_UDP_TUNNEL;
+
+ /* Set encapsulation before calling into inner gro_complete()
+ * functions to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
+ }
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..d0c412fc56ad 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -68,6 +68,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..39c7f17d916f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)
return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udplite_table);
+ return __udp4_lib_err(skb, info, &udplite_table);
}
static const struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..35c54865dc42 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(esp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ah4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ipcomp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct net_protocol esp4_protocol = {
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 94999058e110..cca3b3603c42 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -433,7 +433,6 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
const struct in6_addr *addr)
{
- unsigned int hash = inet6_acaddr_hash(net, addr);
struct net_device *nh_dev;
struct ifacaddr6 *aca;
bool found = false;
@@ -441,7 +440,9 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
rcu_read_lock();
if (dev)
found = ipv6_chk_acast_dev(dev, addr);
- else
+ else {
+ unsigned int hash = inet6_acaddr_hash(net, addr);
+
hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
aca_addr_lst) {
nh_dev = fib6_info_nh_dev(aca->aca_rt);
@@ -452,6 +453,7 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
break;
}
}
+ }
rcu_read_unlock();
return found;
}
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..bde08aa549f3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
case IPV6_2292PKTINFO:
{
struct net_device *dev = NULL;
+ int src_idx;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
@@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ src_idx = src_info->ipi6_ifindex;
- if (src_info->ipi6_ifindex) {
+ if (src_idx) {
if (fl6->flowi6_oif &&
- src_info->ipi6_ifindex != fl6->flowi6_oif)
+ src_idx != fl6->flowi6_oif &&
+ (sk->sk_bound_dev_if != fl6->flowi6_oif ||
+ !sk_dev_equal_l3scope(sk, src_idx)))
return -EINVAL;
- fl6->flowi6_oif = src_info->ipi6_ifindex;
+ fl6->flowi6_oif = src_idx;
}
addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
index 6de3c04b0f30..bd675c61deb1 100644
--- a/net/ipv6/fou6.c
+++ b/net/ipv6/fou6.c
@@ -4,6 +4,7 @@
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/udp.h>
+#include <linux/icmpv6.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <net/fou.h>
@@ -69,14 +70,87 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue6_err_proto_handler(int proto, struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, u32 info)
+{
+ const struct inet6_protocol *ipprot;
+
+ ipprot = rcu_dereference(inet6_protos[proto]);
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t optlen;
+ int ret;
+
+ if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt,
+ type, code, offset, info);
+ goto out;
+ case 6:
+ ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt,
+ type, code, offset, info);
+ goto out;
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+ ret = gue6_err_proto_handler(guehdr->proto_ctype, skb,
+ opt, type, code, offset, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
+
static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou6_build_header,
+ .err_handler = gue6_err,
};
static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue6_build_header,
+ .err_handler = gue6_err,
};
static int ip6_tnl_encap_add_fou_ops(void)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c9c53ade55c3..5d7aa2c2770c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net)
return net->ipv6.icmp_sk[smp_processor_id()];
}
-static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
@@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
ping_err(skb, offset, ntohl(info));
+
+ return 0;
}
static int icmpv6_rcv(struct sk_buff *skb);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..5eeeba7181a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
const int dif, const int sdif, bool exact_dif)
{
int score = -1;
+ bool dev_match;
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
@@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score++;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+ dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score++;
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score++;
- }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 515adbdba1d2..81b69bcee714 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
}
-static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct net *net = dev_net(skb->dev);
@@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
offset) < 0)
- return;
+ return -EINVAL;
ipv6h = (const struct ipv6hdr *)skb->data;
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
tpi.key, tpi.proto);
if (!t)
- return;
+ return -ENOENT;
switch (type) {
struct ipv6_tlv_tnl_enc_lim *tel;
@@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
t->parms.name);
if (code != ICMPV6_PORT_UNREACH)
break;
- return;
+ return 0;
case ICMPV6_TIME_EXCEED:
if (code == ICMPV6_EXC_HOPLIMIT) {
net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
t->parms.name);
break;
}
- return;
+ return 0;
case ICMPV6_PARAMPROB:
teli = 0;
if (code == ICMPV6_HDR_FIELD)
@@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
t->parms.name);
}
- return;
+ return 0;
case ICMPV6_PKT_TOOBIG:
ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
- return;
+ return 0;
case NDISC_REDIRECT:
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
- return;
+ return 0;
}
if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
@@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..3c06cc9e9b79 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
/*
* Deliver the packet to the host
*/
-
-
-static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+ bool have_final)
{
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
- int nexthdr;
bool raw;
- bool have_final = false;
/*
* Parse extension headers
*/
- rcu_read_lock();
resubmit:
idev = ip6_dst_idev(skb_dst(skb));
- if (!pskb_pull(skb, skb_transport_offset(skb)))
- goto discard;
nhoff = IP6CB(skb)->nhoff;
- nexthdr = skb_network_header(skb)[nhoff];
+ if (!have_final) {
+ if (!pskb_pull(skb, skb_transport_offset(skb)))
+ goto discard;
+ nexthdr = skb_network_header(skb)[nhoff];
+ }
resubmit_final:
raw = raw6_local_deliver(skb, nexthdr);
@@ -359,6 +357,8 @@ resubmit_final:
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+ int sdif = inet6_sdif(skb);
+ struct net_device *dev;
/* Only do this once for first final protocol */
have_final = true;
@@ -371,9 +371,19 @@ resubmit_final:
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ dev = dev_get_by_index_rcu(net, sdif);
+ if (!dev)
+ goto discard;
+ } else {
+ dev = skb->dev;
+ }
+
if (ipv6_addr_is_multicast(&hdr->daddr) &&
- !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
- &hdr->saddr) &&
+ !ipv6_chk_mcast_addr(dev, &hdr->daddr,
+ &hdr->saddr) &&
!ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
goto discard;
}
@@ -411,13 +421,19 @@ resubmit_final:
consume_skb(skb);
}
}
- rcu_read_unlock();
- return 0;
+ return;
discard:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
- rcu_read_unlock();
kfree_skb(skb);
+}
+
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ rcu_read_lock();
+ ip6_protocol_deliver_rcu(net, skb, 0, false);
+ rcu_read_unlock();
+
return 0;
}
@@ -432,15 +448,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
int ip6_mc_input(struct sk_buff *skb)
{
+ int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+ struct net_device *dev;
bool deliver;
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
__in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
skb->len);
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+ if (!dev) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ } else {
+ dev = skb->dev;
+ }
+
hdr = ipv6_hdr(skb);
- deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+ deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL);
+ if (sdif)
+ rcu_read_unlock();
#ifdef CONFIG_IPV6_MROUTE
/*
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index c7e495f12011..70f525c33cb6 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -229,14 +229,21 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
* XXX skbs on the gro_list have all been parsed and pulled
* already so we don't need to compare nlen
* (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
- * memcmp() alone below is suffcient, right?
+ * memcmp() alone below is sufficient, right?
*/
if ((first_word & htonl(0xF00FFFFF)) ||
- memcmp(&iph->nexthdr, &iph2->nexthdr,
- nlen - offsetof(struct ipv6hdr, nexthdr))) {
+ !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
+ !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
+ *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) {
+not_same_flow:
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
+ if (unlikely(nlen > sizeof(struct ipv6hdr))) {
+ if (memcmp(iph + 1, iph2 + 1,
+ nlen - sizeof(struct ipv6hdr)))
+ goto not_same_flow;
+ }
/* flush if Traffic Class fields are different */
NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
NAPI_GRO_CB(p)->flush |= flush;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 381ce38940ae..973e215c3114 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ sticky_done:
retv = -EFAULT;
break;
}
- if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if)
+ if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5e0efd3954e9..aed7eb5c2123 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
continue;
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif)
+ if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif))
continue;
if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2a7423c39456..b2447b7c7303 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2975,7 +2975,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (!rt)
goto out;
- rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
+ rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
+ extack);
if (IS_ERR(rt->fib6_metrics)) {
err = PTR_ERR(rt->fib6_metrics);
/* Do not leave garbage there. */
@@ -3708,7 +3709,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
if (!f6i)
return ERR_PTR(-ENOMEM);
- f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
+ f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
f6i->dst_nocount = true;
f6i->dst_host = true;
f6i->fib6_protocol = RTPROT_KERNEL;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 03e6b7a2bc53..a3f559162521 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
}
}
-static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
fatal = icmpv6_err_convert(type, code, &err);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq, fatal);
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, fatal);
+ return 0;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index dae25cad05cd..1991dede7367 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -134,24 +134,28 @@ drop:
return 0;
}
-static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_tunnel *handler;
for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
-static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_tunnel *handler;
for_each_tunnel_rcu(tunnel46_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct inet6_protocol tunnel6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d2d97d07ef27..dde51fc7ac16 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -45,6 +45,7 @@
#include <net/raw.h>
#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
#include <net/xfrm.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
@@ -117,6 +118,7 @@ static int compute_score(struct sock *sk, struct net *net,
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -144,15 +146,10 @@ static int compute_score(struct sock *sk, struct net *net,
score++;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score++;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif);
+ if (!dev_match)
+ return -1;
+ score++;
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -329,6 +326,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
+ struct udp_mib *mib;
int is_udp4;
if (flags & MSG_ERRQUEUE)
@@ -352,6 +350,7 @@ try_again:
msg->msg_flags |= MSG_TRUNC;
is_udp4 = (skb->protocol == htons(ETH_P_IP));
+ mib = __UDPX_MIB(sk, is_udp4);
/*
* If checksum is needed at all, try to do it while copying the
@@ -380,24 +379,13 @@ try_again:
if (unlikely(err)) {
if (!peeked) {
atomic_inc(&sk->sk_drops);
- if (is_udp4)
- UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- is_udplite);
- else
- UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- is_udplite);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
return err;
}
- if (!peeked) {
- if (is_udp4)
- UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
- is_udplite);
- else
- UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
- is_udplite);
- }
+ if (!peeked)
+ SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
sock_recv_ts_and_drops(msg, sk, skb);
@@ -421,6 +409,9 @@ try_again:
*addr_len = sizeof(*sin6);
}
+ if (udp_sk(sk)->gro_enabled)
+ udp_cmsg_recv(msg, sk, skb);
+
if (np->rxopt.all)
ip6_datagram_recv_common_ctl(sk, msg, skb);
@@ -443,17 +434,8 @@ try_again:
csum_copy_err:
if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
udp_skb_destructor)) {
- if (is_udp4) {
- UDP_INC_STATS(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP_INC_STATS(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
- } else {
- UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
- }
+ SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
@@ -463,15 +445,106 @@ csum_copy_err:
goto try_again;
}
-void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info,
- struct udp_table *udptable)
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+void udpv6_encap_enable(void)
+{
+ static_branch_enable(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, u32 info);
+
+ if (!ip6tun_encaps[i])
+ continue;
+ handler = rcu_dereference(ip6tun_encaps[i]->err_handler);
+ if (handler && !handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+ const struct ipv6hdr *hdr, int offset,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, __be32 info)
+{
+ int network_offset, transport_offset;
+ struct sock *sk;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv6 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, offset);
+
+ sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+ &hdr->saddr, uh->dest,
+ inet6_iif(skb), 0, udptable, skb);
+ if (sk) {
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ struct udp_sock *up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+ if (!sk) {
+ sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
+ offset, info));
+ }
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
+int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info,
+ struct udp_table *udptable)
{
struct ipv6_pinfo *np;
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct in6_addr *saddr = &hdr->saddr;
const struct in6_addr *daddr = &hdr->daddr;
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
@@ -480,9 +553,23 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), inet6_sdif(skb), udptable, skb);
if (!sk) {
- __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
- return;
+ /* No socket for error: try tunnels before discarding */
+ sk = ERR_PTR(-ENOENT);
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+ udptable, skb,
+ opt, type, code, info);
+ if (!sk)
+ return 0;
+ }
+
+ if (IS_ERR(sk)) {
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
harderr = icmpv6_err_convert(type, code, &err);
@@ -496,10 +583,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
harderr = 1;
}
if (type == NDISC_REDIRECT) {
- ip6_sk_redirect(skb, sk);
+ if (tunnel) {
+ ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+ sk->sk_mark, sk->sk_uid);
+ } else {
+ ip6_sk_redirect(skb, sk);
+ }
goto out;
}
+ /* Tunnels don't have an application socket: don't pass errors back */
+ if (tunnel)
+ goto out;
+
if (!np->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -510,7 +606,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- return;
+ return 0;
}
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -541,21 +637,14 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static __inline__ void udpv6_err(struct sk_buff *skb,
- struct inet6_skb_parm *opt, u8 type,
- u8 code, int offset, __be32 info)
-{
- __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
-}
-
-DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-void udpv6_encap_enable(void)
+static __inline__ int udpv6_err(struct sk_buff *skb,
+ struct inet6_skb_parm *opt, u8 type,
+ u8 code, int offset, __be32 info)
{
- static_branch_enable(&udpv6_encap_needed_key);
+ return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
}
-EXPORT_SYMBOL(udpv6_encap_enable);
-static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -638,10 +727,32 @@ drop:
return -1;
}
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udpv6_queue_rcv_one_skb(sk, skb);
+
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, false);
+ for (skb = segs; skb; skb = next) {
+ next = skb->next;
+ __skb_pull(skb, skb_transport_offset(skb));
+
+ ret = udpv6_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+ true);
+ }
+ return 0;
+}
+
static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, const struct in6_addr *loc_addr,
__be16 rmt_port, const struct in6_addr *rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
struct inet_sock *inet = inet_sk(sk);
@@ -653,7 +764,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+ !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -687,6 +798,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+ int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
@@ -701,7 +813,8 @@ start_lookup:
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif,
+ hnum))
continue;
/* If zero checksum and no_check is not on for
* the socket then skip it.
@@ -1458,11 +1571,15 @@ void udpv6_destroy_sock(struct sock *sk)
udp_v6_flush_pending_frames(sk);
release_sock(sk);
- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (up->encap_enabled)
+ static_branch_disable(&udpv6_encap_needed_key);
}
inet6_destroy_sock(sk);
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 7903e21c178b..5730e6503cb4 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -9,8 +9,8 @@
#include <net/transp_v6.h>
int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
- __be32, struct udp_table *);
+int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+ __be32, struct udp_table *);
int udp_v6_get_port(struct sock *sk, unsigned short snum);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 1b8e161ac527..828b2457f97b 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ if (uh->check)
uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
&ipv6h->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
}
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5000ad6878e6..a125aebc29e5 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb)
return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplitev6_err(struct sk_buff *skb,
+static int udplitev6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
- __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+ return __udp6_lib_err(skb, opt, type, code, offset, info,
+ &udplite_table);
}
static const struct inet6_protocol udplitev6_protocol = {
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index b2dc8ce49378..cc979b702c89 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(esp6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm6_ah_rcv(struct sk_buff *skb)
@@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(ah6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
@@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(ipcomp6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct inet6_protocol esp6_protocol = {
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 0bed4cc20603..78ea5a739d10 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1873,30 +1873,26 @@ static void iucv_callback_txdone(struct iucv_path *path,
struct sock *sk = path->private;
struct sk_buff *this = NULL;
struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q;
- struct sk_buff *list_skb = list->next;
+ struct sk_buff *list_skb;
unsigned long flags;
bh_lock_sock(sk);
- if (!skb_queue_empty(list)) {
- spin_lock_irqsave(&list->lock, flags);
- while (list_skb != (struct sk_buff *)list) {
- if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
- this = list_skb;
- break;
- }
- list_skb = list_skb->next;
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk(list, list_skb) {
+ if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
+ this = list_skb;
+ break;
}
- if (this)
- __skb_unlink(this, list);
-
- spin_unlock_irqrestore(&list->lock, flags);
+ }
+ if (this)
+ __skb_unlink(this, list);
+ spin_unlock_irqrestore(&list->lock, flags);
- if (this) {
- kfree_skb(this);
- /* wake up any process waiting for sending */
- iucv_sock_wake_msglim(sk);
- }
+ if (this) {
+ kfree_skb(this);
+ /* wake up any process waiting for sending */
+ iucv_sock_wake_msglim(sk);
}
if (sk->sk_state == IUCV_CLOSING) {
@@ -2284,11 +2280,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb,
list = &iucv->send_skb_q;
spin_lock_irqsave(&list->lock, flags);
- if (skb_queue_empty(list))
- goto out_unlock;
- list_skb = list->next;
- nskb = list_skb->next;
- while (list_skb != (struct sk_buff *)list) {
+ skb_queue_walk_safe(list, list_skb, nskb) {
if (skb_shinfo(list_skb) == skb_shinfo(skb)) {
switch (n) {
case TX_NOTIFY_OK:
@@ -2321,10 +2313,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb,
}
break;
}
- list_skb = nskb;
- nskb = nskb->next;
}
-out_unlock:
spin_unlock_irqrestore(&list->lock, flags);
if (sk->sk_state == IUCV_CLOSING) {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 43041f087eb3..1ce30efe6854 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1148,8 +1148,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO])
return -EINVAL;
- entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]));
- entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]);
+ __vlan_hwaccel_put_tag(entry->skb,
+ nla_get_be16(tb[NFQA_VLAN_PROTO]),
+ ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])));
}
if (nfqa[NFQA_L2HDR]) {
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 85ae53d8fd09..e47ebbbe71b8 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -301,7 +301,7 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
key->eth.vlan.tpid = vlan->vlan_tpid;
}
return skb_vlan_push(skb, vlan->vlan_tpid,
- ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+ ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK);
}
/* 'src' is already properly masked. */
@@ -822,8 +822,10 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
__skb_dst_copy(skb, data->dst);
*OVS_CB(skb) = data->cb;
skb->inner_protocol = data->inner_protocol;
- skb->vlan_tci = data->vlan_tci;
- skb->vlan_proto = data->vlan_proto;
+ if (data->vlan_tci & VLAN_CFI_MASK)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci & ~VLAN_CFI_MASK);
+ else
+ __vlan_hwaccel_clear_tag(skb);
/* Reconstruct the MAC header. */
skb_push(skb, data->l2_len);
@@ -867,7 +869,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
data->cb = *OVS_CB(skb);
data->inner_protocol = skb->inner_protocol;
data->network_offset = orig_network_offset;
- data->vlan_tci = skb->vlan_tci;
+ if (skb_vlan_tag_present(skb))
+ data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK;
+ else
+ data->vlan_tci = 0;
data->vlan_proto = skb->vlan_proto;
data->mac_proto = mac_proto;
data->l2_len = hlen;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 35966da84769..57e07768c9d1 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -325,7 +325,7 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
return -ENOMEM;
vh = (struct vlan_head *)skb->data;
- key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
+ key_vh->tci = vh->tci | htons(VLAN_CFI_MASK);
key_vh->tpid = vh->tpid;
if (unlikely(untag_vlan)) {
@@ -358,7 +358,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
int res;
if (skb_vlan_tag_present(skb)) {
- key->eth.vlan.tci = htons(skb->vlan_tci);
+ key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK);
key->eth.vlan.tpid = skb->vlan_proto;
} else {
/* Parse outer vlan tag in the non-accelerated case. */
@@ -597,7 +597,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
* skb_vlan_pop(), which will later shift the ethertype into
* skb->protocol.
*/
- if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
+ if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
skb->protocol = key->eth.cvlan.tpid;
else
skb->protocol = key->eth.type;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index c670dd24b8b7..ba01fc4270bd 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -60,7 +60,7 @@ struct ovs_tunnel_info {
struct vlan_head {
__be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/
- __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+ __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */
};
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 865ecef68196..435a4bdf8f89 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -990,9 +990,9 @@ static int validate_vlan_from_nlattrs(const struct sw_flow_match *match,
if (a[OVS_KEY_ATTR_VLAN])
tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (!(tci & htons(VLAN_CFI_MASK))) {
if (tci) {
- OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
+ OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.",
(inner) ? "C-VLAN" : "VLAN");
return -EINVAL;
} else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
@@ -1013,9 +1013,9 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
__be16 tci = 0;
__be16 tpid = 0;
bool encap_valid = !!(match->key->eth.vlan.tci &
- htons(VLAN_TAG_PRESENT));
+ htons(VLAN_CFI_MASK));
bool i_encap_valid = !!(match->key->eth.cvlan.tci &
- htons(VLAN_TAG_PRESENT));
+ htons(VLAN_CFI_MASK));
if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) {
/* Not a VLAN. */
@@ -1039,8 +1039,8 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
(inner) ? "C-VLAN" : "VLAN", ntohs(tpid));
return -EINVAL;
}
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.",
+ if (!(tci & htons(VLAN_CFI_MASK))) {
+ OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.",
(inner) ? "C-VLAN" : "VLAN");
return -EINVAL;
}
@@ -1095,7 +1095,7 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
if (err)
return err;
- encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT));
+ encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK));
if (encap_valid) {
err = __parse_vlan_from_nlattrs(match, key_attrs, true, a,
is_mask, log);
@@ -2943,7 +2943,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
vlan = nla_data(a);
if (!eth_type_vlan(vlan->vlan_tpid))
return -EINVAL;
- if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK)))
return -EINVAL;
vlan_tci = vlan->vlan_tci;
break;
@@ -2959,7 +2959,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
/* Prohibit push MPLS other than to a white list
* for packets that have a known tag order.
*/
- if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ if (vlan_tci & htons(VLAN_CFI_MASK) ||
(eth_type != htons(ETH_P_IP) &&
eth_type != htons(ETH_P_IPV6) &&
eth_type != htons(ETH_P_ARP) &&
@@ -2971,7 +2971,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
}
case OVS_ACTION_ATTR_POP_MPLS:
- if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ if (vlan_tci & htons(VLAN_CFI_MASK) ||
!eth_p_mpls(eth_type))
return -EINVAL;
@@ -3036,7 +3036,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_POP_ETH:
if (mac_proto != MAC_PROTO_ETHERNET)
return -EINVAL;
- if (vlan_tci & htons(VLAN_TAG_PRESENT))
+ if (vlan_tci & htons(VLAN_CFI_MASK))
return -EINVAL;
mac_proto = MAC_PROTO_NONE;
break;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 2e5e7a41d8ef..9bec22e3e9e8 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -84,7 +84,6 @@ static struct net_device *get_dpdev(const struct datapath *dp)
struct vport *local;
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
- BUG_ON(!local);
return local->dev;
}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ba677d54a7af..93fdaf707313 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
/* extract existing tag (and guarantee no hw-accel tag) */
if (skb_vlan_tag_present(skb)) {
tci = skb_vlan_tag_get(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
} else {
/* in-payload vlan tag, pop it */
err = __skb_vlan_pop(skb, &tci);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f427a1e00e7e..d92f44ac4c39 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -25,6 +25,7 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/rhashtable.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -365,6 +366,245 @@ static void tcf_chain_flush(struct tcf_chain *chain)
}
}
+static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
+{
+ const struct Qdisc_class_ops *cops;
+ struct Qdisc *qdisc;
+
+ if (!dev_ingress_queue(dev))
+ return NULL;
+
+ qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
+ if (!qdisc)
+ return NULL;
+
+ cops = qdisc->ops->cl_ops;
+ if (!cops)
+ return NULL;
+
+ if (!cops->tcf_block)
+ return NULL;
+
+ return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
+}
+
+static struct rhashtable indr_setup_block_ht;
+
+struct tc_indr_block_dev {
+ struct rhash_head ht_node;
+ struct net_device *dev;
+ unsigned int refcnt;
+ struct list_head cb_list;
+ struct tcf_block *block;
+};
+
+struct tc_indr_block_cb {
+ struct list_head list;
+ void *cb_priv;
+ tc_indr_block_bind_cb_t *cb;
+ void *cb_ident;
+};
+
+static const struct rhashtable_params tc_indr_setup_block_ht_params = {
+ .key_offset = offsetof(struct tc_indr_block_dev, dev),
+ .head_offset = offsetof(struct tc_indr_block_dev, ht_node),
+ .key_len = sizeof(struct net_device *),
+};
+
+static struct tc_indr_block_dev *
+tc_indr_block_dev_lookup(struct net_device *dev)
+{
+ return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
+ tc_indr_setup_block_ht_params);
+}
+
+static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
+{
+ struct tc_indr_block_dev *indr_dev;
+
+ indr_dev = tc_indr_block_dev_lookup(dev);
+ if (indr_dev)
+ goto inc_ref;
+
+ indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+ if (!indr_dev)
+ return NULL;
+
+ INIT_LIST_HEAD(&indr_dev->cb_list);
+ indr_dev->dev = dev;
+ indr_dev->block = tc_dev_ingress_block(dev);
+ if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+ tc_indr_setup_block_ht_params)) {
+ kfree(indr_dev);
+ return NULL;
+ }
+
+inc_ref:
+ indr_dev->refcnt++;
+ return indr_dev;
+}
+
+static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
+{
+ if (--indr_dev->refcnt)
+ return;
+
+ rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+ tc_indr_setup_block_ht_params);
+ kfree(indr_dev);
+}
+
+static struct tc_indr_block_cb *
+tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+
+ list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+ if (indr_block_cb->cb == cb &&
+ indr_block_cb->cb_ident == cb_ident)
+ return indr_block_cb;
+ return NULL;
+}
+
+static struct tc_indr_block_cb *
+tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+
+ indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+ if (indr_block_cb)
+ return ERR_PTR(-EEXIST);
+
+ indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
+ if (!indr_block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ indr_block_cb->cb_priv = cb_priv;
+ indr_block_cb->cb = cb;
+ indr_block_cb->cb_ident = cb_ident;
+ list_add(&indr_block_cb->list, &indr_dev->cb_list);
+
+ return indr_block_cb;
+}
+
+static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
+{
+ list_del(&indr_block_cb->list);
+ kfree(indr_block_cb);
+}
+
+static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
+ struct tc_indr_block_cb *indr_block_cb,
+ enum tc_block_command command)
+{
+ struct tc_block_offload bo = {
+ .command = command,
+ .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+ .block = indr_dev->block,
+ };
+
+ if (!indr_dev->block)
+ return;
+
+ indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+ &bo);
+}
+
+int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+ struct tc_indr_block_dev *indr_dev;
+ int err;
+
+ indr_dev = tc_indr_block_dev_get(dev);
+ if (!indr_dev)
+ return -ENOMEM;
+
+ indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
+ err = PTR_ERR_OR_ZERO(indr_block_cb);
+ if (err)
+ goto err_dev_put;
+
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
+ return 0;
+
+err_dev_put:
+ tc_indr_block_dev_put(indr_dev);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
+
+int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ int err;
+
+ rtnl_lock();
+ err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+ rtnl_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
+
+void __tc_indr_block_cb_unregister(struct net_device *dev,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+ struct tc_indr_block_dev *indr_dev;
+
+ indr_dev = tc_indr_block_dev_lookup(dev);
+ if (!indr_dev)
+ return;
+
+ indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+ if (!indr_block_cb)
+ return;
+
+ /* Send unbind message if required to free any block cbs. */
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
+ tc_indr_block_cb_del(indr_block_cb);
+ tc_indr_block_dev_put(indr_dev);
+}
+EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
+
+void tc_indr_block_cb_unregister(struct net_device *dev,
+ tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ rtnl_lock();
+ __tc_indr_block_cb_unregister(dev, cb, cb_ident);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
+
+static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
+ struct tcf_block_ext_info *ei,
+ enum tc_block_command command,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_indr_block_cb *indr_block_cb;
+ struct tc_indr_block_dev *indr_dev;
+ struct tc_block_offload bo = {
+ .command = command,
+ .binder_type = ei->binder_type,
+ .block = block,
+ .extack = extack,
+ };
+
+ indr_dev = tc_indr_block_dev_lookup(dev);
+ if (!indr_dev)
+ return;
+
+ indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+
+ list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+ indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+ &bo);
+}
+
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
return block->offloadcnt;
@@ -406,12 +646,17 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
- return err;
+ if (err)
+ return err;
+
+ tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ return 0;
no_offload_dev_inc:
if (tcf_block_offload_in_use(block))
return -EOPNOTSUPP;
block->nooffloaddevcnt++;
+ tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
return 0;
}
@@ -421,6 +666,8 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
+ tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
@@ -2355,6 +2602,11 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
+ err = rhashtable_init(&indr_setup_block_ht,
+ &tc_indr_setup_block_ht_params);
+ if (err)
+ goto err_rhash_setup_block_ht;
+
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
@@ -2366,6 +2618,8 @@ static int __init tc_filter_init(void)
return 0;
+err_rhash_setup_block_ht:
+ unregister_pernet_subsys(&tcf_net_ops);
err_register_pernet_subsys:
destroy_workqueue(tc_filter_wq);
return err;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ca3b0f46de53..f55bc50cd0a9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -810,6 +810,56 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
+int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
+ void *type_data)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ int err;
+
+ sch->flags &= ~TCQ_F_OFFLOADED;
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return 0;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+ if (err == -EOPNOTSUPP)
+ return 0;
+
+ if (!err)
+ sch->flags |= TCQ_F_OFFLOADED;
+
+ return err;
+}
+EXPORT_SYMBOL(qdisc_offload_dump_helper);
+
+void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+ struct Qdisc *new, struct Qdisc *old,
+ enum tc_setup_type type, void *type_data,
+ struct netlink_ext_ack *extack)
+{
+ bool any_qdisc_is_offloaded;
+ int err;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+
+ /* Don't report error if the graft is part of destroy operation. */
+ if (!err || !new || new == &noop_qdisc)
+ return;
+
+ /* Don't report error if the parent, the old child and the new
+ * one are not offloaded.
+ */
+ any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
+
+ if (any_qdisc_is_offloaded)
+ NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+}
+EXPORT_SYMBOL(qdisc_offload_graft_helper);
+
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
u32 portid, u32 seq, u16 flags, int event)
{
@@ -957,7 +1007,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
{
struct Qdisc *q = old;
struct net *net = dev_net(dev);
- int err = 0;
if (parent == NULL) {
unsigned int i, num_q, ingress;
@@ -1012,28 +1061,29 @@ skip:
dev_activate(dev);
} else {
const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+ unsigned long cl;
+ int err;
/* Only support running class lockless if parent is lockless */
if (new && (new->flags & TCQ_F_NOLOCK) &&
parent && !(parent->flags & TCQ_F_NOLOCK))
new->flags &= ~TCQ_F_NOLOCK;
- err = -EOPNOTSUPP;
- if (cops && cops->graft) {
- unsigned long cl = cops->find(parent, classid);
+ if (!cops || !cops->graft)
+ return -EOPNOTSUPP;
- if (cl) {
- err = cops->graft(parent, cl, new, &old,
- extack);
- } else {
- NL_SET_ERR_MSG(extack, "Specified class not found");
- err = -ENOENT;
- }
+ cl = cops->find(parent, classid);
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ return -ENOENT;
}
- if (!err)
- notify_and_destroy(net, skb, n, classid, old, new);
+
+ err = cops->graft(parent, cl, new, &old, extack);
+ if (err)
+ return err;
+ notify_and_destroy(net, skb, n, classid, old, new);
}
- return err;
+ return 0;
}
static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4b1af706896c..3671eab91107 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -94,6 +94,7 @@ struct fq_sched_data {
u32 flow_refill_delay;
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
+ u64 ce_threshold;
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
@@ -107,6 +108,7 @@ struct fq_sched_data {
u64 stat_gc_flows;
u64 stat_internal_packets;
u64 stat_throttled;
+ u64 stat_ce_mark;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
@@ -454,6 +456,11 @@ begin:
fq_flow_set_throttled(q, f);
goto begin;
}
+ if (time_next_packet &&
+ (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ INET_ECN_set_ce(skb);
+ q->stat_ce_mark++;
+ }
}
skb = fq_dequeue_head(sch, f);
@@ -650,6 +657,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -729,6 +737,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_ORPHAN_MASK])
q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+ if (tb[TCA_FQ_CE_THRESHOLD])
+ q->ce_threshold = (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
if (!err) {
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
@@ -779,6 +791,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
+
+ /* Default ce_threshold of 4294 seconds */
+ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+
qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
@@ -792,6 +808,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
+ u64 ce_threshold = q->ce_threshold;
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
@@ -800,6 +817,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+ do_div(ce_threshold, NSEC_PER_USEC);
+
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
@@ -812,6 +831,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
+ nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -841,6 +861,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.throttled_flows = q->throttled_flows;
st.unthrottle_latency_ns = min_t(unsigned long,
q->unthrottle_latency_ns, ~0U);
+ st.ce_mark = q->stat_ce_mark;
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f20f3a0f8424..1db5c1bf6ddd 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -38,9 +38,8 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
}
-static void mq_offload_stats(struct Qdisc *sch)
+static int mq_offload_stats(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_mq_qopt_offload opt = {
.command = TC_MQ_STATS,
.handle = sch->handle,
@@ -50,8 +49,7 @@ static void mq_offload_stats(struct Qdisc *sch)
},
};
- if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt);
}
static void mq_destroy(struct Qdisc *sch)
@@ -171,9 +169,8 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
spin_unlock_bh(qdisc_lock(qdisc));
}
- mq_offload_stats(sch);
- return 0;
+ return mq_offload_stats(sch);
}
static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index f8af98621179..cdf68706e40f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -220,7 +220,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(child, child->q.qlen,
child->qstats.backlog);
- qdisc_put(child);
}
for (i = oldbands; i < q->bands; i++) {
@@ -230,6 +229,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
}
sch_tree_unlock(sch);
+
+ for (i = q->bands; i < oldbands; i++)
+ qdisc_put(q->queues[i]);
return 0;
}
@@ -251,7 +253,6 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
static int prio_dump_offload(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_prio_qopt_offload hw_stats = {
.command = TC_PRIO_STATS,
.handle = sch->handle,
@@ -263,21 +264,8 @@ static int prio_dump_offload(struct Qdisc *sch)
},
},
};
- int err;
-
- sch->flags &= ~TCQ_F_OFFLOADED;
- if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
- return 0;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
- &hw_stats);
- if (err == -EOPNOTSUPP)
- return 0;
- if (!err)
- sch->flags |= TCQ_F_OFFLOADED;
-
- return err;
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -309,43 +297,22 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
{
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt_offload graft_offload;
- struct net_device *dev = qdisc_dev(sch);
unsigned long band = arg - 1;
- bool any_qdisc_is_offloaded;
- int err;
if (new == NULL)
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->queues[band]);
- if (!tc_can_offload(dev))
- return 0;
-
graft_offload.handle = sch->handle;
graft_offload.parent = sch->parent;
graft_offload.graft_params.band = band;
graft_offload.graft_params.child_handle = new->handle;
graft_offload.command = TC_PRIO_GRAFT;
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
- &graft_offload);
-
- /* Don't report error if the graft is part of destroy operation. */
- if (err && new != &noop_qdisc) {
- /* Don't report error if the parent, the old child and the new
- * one are not offloaded.
- */
- any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
- any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
- if (*old)
- any_qdisc_is_offloaded |= (*old)->flags &
- TCQ_F_OFFLOADED;
-
- if (any_qdisc_is_offloaded)
- NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
- }
-
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+ TC_SETUP_QDISC_PRIO, &graft_offload,
+ extack);
return 0;
}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3ce6c0a2c493..a1d08bdd9357 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -167,6 +167,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
opt.set.max = q->parms.qth_max >> q->parms.Wlog;
opt.set.probability = q->parms.max_P;
opt.set.is_ecn = red_use_ecn(q);
+ opt.set.is_harddrop = red_use_harddrop(q);
opt.set.qstats = &sch->qstats;
} else {
opt.command = TC_RED_DESTROY;
@@ -193,10 +194,10 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
static int red_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct Qdisc *old_child = NULL, *child = NULL;
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_RED_MAX + 1];
struct tc_red_qopt *ctl;
- struct Qdisc *child = NULL;
int err;
u32 max_P;
@@ -233,7 +234,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_put(q->qdisc);
+ old_child = q->qdisc;
q->qdisc = child;
}
@@ -252,7 +253,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
+
red_offload(sch, true);
+
+ if (old_child)
+ qdisc_put(old_child);
return 0;
}
@@ -279,9 +284,8 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
return red_change(sch, opt, extack);
}
-static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_red_qopt_offload hw_stats = {
.command = TC_RED_STATS,
.handle = sch->handle,
@@ -291,22 +295,8 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
.stats.qstats = &sch->qstats,
},
};
- int err;
-
- sch->flags &= ~TCQ_F_OFFLOADED;
-
- if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
- return 0;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
- &hw_stats);
- if (err == -EOPNOTSUPP)
- return 0;
-
- if (!err)
- sch->flags |= TCQ_F_OFFLOADED;
- return err;
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats);
}
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -324,7 +314,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
};
int err;
- err = red_dump_offload_stats(sch, &opt);
+ err = red_dump_offload_stats(sch);
if (err)
goto nla_put_failure;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99882ed..7ab08a5b36dc 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -574,7 +574,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
* is probably better.
*
*/
-void sctp_v4_err(struct sk_buff *skb, __u32 info)
+int sctp_v4_err(struct sk_buff *skb, __u32 info)
{
const struct iphdr *iph = (const struct iphdr *)skb->data;
const int ihlen = iph->ihl * 4;
@@ -599,7 +599,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
skb->transport_header = savesctp;
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
/* Warning: The sock lock is held. Remember to call
* sctp_err_finish!
@@ -653,6 +653,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
out_unlock:
sctp_err_finish(sk, transport);
+ return 0;
}
/*
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fc6c5e4bffa5..6e27c62646e9 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -138,7 +138,7 @@ static struct notifier_block sctp_inet6addr_notifier = {
};
/* ICMP error handler. */
-static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct inet6_dev *idev;
@@ -147,7 +147,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct sctp_transport *transport;
struct ipv6_pinfo *np;
__u16 saveip, savesctp;
- int err;
+ int err, ret = 0;
struct net *net = dev_net(skb->dev);
idev = in6_dev_get(skb->dev);
@@ -163,6 +163,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
skb->transport_header = savesctp;
if (!sk) {
__ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS);
+ ret = -ENOENT;
goto out;
}
@@ -202,6 +203,8 @@ out_unlock:
out:
if (likely(idev != NULL))
in6_dev_put(idev);
+
+ return ret;
}
static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 0a78cdf86463..2b499a85db0e 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -140,7 +140,7 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
struct sctp_ulpevent *cevent;
- struct sk_buff *pos;
+ struct sk_buff *pos, *loc;
pos = skb_peek_tail(&ulpq->reasm);
if (!pos) {
@@ -166,23 +166,30 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
return;
}
+ loc = NULL;
skb_queue_walk(&ulpq->reasm, pos) {
cevent = sctp_skb2event(pos);
if (event->stream < cevent->stream ||
(event->stream == cevent->stream &&
- MID_lt(event->mid, cevent->mid)))
+ MID_lt(event->mid, cevent->mid))) {
+ loc = pos;
break;
-
+ }
if (event->stream == cevent->stream &&
event->mid == cevent->mid &&
!(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
(event->msg_flags & SCTP_DATA_FIRST_FRAG ||
- event->fsn < cevent->fsn))
+ event->fsn < cevent->fsn)) {
+ loc = pos;
break;
+ }
}
- __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
+ if (!loc)
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ else
+ __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event));
}
static struct sctp_ulpevent *sctp_intl_retrieve_partial(
@@ -383,7 +390,7 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
struct sctp_ulpevent *cevent;
- struct sk_buff *pos;
+ struct sk_buff *pos, *loc;
pos = skb_peek_tail(&ulpq->lobby);
if (!pos) {
@@ -403,18 +410,25 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
return;
}
+ loc = NULL;
skb_queue_walk(&ulpq->lobby, pos) {
cevent = (struct sctp_ulpevent *)pos->cb;
- if (cevent->stream > event->stream)
+ if (cevent->stream > event->stream) {
+ loc = pos;
break;
-
+ }
if (cevent->stream == event->stream &&
- MID_lt(event->mid, cevent->mid))
+ MID_lt(event->mid, cevent->mid)) {
+ loc = pos;
break;
+ }
}
- __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
+ if (!loc)
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ else
+ __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event));
}
static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq,
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 836727e363c4..9e265eb89726 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -105,7 +105,7 @@ struct tipc_stats {
* @transmitq: queue for sent, non-acked messages
* @backlogq: queue for messages waiting to be sent
* @snt_nxt: next sequence number to use for outbound messages
- * @last_retransmitted: sequence number of most recently retransmitted message
+ * @prev_from: sequence number of most previous retransmission request
* @stale_cnt: counter for number of identical retransmit attempts
* @stale_limit: time when repeated identical retransmits must force link reset
* @ackers: # of peers that needs to ack each packet before it can be released
@@ -163,7 +163,7 @@ struct tipc_link {
u16 limit;
} backlog[5];
u16 snd_nxt;
- u16 last_retransm;
+ u16 prev_from;
u16 window;
u16 stale_cnt;
unsigned long stale_limit;
@@ -186,9 +186,6 @@ struct tipc_link {
u16 acked;
struct tipc_link *bc_rcvlink;
struct tipc_link *bc_sndlink;
- unsigned long prev_retr;
- u16 prev_from;
- u16 prev_to;
u8 nack_state;
bool bc_peer_is_up;
@@ -210,7 +207,7 @@ enum {
BC_NACK_SND_SUPPRESS,
};
-#define TIPC_BC_RETR_LIMIT 10 /* [ms] */
+#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */
/*
* Interval between NACKs when packets arrive out of order
@@ -1036,10 +1033,12 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
if (!skb)
return 0;
+ if (less(to, from))
+ return 0;
/* Detect repeated retransmit failures on same packet */
- if (r->last_retransm != buf_seqno(skb)) {
- r->last_retransm = buf_seqno(skb);
+ if (r->prev_from != from) {
+ r->prev_from = from;
r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
r->stale_cnt = 0;
} else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
@@ -1055,6 +1054,11 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
continue;
if (more(msg_seqno(hdr), to))
break;
+ if (link_is_bc_sndlink(l)) {
+ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
+ continue;
+ TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM;
+ }
_skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
if (!_skb)
return 0;
@@ -1737,42 +1741,6 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr)
l->rcv_nxt = peers_snd_nxt;
}
-/* link_bc_retr eval()- check if the indicated range can be retransmitted now
- * - Adjust permitted range if there is overlap with previous retransmission
- */
-static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to)
-{
- unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
-
- if (less(*to, *from))
- return false;
-
- /* New retransmission request */
- if ((elapsed > TIPC_BC_RETR_LIMIT) ||
- less(*to, l->prev_from) || more(*from, l->prev_to)) {
- l->prev_from = *from;
- l->prev_to = *to;
- l->prev_retr = jiffies;
- return true;
- }
-
- /* Inside range of previous retransmit */
- if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
- return false;
-
- /* Fully or partially outside previous range => exclude overlap */
- if (less(*from, l->prev_from)) {
- *to = l->prev_from - 1;
- l->prev_from = *from;
- }
- if (more(*to, l->prev_to)) {
- *from = l->prev_to + 1;
- l->prev_to = *to;
- }
- l->prev_retr = jiffies;
- return true;
-}
-
/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
*/
int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
@@ -1803,8 +1771,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
- if (link_bc_retr_eval(snd_l, &from, &to))
- rc = tipc_link_retrans(snd_l, l, from, to, xmitq);
+ rc = tipc_link_retrans(snd_l, l, from, to, xmitq);
l->snd_nxt = peers_snd_nxt;
if (link_bc_rcv_gap(l))
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index a2879e6ec5b6..a0924956bb61 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -105,6 +105,7 @@ struct tipc_skb_cb {
u32 bytes_read;
u32 orig_member;
struct sk_buff *tail;
+ unsigned long nxt_retr;
bool validated;
u16 chain_imp;
u16 ackers;