From e1608f3fa857b600045b6df7f7dadc70eeaa4496 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Nov 2019 23:29:11 +0100 Subject: bpf: Avoid setting bpf insns pages read-only when prog is jited For the case where the interpreter is compiled out or when the prog is jited it is completely unnecessary to set the BPF insn pages as read-only. In fact, on frequent churn of BPF programs, it could lead to performance degradation of the system over time since it would break the direct map down to 4k pages when calling set_memory_ro() for the insn buffer on x86-64 / arm64 and there is no reverse operation. Thus, avoid breaking up large pages for data maps, and only limit this to the module range used by the JIT where it is necessary to set the image read-only and executable. Suggested-by: Peter Zijlstra Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191129222911.3710-1-daniel@iogearbox.net --- include/linux/filter.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1b1e8b8f88da..a141cb07e76a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -776,8 +776,12 @@ bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { - set_vm_flush_reset_perms(fp); - set_memory_ro((unsigned long)fp, fp->pages); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + set_vm_flush_reset_perms(fp); + set_memory_ro((unsigned long)fp, fp->pages); + } +#endif } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) -- cgit v1.2.3-59-g8ed1b From 040b5cfbcefa263ccf2c118c4938308606bb7ed8 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Mon, 2 Dec 2019 10:49:51 +0530 Subject: Fixed updating of ethertype in function skb_mpls_pop The skb_mpls_pop was not updating ethertype of an ethernet packet if the packet was originally received from a non ARPHRD_ETHER device. In the below OVS data path flow, since the device corresponding to port 7 is an l3 device (ARPHRD_NONE) the skb_mpls_pop function does not update the ethertype of the packet even though the previous push_eth action had added an ethernet header to the packet. recirc_id(0),in_port(7),eth_type(0x8847), mpls(label=12/0xfffff,tc=0/0,ttl=0/0x0,bos=1/1), actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00), pop_mpls(eth_type=0x800),4 Fixes: ed246cee09b9 ("net: core: move pop MPLS functionality from OvS to core helper") Signed-off-by: Martin Varghese Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- net/core/skbuff.c | 6 ++++-- net/openvswitch/actions.c | 3 ++- net/sched/act_mpls.c | 4 +++- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7af5bec7d3b0..5aea72fe8498 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3530,7 +3530,8 @@ int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len); -int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len); +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, + bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 867e61df00db..312e80e86898 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5529,12 +5529,14 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); * @skb: buffer * @next_proto: ethertype of header after popped MPLS header * @mac_len: length of the MAC header + * @ethernet: flag to indicate if ethernet header is present in packet * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len) +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, + bool ethernet) { int err; @@ -5553,7 +5555,7 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len) skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); - if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + if (ethernet) { struct ethhdr *hdr; /* use mpls_hdr() to get ethertype to account for VLANs. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 12936c151cc0..91e210061bb3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -179,7 +179,8 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_mpls_pop(skb, ethertype, skb->mac_len); + err = skb_mpls_pop(skb, ethertype, skb->mac_len, + ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); if (err) return err; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 325eddcc6621..a7d856203af1 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. */ +#include #include #include #include @@ -76,7 +77,8 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, switch (p->tcfm_action) { case TCA_MPLS_ACT_POP: - if (skb_mpls_pop(skb, p->tcfm_proto, mac_len)) + if (skb_mpls_pop(skb, p->tcfm_proto, mac_len, + skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_PUSH: -- cgit v1.2.3-59-g8ed1b From 8ffb055beae58574d3e77b4bf9d4d15eace1ca27 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Tue, 3 Dec 2019 19:40:12 +0900 Subject: cls_flower: Fix the behavior using port ranges with hw-offload The recent commit 5c72299fba9d ("net: sched: cls_flower: Classify packets using port ranges") had added filtering based on port ranges to tc flower. However the commit missed necessary changes in hw-offload code, so the feature gave rise to generating incorrect offloaded flow keys in NIC. One more detailed example is below: $ tc qdisc add dev eth0 ingress $ tc filter add dev eth0 ingress protocol ip flower ip_proto tcp \ dst_port 100-200 action drop With the setup above, an exact match filter with dst_port == 0 will be installed in NIC by hw-offload. IOW, the NIC will have a rule which is equivalent to the following one. $ tc qdisc add dev eth0 ingress $ tc filter add dev eth0 ingress protocol ip flower ip_proto tcp \ dst_port 0 action drop The behavior was caused by the flow dissector which extracts packet data into the flow key in the tc flower. More specifically, regardless of exact match or specified port ranges, fl_init_dissector() set the FLOW_DISSECTOR_KEY_PORTS flag in struct flow_dissector to extract port numbers from skb in skb_flow_dissect() called by fl_classify(). Note that device drivers received the same struct flow_dissector object as used in skb_flow_dissect(). Thus, offloaded drivers could not identify which of these is used because the FLOW_DISSECTOR_KEY_PORTS flag was set to struct flow_dissector in either case. This patch adds the new FLOW_DISSECTOR_KEY_PORTS_RANGE flag and the new tp_range field in struct fl_flow_key to recognize which filters are applied to offloaded drivers. At this point, when filters based on port ranges passed to drivers, drivers return the EOPNOTSUPP error because they do not support the feature (the newly created FLOW_DISSECTOR_KEY_PORTS_RANGE flag). Fixes: 5c72299fba9d ("net: sched: cls_flower: Classify packets using port ranges") Signed-off-by: Yoshiki Komachi Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 37 ++++++++++---- net/sched/cls_flower.c | 118 ++++++++++++++++++++++++------------------- 3 files changed, 95 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index b8c20e9f343e..d93017a7ce5c 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -235,6 +235,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ + FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ca871657a4c4..69395b804709 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -759,6 +759,31 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } +static void +__skb_flow_dissect_ports(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, + u8 ip_proto, int hlen) +{ + enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; + struct flow_dissector_key_ports *key_ports; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + dissector_ports = FLOW_DISSECTOR_KEY_PORTS; + else if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) + dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + + if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + return; + + key_ports = skb_flow_dissector_target(flow_dissector, + dissector_ports, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); +} + static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -928,7 +953,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; - struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -1383,14 +1407,9 @@ ip_proto_again: break; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) && - !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) { - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); - } + if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) + __skb_flow_dissect_ports(skb, flow_dissector, target_container, + data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c307ee1d6ca6..6c68971d99df 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -56,8 +56,13 @@ struct fl_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; struct flow_dissector_key_enc_opts enc_opts; - struct flow_dissector_key_ports tp_min; - struct flow_dissector_key_ports tp_max; + union { + struct flow_dissector_key_ports tp; + struct { + struct flow_dissector_key_ports tp_min; + struct flow_dissector_key_ports tp_max; + }; + } tp_range; struct flow_dissector_key_ct ct; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ @@ -200,19 +205,19 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter, { __be16 min_mask, max_mask, min_val, max_val; - min_mask = htons(filter->mask->key.tp_min.dst); - max_mask = htons(filter->mask->key.tp_max.dst); - min_val = htons(filter->key.tp_min.dst); - max_val = htons(filter->key.tp_max.dst); + min_mask = htons(filter->mask->key.tp_range.tp_min.dst); + max_mask = htons(filter->mask->key.tp_range.tp_max.dst); + min_val = htons(filter->key.tp_range.tp_min.dst); + max_val = htons(filter->key.tp_range.tp_max.dst); if (min_mask && max_mask) { - if (htons(key->tp.dst) < min_val || - htons(key->tp.dst) > max_val) + if (htons(key->tp_range.tp.dst) < min_val || + htons(key->tp_range.tp.dst) > max_val) return false; /* skb does not have min and max values */ - mkey->tp_min.dst = filter->mkey.tp_min.dst; - mkey->tp_max.dst = filter->mkey.tp_max.dst; + mkey->tp_range.tp_min.dst = filter->mkey.tp_range.tp_min.dst; + mkey->tp_range.tp_max.dst = filter->mkey.tp_range.tp_max.dst; } return true; } @@ -223,19 +228,19 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter, { __be16 min_mask, max_mask, min_val, max_val; - min_mask = htons(filter->mask->key.tp_min.src); - max_mask = htons(filter->mask->key.tp_max.src); - min_val = htons(filter->key.tp_min.src); - max_val = htons(filter->key.tp_max.src); + min_mask = htons(filter->mask->key.tp_range.tp_min.src); + max_mask = htons(filter->mask->key.tp_range.tp_max.src); + min_val = htons(filter->key.tp_range.tp_min.src); + max_val = htons(filter->key.tp_range.tp_max.src); if (min_mask && max_mask) { - if (htons(key->tp.src) < min_val || - htons(key->tp.src) > max_val) + if (htons(key->tp_range.tp.src) < min_val || + htons(key->tp_range.tp.src) > max_val) return false; /* skb does not have min and max values */ - mkey->tp_min.src = filter->mkey.tp_min.src; - mkey->tp_max.src = filter->mkey.tp_max.src; + mkey->tp_range.tp_min.src = filter->mkey.tp_range.tp_min.src; + mkey->tp_range.tp_max.src = filter->mkey.tp_range.tp_max.src; } return true; } @@ -734,23 +739,25 @@ static void fl_set_key_val(struct nlattr **tb, static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { - fl_set_key_val(tb, &key->tp_min.dst, - TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst, - TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst)); - fl_set_key_val(tb, &key->tp_max.dst, - TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst, - TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst)); - fl_set_key_val(tb, &key->tp_min.src, - TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src, - TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src)); - fl_set_key_val(tb, &key->tp_max.src, - TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src, - TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src)); - - if ((mask->tp_min.dst && mask->tp_max.dst && - htons(key->tp_max.dst) <= htons(key->tp_min.dst)) || - (mask->tp_min.src && mask->tp_max.src && - htons(key->tp_max.src) <= htons(key->tp_min.src))) + fl_set_key_val(tb, &key->tp_range.tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.dst)); + fl_set_key_val(tb, &key->tp_range.tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_range.tp_max.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.dst)); + fl_set_key_val(tb, &key->tp_range.tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_range.tp_min.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.src)); + fl_set_key_val(tb, &key->tp_range.tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); + + if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && + htons(key->tp_range.tp_max.dst) <= + htons(key->tp_range.tp_min.dst)) || + (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && + htons(key->tp_range.tp_max.src) <= + htons(key->tp_range.tp_min.src))) return -EINVAL; return 0; @@ -1509,9 +1516,10 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - if (FL_KEY_IS_MASKED(mask, tp) || - FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max)) - FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS_RANGE, tp_range); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -1560,8 +1568,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, fl_mask_copy(newmask, mask); - if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) || - (newmask->key.tp_min.src && newmask->key.tp_max.src)) + if ((newmask->key.tp_range.tp_min.dst && + newmask->key.tp_range.tp_max.dst) || + (newmask->key.tp_range.tp_min.src && + newmask->key.tp_range.tp_max.src)) newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE; err = fl_init_mask_hashtable(newmask); @@ -2159,18 +2169,22 @@ static int fl_dump_key_val(struct sk_buff *skb, static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, struct fl_flow_key *mask) { - if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, - &mask->tp_min.dst, TCA_FLOWER_UNSPEC, - sizeof(key->tp_min.dst)) || - fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX, - &mask->tp_max.dst, TCA_FLOWER_UNSPEC, - sizeof(key->tp_max.dst)) || - fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN, - &mask->tp_min.src, TCA_FLOWER_UNSPEC, - sizeof(key->tp_min.src)) || - fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX, - &mask->tp_max.src, TCA_FLOWER_UNSPEC, - sizeof(key->tp_max.src))) + if (fl_dump_key_val(skb, &key->tp_range.tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, + &mask->tp_range.tp_min.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_min.dst)) || + fl_dump_key_val(skb, &key->tp_range.tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, + &mask->tp_range.tp_max.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_max.dst)) || + fl_dump_key_val(skb, &key->tp_range.tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, + &mask->tp_range.tp_min.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_min.src)) || + fl_dump_key_val(skb, &key->tp_range.tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, + &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_range.tp_max.src))) return -1; return 0; -- cgit v1.2.3-59-g8ed1b From c4e85f73afb6384123e5ef1bba3315b2e3ad031e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 4 Dec 2019 15:35:52 +0100 Subject: net: ipv6: add net argument to ip6_dst_lookup_flow This will be used in the conversion of ipv6_stub to ip6_dst_lookup_flow, as some modules currently pass a net argument without a socket to ip6_dst_lookup. This is equivalent to commit 343d60aada5a ("ipv6: change ipv6_stub_impl.ipv6_dst_lookup to take net argument"). Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- net/dccp/ipv6.c | 6 +++--- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/inet6_connection_sock.c | 4 ++-- net/ipv6/ip6_output.c | 8 ++++---- net/ipv6/raw.c | 2 +- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- net/l2tp/l2tp_ip6.c | 2 +- net/sctp/ipv6.c | 4 ++-- 11 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d04b7abe2a4c..4e95f6df508c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1022,7 +1022,7 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); -struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, +struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 25aab672fc99..1e5e08cc0bfc 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -210,7 +210,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -282,7 +282,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); @@ -912,7 +912,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 60e2ff91a5b3..e84e8b1ffbc7 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -765,7 +765,7 @@ int inet6_sk_rebuild_header(struct sock *sk) &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; sk->sk_err_soft = -PTR_ERR(dst); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 96f939248d2f..390bedde21a5 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -85,7 +85,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 0a0945a5b30d..fe9cb8d1adca 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -48,7 +48,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(fl6)); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) return NULL; @@ -103,7 +103,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) ip6_dst_store(sk, dst, NULL, NULL); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 945508a7cb0f..087304427bbb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1144,19 +1144,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); * It returns a valid dst pointer on success, or a pointer encoded * error code. */ -struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, +struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; - err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); + err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); @@ -1188,7 +1188,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (dst) return dst; - dst = ip6_dst_lookup_flow(sk, fl6, final_dst); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); if (connected && !IS_ERR(dst)) ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a77f6b7d3a7c..dfe5e603ffe1 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -925,7 +925,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 16632e02e9b0..30915f6f31e3 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -235,7 +235,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) goto out_free; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 81f51335e326..df5fd9109696 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -275,7 +275,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -906,7 +906,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 * Underlying function will use this to retrieve the network * namespace */ - dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 802f19aba7e3..d148766f40d1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -615,7 +615,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); - dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index dd860fea0148..bc734cfaa29e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -275,7 +275,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!asoc || saddr) goto out; @@ -328,7 +328,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); - bdst = ip6_dst_lookup_flow(sk, fl6, final_p); + bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; -- cgit v1.2.3-59-g8ed1b From 6c8991f41546c3c472503dff1ea9daaddf9331c2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 4 Dec 2019 15:35:53 +0100 Subject: net: ipv6_stub: use ip6_dst_lookup_flow instead of ip6_dst_lookup ipv6_stub uses the ip6_dst_lookup function to allow other modules to perform IPv6 lookups. However, this function skips the XFRM layer entirely. All users of ipv6_stub->ip6_dst_lookup use ip_route_output_flow (via the ip_route_output_key and ip_route_output helpers) for their IPv4 lookups, which calls xfrm_lookup_route(). This patch fixes this inconsistent behavior by switching the stub to ip6_dst_lookup_flow, which also calls xfrm_lookup_route(). This requires some changes in all the callers, as these two functions take different arguments and have different return types. Fixes: 5f81bd2e5d80 ("ipv6: export a stub for IPv6 symbols used by vxlan") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/infiniband/core/addr.c | 7 +++---- drivers/infiniband/sw/rxe/rxe_net.c | 8 +++++--- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 8 ++++---- drivers/net/geneve.c | 4 +++- drivers/net/vxlan.c | 8 +++----- include/net/ipv6_stubs.h | 6 ++++-- net/core/lwt_bpf.c | 4 +--- net/ipv6/addrconf_core.c | 11 ++++++----- net/ipv6/af_inet6.c | 2 +- net/mpls/af_mpls.c | 7 +++---- net/tipc/udp_media.c | 9 ++++++--- 11 files changed, 39 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 6d7ec371e7b2..606fa6d86685 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -421,16 +421,15 @@ static int addr6_resolve(struct sockaddr *src_sock, (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; - int ret; memset(&fl6, 0, sizeof fl6); fl6.daddr = dst_in->sin6_addr; fl6.saddr = src_in->sin6_addr; fl6.flowi6_oif = addr->bound_dev_if; - ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); - if (ret < 0) - return ret; + dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + return PTR_ERR(dst); if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 5a3474f9351b..312c2fc961c0 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -117,10 +117,12 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, memcpy(&fl6.daddr, daddr, sizeof(*daddr)); fl6.flowi6_proto = IPPROTO_UDP; - if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), - recv_sockets.sk6->sk, &ndst, &fl6))) { + ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), + recv_sockets.sk6->sk, &fl6, + NULL); + if (unlikely(IS_ERR(ndst))) { pr_err_ratelimited("no route to %pI6\n", daddr); - goto put; + return NULL; } if (unlikely(ndst->error)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 6ed87534d314..c754987278a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -297,10 +297,10 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, int ret; - ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, - fl6); - if (ret < 0) - return ret; + dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(mirred_dev), NULL, fl6, + NULL); + if (IS_ERR(dst)) + return PTR_ERR(dst); if (!(*out_ttl)) *out_ttl = ip6_dst_hoplimit(dst); diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3ab24fdccd3b..5c6b7fc04ea6 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -853,7 +853,9 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, if (dst) return dst; } - if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { + dst = ipv6_stub->ipv6_dst_lookup_flow(geneve->net, gs6->sock->sk, fl6, + NULL); + if (IS_ERR(dst)) { netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr); return ERR_PTR(-ENETUNREACH); } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bf04bc2e68c2..4c34375c2e22 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2275,7 +2275,6 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct dst_entry *ndst; struct flowi6 fl6; - int err; if (!sock6) return ERR_PTR(-EIO); @@ -2298,10 +2297,9 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, fl6.fl6_dport = dport; fl6.fl6_sport = sport; - err = ipv6_stub->ipv6_dst_lookup(vxlan->net, - sock6->sock->sk, - &ndst, &fl6); - if (unlikely(err < 0)) { + ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, + &fl6, NULL); + if (unlikely(IS_ERR(ndst))) { netdev_dbg(dev, "no route to %pI6\n", daddr); return ERR_PTR(-ENETUNREACH); } diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 5c93e942c50b..3e7d2c0e79ca 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -24,8 +24,10 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, const struct in6_addr *addr); - int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, - struct dst_entry **dst, struct flowi6 *fl6); + struct dst_entry *(*ipv6_dst_lookup_flow)(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + const struct in6_addr *final_dst); int (*ipv6_route_input)(struct sk_buff *skb); struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 74cfb8b5ab33..99a6de52b21d 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -230,9 +230,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl6.daddr = iph6->daddr; fl6.saddr = iph6->saddr; - err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); - if (unlikely(err)) - goto err; + dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 2fc079284ca4..ea00ce3d4117 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -129,11 +129,12 @@ int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); -static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, - struct dst_entry **u2, - struct flowi6 *u3) +static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + const struct in6_addr *final_dst) { - return -EAFNOSUPPORT; + return ERR_PTR(-EAFNOSUPPORT); } static int eafnosupport_ipv6_route_input(struct sk_buff *skb) @@ -190,7 +191,7 @@ static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) } const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { - .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e84e8b1ffbc7..d727c3b41495 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -946,7 +946,7 @@ static int ipv6_route_input(struct sk_buff *skb) static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, - .ipv6_dst_lookup = ip6_dst_lookup, + .ipv6_dst_lookup_flow = ip6_dst_lookup_flow, .ipv6_route_input = ipv6_route_input, .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c312741df2ce..4701edffb1f7 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -617,16 +617,15 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; - int err; if (!ipv6_stub) return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); - err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6); - if (err) - return ERR_PTR(err); + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + return ERR_CAST(dst); dev = dst->dev; dev_hold(dev); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 86aaa4d3e781..ed113735c019 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -195,10 +195,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, .saddr = src->ipv6, .flowi6_proto = IPPROTO_UDP }; - err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, - &ndst, &fl6); - if (err) + ndst = ipv6_stub->ipv6_dst_lookup_flow(net, + ub->ubsock->sk, + &fl6, NULL); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); goto tx_error; + } dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); -- cgit v1.2.3-59-g8ed1b From d04ac224b1688f005a84f764cfe29844f8e9da08 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Thu, 5 Dec 2019 05:57:22 +0530 Subject: net: Fixed updating of ethertype in skb_mpls_push() The skb_mpls_push was not updating ethertype of an ethernet packet if the packet was originally received from a non ARPHRD_ETHER device. In the below OVS data path flow, since the device corresponding to port 7 is an l3 device (ARPHRD_NONE) the skb_mpls_push function does not update the ethertype of the packet even though the previous push_eth action had added an ethernet header to the packet. recirc_id(0),in_port(7),eth_type(0x0800),ipv4(tos=0/0xfc,ttl=64,frag=no), actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00), push_mpls(label=13,tc=0,ttl=64,bos=1,eth_type=0x8847),4 Fixes: 8822e270d697 ("net: core: move push MPLS functionality from OvS to core helper") Signed-off-by: Martin Varghese Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 4 ++-- net/openvswitch/actions.c | 3 ++- net/sched/act_mpls.c | 3 ++- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5aea72fe8498..e9133bcf0544 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3529,7 +3529,7 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, - int mac_len); + int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 312e80e86898..973a71f4bc89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5484,7 +5484,7 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, * Returns 0 on success, -errno otherwise. */ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, - int mac_len) + int mac_len, bool ethernet) { struct mpls_shim_hdr *lse; int err; @@ -5515,7 +5515,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, lse->label_stack_entry = mpls_lse; skb_postpush_rcsum(skb, lse, MPLS_HLEN); - if (skb->dev && skb->dev->type == ARPHRD_ETHER) + if (ethernet) skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); skb->protocol = mpls_proto; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 91e210061bb3..4c8395462303 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -166,7 +166,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, int err; err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype, - skb->mac_len); + skb->mac_len, + ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); if (err) return err; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index a7d856203af1..be3f215cd027 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -83,7 +83,8 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, break; case TCA_MPLS_ACT_PUSH: new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); - if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len)) + if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, + skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_MODIFY: -- cgit v1.2.3-59-g8ed1b From dbad3408896c3c5722ec9cda065468b3df16c5bf Mon Sep 17 00:00:00 2001 From: John Hurley Date: Thu, 5 Dec 2019 17:03:34 +0000 Subject: net: core: rename indirect block ingress cb function With indirect blocks, a driver can register for callbacks from a device that is does not 'own', for example, a tunnel device. When registering to or unregistering from a new device, a callback is triggered to generate a bind/unbind event. This, in turn, allows the driver to receive any existing rules or to properly clean up installed rules. When first added, it was assumed that all indirect block registrations would be for ingress offloads. However, the NFP driver can, in some instances, support clsact qdisc binds for egress offload. Change the name of the indirect block callback command in flow_offload to remove the 'ingress' identifier from it. While this does not change functionality, a follow up patch will implement a more more generic callback than just those currently just supporting ingress offload. Fixes: 4d12ba42787b ("nfp: flower: allow offloading of matches on 'internal' ports") Signed-off-by: John Hurley Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/flow_offload.h | 15 ++++++------- net/core/flow_offload.c | 45 +++++++++++++++++++-------------------- net/netfilter/nf_tables_offload.c | 6 +++--- net/sched/cls_api.c | 4 ++-- 4 files changed, 34 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 86c567f531f3..c6f7bd22db60 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -380,19 +380,18 @@ static inline void flow_block_init(struct flow_block *flow_block) typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, enum tc_setup_type type, void *type_data); -typedef void flow_indr_block_ing_cmd_t(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command); +typedef void flow_indr_block_cmd_t(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, void *cb_priv, + enum flow_block_command command); -struct flow_indr_block_ing_entry { - flow_indr_block_ing_cmd_t *cb; +struct flow_indr_block_entry { + flow_indr_block_cmd_t *cb; struct list_head list; }; -void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry); +void flow_indr_add_block_cb(struct flow_indr_block_entry *entry); -void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry); +void flow_indr_del_block_cb(struct flow_indr_block_entry *entry); int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index cf52d9c422fa..45b6a59ac124 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -283,7 +283,7 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); -static LIST_HEAD(block_ing_cb_list); +static LIST_HEAD(block_cb_list); static struct rhashtable indr_setup_block_ht; @@ -391,20 +391,19 @@ static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) kfree(indr_block_cb); } -static DEFINE_MUTEX(flow_indr_block_ing_cb_lock); +static DEFINE_MUTEX(flow_indr_block_cb_lock); -static void flow_block_ing_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) +static void flow_block_cmd(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, void *cb_priv, + enum flow_block_command command) { - struct flow_indr_block_ing_entry *entry; + struct flow_indr_block_entry *entry; - mutex_lock(&flow_indr_block_ing_cb_lock); - list_for_each_entry(entry, &block_ing_cb_list, list) { + mutex_lock(&flow_indr_block_cb_lock); + list_for_each_entry(entry, &block_cb_list, list) { entry->cb(dev, cb, cb_priv, command); } - mutex_unlock(&flow_indr_block_ing_cb_lock); + mutex_unlock(&flow_indr_block_cb_lock); } int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, @@ -424,8 +423,8 @@ int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, if (err) goto err_dev_put; - flow_block_ing_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); + flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, + FLOW_BLOCK_BIND); return 0; @@ -464,8 +463,8 @@ void __flow_indr_block_cb_unregister(struct net_device *dev, if (!indr_block_cb) return; - flow_block_ing_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); + flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, + FLOW_BLOCK_UNBIND); flow_indr_block_cb_del(indr_block_cb); flow_indr_block_dev_put(indr_dev); @@ -499,21 +498,21 @@ void flow_indr_block_call(struct net_device *dev, } EXPORT_SYMBOL_GPL(flow_indr_block_call); -void flow_indr_add_block_ing_cb(struct flow_indr_block_ing_entry *entry) +void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) { - mutex_lock(&flow_indr_block_ing_cb_lock); - list_add_tail(&entry->list, &block_ing_cb_list); - mutex_unlock(&flow_indr_block_ing_cb_lock); + mutex_lock(&flow_indr_block_cb_lock); + list_add_tail(&entry->list, &block_cb_list); + mutex_unlock(&flow_indr_block_cb_lock); } -EXPORT_SYMBOL_GPL(flow_indr_add_block_ing_cb); +EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); -void flow_indr_del_block_ing_cb(struct flow_indr_block_ing_entry *entry) +void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) { - mutex_lock(&flow_indr_block_ing_cb_lock); + mutex_lock(&flow_indr_block_cb_lock); list_del(&entry->list); - mutex_unlock(&flow_indr_block_ing_cb_lock); + mutex_unlock(&flow_indr_block_cb_lock); } -EXPORT_SYMBOL_GPL(flow_indr_del_block_ing_cb); +EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); static int __init init_flow_indr_rhashtable(void) { diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 68f17a6921d8..431f3b803bfb 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -588,7 +588,7 @@ static int nft_offload_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct flow_indr_block_ing_entry block_ing_entry = { +static struct flow_indr_block_entry block_ing_entry = { .cb = nft_indr_block_cb, .list = LIST_HEAD_INIT(block_ing_entry.list), }; @@ -605,13 +605,13 @@ int nft_offload_init(void) if (err < 0) return err; - flow_indr_add_block_ing_cb(&block_ing_entry); + flow_indr_add_block_cb(&block_ing_entry); return 0; } void nft_offload_exit(void) { - flow_indr_del_block_ing_cb(&block_ing_entry); + flow_indr_del_block_cb(&block_ing_entry); unregister_netdevice_notifier(&nft_offload_netdev_notifier); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 20d60b8fcb70..75b48083614e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3626,7 +3626,7 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_ing_entry block_ing_entry = { +static struct flow_indr_block_entry block_ing_entry = { .cb = tc_indr_block_get_and_ing_cmd, .list = LIST_HEAD_INIT(block_ing_entry.list), }; @@ -3643,7 +3643,7 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_ing_cb(&block_ing_entry); + flow_indr_add_block_cb(&block_ing_entry); rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); -- cgit v1.2.3-59-g8ed1b From 04d26e7b159a396372646a480f4caa166d1b6720 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Dec 2019 12:38:36 +0100 Subject: tcp: fix rejected syncookies due to stale timestamps If no synflood happens for a long enough period of time, then the synflood timestamp isn't refreshed and jiffies can advance so much that time_after32() can't accurately compare them any more. Therefore, we can end up in a situation where time_after32(now, last_overflow + HZ) returns false, just because these two values are too far apart. In that case, the synflood timestamp isn't updated as it should be, which can trick tcp_synq_no_recent_overflow() into rejecting valid syncookies. For example, let's consider the following scenario on a system with HZ=1000: * The synflood timestamp is 0, either because that's the timestamp of the last synflood or, more commonly, because we're working with a freshly created socket. * We receive a new SYN, which triggers synflood protection. Let's say that this happens when jiffies == 2147484649 (that is, 'synflood timestamp' + HZ + 2^31 + 1). * Then tcp_synq_overflow() doesn't update the synflood timestamp, because time_after32(2147484649, 1000) returns false. With: - 2147484649: the value of jiffies, aka. 'now'. - 1000: the value of 'last_overflow' + HZ. * A bit later, we receive the ACK completing the 3WHS. But cookie_v[46]_check() rejects it because tcp_synq_no_recent_overflow() says that we're not under synflood. That's because time_after32(2147484649, 120000) returns false. With: - 2147484649: the value of jiffies, aka. 'now'. - 120000: the value of 'last_overflow' + TCP_SYNCOOKIE_VALID. Of course, in reality jiffies would have increased a bit, but this condition will last for the next 119 seconds, which is far enough to accommodate for jiffie's growth. Fix this by updating the overflow timestamp whenever jiffies isn't within the [last_overflow, last_overflow + HZ] range. That shouldn't have any performance impact since the update still happens at most once per second. Now we're guaranteed to have fresh timestamps while under synflood, so tcp_synq_no_recent_overflow() can safely use it with time_after32() in such situations. Stale timestamps can still make tcp_synq_no_recent_overflow() return the wrong verdict when not under synflood. This will be handled in the next patch. For 64 bits architectures, the problem was introduced with the conversion of ->tw_ts_recent_stamp to 32 bits integer by commit cca9bab1b72c ("tcp: use monotonic timestamps for PAWS"). The problem has always been there on 32 bits architectures. Fixes: cca9bab1b72c ("tcp: use monotonic timestamps for PAWS") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/time.h | 13 +++++++++++++ include/net/tcp.h | 5 +++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/time.h b/include/linux/time.h index 0760a4f5a15c..8e10b9dbd8c2 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -97,4 +97,17 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) */ #define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) #define time_before32(b, a) time_after32(a, b) + +/** + * time_between32 - check if a 32-bit timestamp is within a given time range + * @t: the time which may be within [l,h] + * @l: the lower bound of the range + * @h: the higher bound of the range + * + * time_before32(t, l, h) returns true if @l <= @t <= @h. All operands are + * treated as 32-bit integers. + * + * Equivalent to !(time_before32(@t, @l) || time_after32(@t, @h)). + */ +#define time_between32(t, l, h) ((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l)) #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 36f195fb576a..7d734ba391fc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -494,14 +494,15 @@ static inline void tcp_synq_overflow(const struct sock *sk) reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); - if (time_after32(now, last_overflow + HZ)) + if (!time_between32(now, last_overflow, + last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - if (time_after32(now, last_overflow + HZ)) + if (!time_between32(now, last_overflow, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } -- cgit v1.2.3-59-g8ed1b From cb44a08f8647fd2e8db5cc9ac27cd8355fa392d8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Dec 2019 12:38:43 +0100 Subject: tcp: tighten acceptance of ACKs not matching a child socket When no synflood occurs, the synflood timestamp isn't updated. Therefore it can be so old that time_after32() can consider it to be in the future. That's a problem for tcp_synq_no_recent_overflow() as it may report that a recent overflow occurred while, in fact, it's just that jiffies has grown past 'last_overflow' + TCP_SYNCOOKIE_VALID + 2^31. Spurious detection of recent overflows lead to extra syncookie verification in cookie_v[46]_check(). At that point, the verification should fail and the packet dropped. But we should have dropped the packet earlier as we didn't even send a syncookie. Let's refine tcp_synq_no_recent_overflow() to report a recent overflow only if jiffies is within the [last_overflow, last_overflow + TCP_SYNCOOKIE_VALID] interval. This way, no spurious recent overflow is reported when jiffies wraps and 'last_overflow' becomes in the future from the point of view of time_after32(). However, if jiffies wraps and enters the [last_overflow, last_overflow + TCP_SYNCOOKIE_VALID] interval (with 'last_overflow' being a stale synflood timestamp), then tcp_synq_no_recent_overflow() still erroneously reports an overflow. In such cases, we have to rely on syncookie verification to drop the packet. We unfortunately have no way to differentiate between a fresh and a stale syncookie timestamp. In practice, using last_overflow as lower bound is problematic. If the synflood timestamp is concurrently updated between the time we read jiffies and the moment we store the timestamp in 'last_overflow', then 'now' becomes smaller than 'last_overflow' and tcp_synq_no_recent_overflow() returns true, potentially dropping a valid syncookie. Reading jiffies after loading the timestamp could fix the problem, but that'd require a memory barrier. Let's just accommodate for potential timestamp growth instead and extend the interval using 'last_overflow - HZ' as lower bound. Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7d734ba391fc..43e04e14c41e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -518,13 +518,23 @@ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); - return time_after32(now, last_overflow + - TCP_SYNCOOKIE_VALID); + return !time_between32(now, last_overflow - HZ, + last_overflow + + TCP_SYNCOOKIE_VALID); } } last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); + + /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, + * then we're under synflood. However, we have to use + * 'last_overflow - HZ' as lower bound. That's because a concurrent + * tcp_synq_overflow() could update .ts_recent_stamp after we read + * jiffies but before we store .ts_recent_stamp into last_overflow, + * which could lead to rejecting a valid syncookie. + */ + return !time_between32(now, last_overflow - HZ, + last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) -- cgit v1.2.3-59-g8ed1b From 721c8dafad26ccfa90ff659ee19755e3377b829d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Dec 2019 12:38:49 +0100 Subject: tcp: Protect accesses to .ts_recent_stamp with {READ,WRITE}_ONCE() Syncookies borrow the ->rx_opt.ts_recent_stamp field to store the timestamp of the last synflood. Protect them with READ_ONCE() and WRITE_ONCE() since reads and writes aren't serialised. Use of .rx_opt.ts_recent_stamp for storing the synflood timestamp was introduced by a0f82f64e269 ("syncookies: remove last_synq_overflow from struct tcp_sock"). But unprotected accesses were already there when timestamp was stored in .last_synq_overflow. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 43e04e14c41e..86b9a8766648 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -501,9 +501,9 @@ static inline void tcp_synq_overflow(const struct sock *sk) } } - last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) - tcp_sk(sk)->rx_opt.ts_recent_stamp = now; + WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ @@ -524,7 +524,7 @@ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) } } - last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use -- cgit v1.2.3-59-g8ed1b From 501a90c945103e8627406763dac418f20f3837b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Dec 2019 20:43:46 -0800 Subject: inet: protect against too small mtu values. syzbot was once again able to crash a host by setting a very small mtu on loopback device. Let's make inetdev_valid_mtu() available in include/net/ip.h, and use it in ip_setup_cork(), so that we protect both ip_append_page() and __ip_append_data() Also add a READ_ONCE() when the device mtu is read. Pairs this lockless read with one WRITE_ONCE() in __dev_set_mtu(), even if other code paths might write over this field. Add a big comment in include/linux/netdevice.h about dev->mtu needing READ_ONCE()/WRITE_ONCE() annotations. Hopefully we will add the missing ones in followup patches. [1] refcount_t: saturated; leaking memory. WARNING: CPU: 0 PID: 9464 at lib/refcount.c:22 refcount_warn_saturate+0x138/0x1f0 lib/refcount.c:22 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 9464 Comm: syz-executor850 Not tainted 5.4.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 panic+0x2e3/0x75c kernel/panic.c:221 __warn.cold+0x2f/0x3e kernel/panic.c:582 report_bug+0x289/0x300 lib/bug.c:195 fixup_bug arch/x86/kernel/traps.c:174 [inline] fixup_bug arch/x86/kernel/traps.c:169 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:267 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:286 invalid_op+0x23/0x30 arch/x86/entry/entry_64.S:1027 RIP: 0010:refcount_warn_saturate+0x138/0x1f0 lib/refcount.c:22 Code: 06 31 ff 89 de e8 c8 f5 e6 fd 84 db 0f 85 6f ff ff ff e8 7b f4 e6 fd 48 c7 c7 e0 71 4f 88 c6 05 56 a6 a4 06 01 e8 c7 a8 b7 fd <0f> 0b e9 50 ff ff ff e8 5c f4 e6 fd 0f b6 1d 3d a6 a4 06 31 ff 89 RSP: 0018:ffff88809689f550 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff815e4336 RDI: ffffed1012d13e9c RBP: ffff88809689f560 R08: ffff88809c50a3c0 R09: fffffbfff15d31b1 R10: fffffbfff15d31b0 R11: ffffffff8ae98d87 R12: 0000000000000001 R13: 0000000000040100 R14: ffff888099041104 R15: ffff888218d96e40 refcount_add include/linux/refcount.h:193 [inline] skb_set_owner_w+0x2b6/0x410 net/core/sock.c:1999 sock_wmalloc+0xf1/0x120 net/core/sock.c:2096 ip_append_page+0x7ef/0x1190 net/ipv4/ip_output.c:1383 udp_sendpage+0x1c7/0x480 net/ipv4/udp.c:1276 inet_sendpage+0xdb/0x150 net/ipv4/af_inet.c:821 kernel_sendpage+0x92/0xf0 net/socket.c:3794 sock_sendpage+0x8b/0xc0 net/socket.c:936 pipe_to_sendpage+0x2da/0x3c0 fs/splice.c:458 splice_from_pipe_feed fs/splice.c:512 [inline] __splice_from_pipe+0x3ee/0x7c0 fs/splice.c:636 splice_from_pipe+0x108/0x170 fs/splice.c:671 generic_splice_sendpage+0x3c/0x50 fs/splice.c:842 do_splice_from fs/splice.c:861 [inline] direct_splice_actor+0x123/0x190 fs/splice.c:1035 splice_direct_to_actor+0x3b4/0xa30 fs/splice.c:990 do_splice_direct+0x1da/0x2a0 fs/splice.c:1078 do_sendfile+0x597/0xd00 fs/read_write.c:1464 __do_sys_sendfile64 fs/read_write.c:1525 [inline] __se_sys_sendfile64 fs/read_write.c:1511 [inline] __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x441409 Code: e8 ac e8 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffb64c4f78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000441409 RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000005 RBP: 0000000000073b8a R08: 0000000000000010 R09: 0000000000000010 R10: 0000000000010001 R11: 0000000000000246 R12: 0000000000402180 R13: 0000000000402210 R14: 0000000000000000 R15: 0000000000000000 Kernel Offset: disabled Rebooting in 86400 seconds.. Fixes: 1470ddf7f8ce ("inet: Remove explicit write references to sk/inet in ip_append_data") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ include/net/ip.h | 5 +++++ net/core/dev.c | 3 ++- net/ipv4/devinet.c | 5 ----- net/ipv4/ip_output.c | 13 ++++++++----- 5 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf0923579af4..9ef20389622d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1881,6 +1881,11 @@ struct net_device { unsigned char if_port; unsigned char dma; + /* Note : dev->mtu is often read without holding a lock. + * Writers usually hold RTNL. + * It is recommended to use READ_ONCE() to annotate the reads, + * and to use WRITE_ONCE() to annotate the writes. + */ unsigned int mtu; unsigned int min_mtu; unsigned int max_mtu; diff --git a/include/net/ip.h b/include/net/ip.h index 02d68e346f67..5b317c9f4470 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -760,4 +760,9 @@ int ip_misc_proc_init(void); int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack); +static inline bool inetdev_valid_mtu(unsigned int mtu) +{ + return likely(mtu >= IPV4_MIN_MTU); +} + #endif /* _IP_H */ diff --git a/net/core/dev.c b/net/core/dev.c index e7c027fb4808..2c277b8aba38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8188,7 +8188,8 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); - dev->mtu = new_mtu; + /* Pairs with all the lockless reads of dev->mtu in the stack */ + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4b5bd4d2c89..e4632bd2026d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1496,11 +1496,6 @@ skip: } } -static bool inetdev_valid_mtu(unsigned int mtu) -{ - return mtu >= IPV4_MIN_MTU; -} - static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9d83cb320dcb..14db1e0b8a6e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1258,15 +1258,18 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->addr = ipc->addr; } - /* - * We steal reference to this route, caller should not release it - */ - *rtp = NULL; cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : rt->dst.dev->mtu; + dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + + if (!inetdev_valid_mtu(cork->fragsize)) + return -ENETUNREACH; cork->gso_size = ipc->gso_size; + cork->dst = &rt->dst; + /* We stole this route, caller should not release it. */ + *rtp = NULL; + cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; -- cgit v1.2.3-59-g8ed1b