aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv6
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv6')
-rw-r--r--net/ipv6/Kconfig115
-rw-r--r--net/ipv6/Makefile15
-rw-r--r--net/ipv6/addrconf.c967
-rw-r--r--net/ipv6/addrconf_core.c22
-rw-r--r--net/ipv6/addrlabel.c27
-rw-r--r--net/ipv6/af_inet6.c215
-rw-r--r--net/ipv6/ah6.c40
-rw-r--r--net/ipv6/anycast.c21
-rw-r--r--net/ipv6/calipso.c23
-rw-r--r--net/ipv6/datagram.c30
-rw-r--r--net/ipv6/esp6.c514
-rw-r--r--net/ipv6/esp6_offload.c95
-rw-r--r--net/ipv6/exthdrs.c442
-rw-r--r--net/ipv6/fib6_rules.c62
-rw-r--r--net/ipv6/fou6.c1
-rw-r--r--net/ipv6/icmp.c211
-rw-r--r--net/ipv6/ila/ila.h5
-rw-r--r--net/ipv6/ila/ila_lwt.c2
-rw-r--r--net/ipv6/ila/ila_main.c2
-rw-r--r--net/ipv6/ila/ila_xlat.c8
-rw-r--r--net/ipv6/inet6_connection_sock.c4
-rw-r--r--net/ipv6/inet6_hashtables.c92
-rw-r--r--net/ipv6/ioam6.c979
-rw-r--r--net/ipv6/ioam6_iptunnel.c477
-rw-r--r--net/ipv6/ip6_fib.c156
-rw-r--r--net/ipv6/ip6_flowlabel.c324
-rw-r--r--net/ipv6/ip6_gre.c262
-rw-r--r--net/ipv6/ip6_icmp.c20
-rw-r--r--net/ipv6/ip6_input.c83
-rw-r--r--net/ipv6/ip6_offload.c101
-rw-r--r--net/ipv6/ip6_output.c469
-rw-r--r--net/ipv6/ip6_tunnel.c451
-rw-r--r--net/ipv6/ip6_udp_tunnel.c9
-rw-r--r--net/ipv6/ip6_vti.c158
-rw-r--r--net/ipv6/ip6mr.c401
-rw-r--r--net/ipv6/ipcomp6.c14
-rw-r--r--net/ipv6/ipv6_sockglue.c817
-rw-r--r--net/ipv6/mcast.c1175
-rw-r--r--net/ipv6/mcast_snoop.c12
-rw-r--r--net/ipv6/mip6.c113
-rw-r--r--net/ipv6/ndisc.c172
-rw-r--r--net/ipv6/netfilter.c25
-rw-r--r--net/ipv6/netfilter/Kconfig19
-rw-r--r--net/ipv6/netfilter/Makefile6
-rw-r--r--net/ipv6/netfilter/ip6_tables.c198
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c39
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c2
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c1
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c3
-rw-r--r--net/ipv6/netfilter/ip6t_frag.c3
-rw-r--r--net/ipv6/netfilter/ip6t_hbh.c3
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c10
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c51
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c50
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c50
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c89
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c51
-rw-r--r--net/ipv6/netfilter/ip6table_security.c48
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c78
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c33
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c37
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c425
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c170
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c10
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c20
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c36
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c5
-rw-r--r--net/ipv6/output_core.c28
-rw-r--r--net/ipv6/ping.c63
-rw-r--r--net/ipv6/proc.c2
-rw-r--r--net/ipv6/raw.c204
-rw-r--r--net/ipv6/reassembly.c22
-rw-r--r--net/ipv6/route.c764
-rw-r--r--net/ipv6/rpl.c124
-rw-r--r--net/ipv6/rpl_iptunnel.c376
-rw-r--r--net/ipv6/seg6.c94
-rw-r--r--net/ipv6/seg6_hmac.c8
-rw-r--r--net/ipv6/seg6_iptunnel.c252
-rw-r--r--net/ipv6/seg6_local.c1395
-rw-r--r--net/ipv6/sit.c505
-rw-r--r--net/ipv6/syncookies.c23
-rw-r--r--net/ipv6/sysctl_net_ipv6.c116
-rw-r--r--net/ipv6/tcp_ipv6.c440
-rw-r--r--net/ipv6/tcpv6_offload.c1
-rw-r--r--net/ipv6/tunnel6.c128
-rw-r--r--net/ipv6/udp.c410
-rw-r--r--net/ipv6/udp_impl.h15
-rw-r--r--net/ipv6/udp_offload.c33
-rw-r--r--net/ipv6/udplite.c16
-rw-r--r--net/ipv6/xfrm6_input.c111
-rw-r--r--net/ipv6/xfrm6_output.c117
-rw-r--r--net/ipv6/xfrm6_policy.c8
-rw-r--r--net/ipv6/xfrm6_protocol.c48
-rw-r--r--net/ipv6/xfrm6_state.c26
-rw-r--r--net/ipv6/xfrm6_tunnel.c17
96 files changed, 10852 insertions, 5070 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ae1344e4cec5..658bfed1df8b 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -7,14 +7,15 @@
menuconfig IPV6
tristate "The IPv6 protocol"
default y
- ---help---
+ select CRYPTO_LIB_SHA1
+ help
Support for IP version 6 (IPv6).
For general information about IPv6, see
<https://en.wikipedia.org/wiki/IPv6>.
For specific information about IPv6 under Linux, see
- Documentation/networking/ipv6.txt and read the HOWTO at
- <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
+ Documentation/networking/ipv6.rst and read the HOWTO at
+ <https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
To compile this protocol support as a module, choose M here: the
module will be called ipv6.
@@ -23,7 +24,7 @@ if IPV6
config IPV6_ROUTER_PREF
bool "IPv6: Router Preference (RFC 4191) support"
- ---help---
+ help
Router Preference is an optional extension to the Router
Advertisement message which improves the ability of hosts
to pick an appropriate router, especially when the hosts
@@ -34,14 +35,14 @@ config IPV6_ROUTER_PREF
config IPV6_ROUTE_INFO
bool "IPv6: Route Information (RFC 4191) support"
depends on IPV6_ROUTER_PREF
- ---help---
+ help
Support of Route Information.
If unsure, say N.
config IPV6_OPTIMISTIC_DAD
bool "IPv6: Enable RFC 4429 Optimistic DAD"
- ---help---
+ help
Support for optimistic Duplicate Address Detection. It allows for
autoconfigured addresses to be used more quickly.
@@ -49,29 +50,31 @@ config IPV6_OPTIMISTIC_DAD
config INET6_AH
tristate "IPv6: AH transformation"
- select XFRM_ALGO
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_SHA1
- ---help---
- Support for IPsec AH.
+ select XFRM_AH
+ help
+ Support for IPsec AH (Authentication Header).
+
+ AH can be used with various authentication algorithms. Besides
+ enabling AH support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
config INET6_ESP
tristate "IPv6: ESP transformation"
- select XFRM_ALGO
- select CRYPTO
- select CRYPTO_AUTHENC
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_CBC
- select CRYPTO_SHA1
- select CRYPTO_DES
- select CRYPTO_ECHAINIV
- ---help---
- Support for IPsec ESP.
+ select XFRM_ESP
+ help
+ Support for IPsec ESP (Encapsulating Security Payload).
+
+ ESP can be used with various encryption and authentication algorithms.
+ Besides enabling ESP support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
@@ -80,7 +83,7 @@ config INET6_ESP_OFFLOAD
depends on INET6_ESP
select XFRM_OFFLOAD
default n
- ---help---
+ help
Support for ESP transformation offload. This makes sense
only if this system really does IPsec and want to do it
with high throughput. A typical desktop system does not
@@ -88,11 +91,23 @@ config INET6_ESP_OFFLOAD
If unsure, say N.
+config INET6_ESPINTCP
+ bool "IPv6: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET6_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ select XFRM_ESPINTCP
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv6 sockets.
+
+ If unsure, say N.
+
config INET6_IPCOMP
tristate "IPv6: IPComp transformation"
select INET6_XFRM_TUNNEL
select XFRM_IPCOMP
- ---help---
+ help
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
@@ -101,7 +116,7 @@ config INET6_IPCOMP
config IPV6_MIP6
tristate "IPv6: Mobility"
select XFRM
- ---help---
+ help
Support for IPv6 Mobility described in RFC 3775.
If unsure, say N.
@@ -111,7 +126,7 @@ config IPV6_ILA
depends on NETFILTER
select DST_CACHE
select LWTUNNEL
- ---help---
+ help
Support for IPv6 Identifier Locator Addressing (ILA).
ILA is a mechanism to do network virtualization without
@@ -141,7 +156,7 @@ tristate "Virtual (secure) IPv6: tunneling"
select IPV6_TUNNEL
select NET_IP_TUNNEL
select XFRM
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This can be used with xfrm mode tunnel to give
@@ -154,7 +169,7 @@ config IPV6_SIT
select NET_IP_TUNNEL
select IPV6_NDISC_NODETYPE
default y
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This driver implements encapsulation of IPv6
@@ -167,7 +182,7 @@ config IPV6_SIT_6RD
bool "IPv6: IPv6 Rapid Deployment (6RD)"
depends on IPV6_SIT
default n
- ---help---
+ help
IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
deploy IPv6 unicast service to IPv4 sites to which it provides
@@ -190,7 +205,7 @@ config IPV6_TUNNEL
select INET6_TUNNEL
select DST_CACHE
select GRO_CELLS
- ---help---
+ help
Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
RFC 2473.
@@ -201,7 +216,7 @@ config IPV6_GRE
select IPV6_TUNNEL
select NET_IP_TUNNEL
depends on NET_IPGRE_DEMUX
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This particular tunneling driver implements
@@ -226,13 +241,13 @@ config IPV6_FOU_TUNNEL
config IPV6_MULTIPLE_TABLES
bool "IPv6: Multiple Routing Tables"
select FIB_RULES
- ---help---
+ help
Support multiple routing tables.
config IPV6_SUBTREES
bool "IPv6: source address based routing"
depends on IPV6_MULTIPLE_TABLES
- ---help---
+ help
Enable routing by source address or prefix.
The destination address is still the primary routing key, so mixing
@@ -247,7 +262,7 @@ config IPV6_MROUTE
bool "IPv6: multicast routing"
depends on IPV6
select IP_MROUTE_COMMON
- ---help---
+ help
Support for IPv6 multicast forwarding.
If unsure, say N.
@@ -268,7 +283,7 @@ config IPV6_MROUTE_MULTIPLE_TABLES
config IPV6_PIMSM_V2
bool "IPv6: PIM-SM version 2 support"
depends on IPV6_MROUTE
- ---help---
+ help
Support for IPv6 PIM multicast routing protocol PIM-SMv2.
If unsure, say N.
@@ -278,7 +293,7 @@ config IPV6_SEG6_LWTUNNEL
select LWTUNNEL
select DST_CACHE
select IPV6_MULTIPLE_TABLES
- ---help---
+ help
Support for encapsulation of packets within an outer IPv6
header and a Segment Routing Header using the lightweight
tunnels mechanism. Also enable support for advanced local
@@ -289,10 +304,11 @@ config IPV6_SEG6_LWTUNNEL
config IPV6_SEG6_HMAC
bool "IPv6: Segment Routing HMAC support"
depends on IPV6
+ select CRYPTO
select CRYPTO_HMAC
select CRYPTO_SHA1
select CRYPTO_SHA256
- ---help---
+ help
Support for HMAC signature generation and verification
of SR-enabled packets.
@@ -303,4 +319,25 @@ config IPV6_SEG6_BPF
depends on IPV6_SEG6_LWTUNNEL
depends on IPV6 = y
+config IPV6_RPL_LWTUNNEL
+ bool "IPv6: RPL Source Routing Header support"
+ depends on IPV6
+ select LWTUNNEL
+ help
+ Support for RFC6554 RPL Source Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
+config IPV6_IOAM6_LWTUNNEL
+ bool "IPv6: IOAM Pre-allocated Trace insertion support"
+ depends on IPV6
+ select LWTUNNEL
+ select DST_CACHE
+ help
+ Support for the insertion of IOAM Pre-allocated Trace
+ Header using the lightweight tunnels mechanism.
+
+ If unsure, say N.
+
endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 8ccf35514015..3036a45e8a1e 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -5,16 +5,14 @@
obj-$(CONFIG_IPV6) += ipv6.o
-ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
+ipv6-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
addrlabel.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
- udp_offload.o seg6.o fib6_notifier.o
+ udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o
-ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
-
-ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
+ipv6-$(CONFIG_SYSCTL) += sysctl_net_ipv6.o
ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
@@ -26,8 +24,8 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-$(CONFIG_NETLABEL) += calipso.o
ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
-
-ipv6-objs += $(ipv6-y)
+ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o
+ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o
obj-$(CONFIG_INET6_AH) += ah6.o
obj-$(CONFIG_INET6_ESP) += esp6.o
@@ -46,7 +44,8 @@ obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
-obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
+obj-$(CONFIG_INET) += output_core.o protocol.o \
+ ip6_offload.o tcpv6_offload.o exthdrs_offload.o
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 46d614b611db..9c3f5202a97b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -89,6 +89,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>
+#include <linux/ioam6.h>
#define INFINITY_LIFE_TIME 0xFFFFFFFF
@@ -103,7 +104,7 @@ static inline u32 cstamp_delta(unsigned long cstamp)
static inline s32 rfc3315_s14_backoff_init(s32 irt)
{
/* multiply 'initial retransmission time' by 0.9 .. 1.1 */
- u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+ u64 tmp = (900000 + prandom_u32_max(200001)) * (u64)irt;
do_div(tmp, 1000000);
return (s32)tmp;
}
@@ -111,11 +112,11 @@ static inline s32 rfc3315_s14_backoff_init(s32 irt)
static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
{
/* multiply 'retransmission timeout' by 1.9 .. 2.1 */
- u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+ u64 tmp = (1900000 + prandom_u32_max(200001)) * (u64)rt;
do_div(tmp, 1000000);
if ((s32)tmp > mrt) {
/* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
- tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+ tmp = (900000 + prandom_u32_max(200001)) * (u64)mrt;
do_div(tmp, 1000000);
}
return (s32)tmp;
@@ -135,8 +136,7 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
}
#endif
-static void ipv6_regen_rndid(struct inet6_dev *idev);
-static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
+static void ipv6_gen_rnd_iid(struct in6_addr *addr);
static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
static int ipv6_count_addresses(const struct inet6_dev *idev);
@@ -146,25 +146,18 @@ static int ipv6_generate_stable_address(struct in6_addr *addr,
#define IN6_ADDR_HSIZE_SHIFT 8
#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT)
-/*
- * Configured unicast address hash table
- */
-static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
-static DEFINE_SPINLOCK(addrconf_hash_lock);
-static void addrconf_verify(void);
-static void addrconf_verify_rtnl(void);
-static void addrconf_verify_work(struct work_struct *);
+static void addrconf_verify(struct net *net);
+static void addrconf_verify_rtnl(struct net *net);
static struct workqueue_struct *addrconf_wq;
-static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
static void addrconf_type_change(struct net_device *dev,
unsigned long event);
-static int addrconf_ifdown(struct net_device *dev, int how);
+static int addrconf_ifdown(struct net_device *dev, bool unregister);
static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
@@ -206,6 +199,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.max_desync_factor = MAX_DESYNC_FACTOR,
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
+ .ra_defrtr_metric = IP6_RT_PRIO_USER,
.accept_ra_from_local = 0,
.accept_ra_min_hop_limit= 1,
.accept_ra_pinfo = 1,
@@ -236,6 +230,11 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
+ .ioam6_enabled = 0,
+ .ioam6_id = IOAM6_DEFAULT_IF_ID,
+ .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
+ .ndisc_evict_nocarrier = 1,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +259,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.max_desync_factor = MAX_DESYNC_FACTOR,
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
+ .ra_defrtr_metric = IP6_RT_PRIO_USER,
.accept_ra_from_local = 0,
.accept_ra_min_hop_limit= 1,
.accept_ra_pinfo = 1,
@@ -290,6 +290,11 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
+ .ioam6_enabled = 0,
+ .ioam6_id = IOAM6_DEFAULT_IF_ID,
+ .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
+ .ndisc_evict_nocarrier = 1,
};
/* Check if link is ready: is it up and is a valid qdisc available */
@@ -330,7 +335,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev)
{
int i;
- idev->stats.ipv6 = alloc_percpu(struct ipstats_mib);
+ idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT);
if (!idev->stats.ipv6)
goto err_ip;
@@ -346,7 +351,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev)
if (!idev->stats.icmpv6dev)
goto err_icmp;
idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!idev->stats.icmpv6msgdev)
goto err_icmpmsg;
@@ -367,10 +372,10 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ASSERT_RTNL();
- if (dev->mtu < IPV6_MIN_MTU)
+ if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
return ERR_PTR(-EINVAL);
- ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
+ ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT);
if (!ndev)
return ERR_PTR(err);
@@ -384,6 +389,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
ndev->cnf.mtu6 = dev->mtu;
+ ndev->ra_mtu = 0;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
if (!ndev->nd_parms) {
kfree(ndev);
@@ -392,23 +398,24 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
if (ndev->cnf.forwarding)
dev_disable_lro(dev);
/* We refer to the device */
- dev_hold(dev);
+ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL);
if (snmp6_alloc_dev(ndev) < 0) {
netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
__func__);
neigh_parms_release(&nd_tbl, ndev->nd_parms);
- dev_put(dev);
+ netdev_put(dev, &ndev->dev_tracker);
kfree(ndev);
return ERR_PTR(err);
}
- if (snmp6_register_dev(ndev) < 0) {
- netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
- __func__, dev->name);
- goto err_release;
+ if (dev != blackhole_netdev) {
+ if (snmp6_register_dev(ndev) < 0) {
+ netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
+ __func__, dev->name);
+ goto err_release;
+ }
}
-
/* One reference from device. */
refcount_set(&ndev->refcnt, 1);
@@ -430,8 +437,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_NONE) {
ndev->cnf.use_tempaddr = -1;
- } else
- ipv6_regen_rndid(ndev);
+ }
ndev->token = in6addr_any;
@@ -440,25 +446,28 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ipv6_mc_init_dev(ndev);
ndev->tstamp = jiffies;
- err = addrconf_sysctl_register(ndev);
- if (err) {
- ipv6_mc_destroy_dev(ndev);
- snmp6_unregister_dev(ndev);
- goto err_release;
+ if (dev != blackhole_netdev) {
+ err = addrconf_sysctl_register(ndev);
+ if (err) {
+ ipv6_mc_destroy_dev(ndev);
+ snmp6_unregister_dev(ndev);
+ goto err_release;
+ }
}
/* protected by rtnl_lock */
rcu_assign_pointer(dev->ip6_ptr, ndev);
- /* Join interface-local all-node multicast group */
- ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
-
- /* Join all-node multicast group */
- ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+ if (dev != blackhole_netdev) {
+ /* Join interface-local all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
- /* Join all-router multicast group if forwarding is set */
- if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
- ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ /* Join all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+ /* Join all-router multicast group if forwarding is set */
+ if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ }
return ndev;
err_release:
@@ -542,7 +551,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
#ifdef CONFIG_IPV6_MROUTE
if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
- devconf->mc_forwarding) < 0)
+ atomic_read(&devconf->mc_forwarding)) < 0)
goto nla_put_failure;
#endif
if ((all || type == NETCONFA_PROXY_NEIGH) &&
@@ -692,8 +701,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
errout:
if (in6_dev)
in6_dev_put(in6_dev);
- if (dev)
- dev_put(dev);
+ dev_put(dev);
return err;
}
@@ -789,6 +797,7 @@ static void dev_forward_change(struct inet6_dev *idev)
{
struct net_device *dev;
struct inet6_ifaddr *ifa;
+ LIST_HEAD(tmp_addr_list);
if (!idev)
return;
@@ -807,14 +816,24 @@ static void dev_forward_change(struct inet6_dev *idev)
}
}
+ read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
if (ifa->flags&IFA_F_TENTATIVE)
continue;
+ list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
+ }
+ read_unlock_bh(&idev->lock);
+
+ while (!list_empty(&tmp_addr_list)) {
+ ifa = list_first_entry(&tmp_addr_list,
+ struct inet6_ifaddr, if_list_aux);
+ list_del(&ifa->if_list_aux);
if (idev->cnf.forwarding)
addrconf_join_anycast(ifa);
else
addrconf_leave_anycast(ifa);
}
+
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_FORWARDING,
dev->ifindex, &idev->cnf);
@@ -1000,9 +1019,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
{
struct inet6_ifaddr *ifp;
- hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
if (!dev || ifp->idev->dev == dev)
return true;
@@ -1013,20 +1030,21 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
{
- unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr);
+ struct net *net = dev_net(dev);
+ unsigned int hash = inet6_addr_hash(net, &ifa->addr);
int err = 0;
- spin_lock(&addrconf_hash_lock);
+ spin_lock(&net->ipv6.addrconf_hash_lock);
/* Ignore adding duplicate addresses on an interface */
- if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) {
+ if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
err = -EEXIST;
} else {
- hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
+ hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
}
- spin_unlock(&addrconf_hash_lock);
+ spin_unlock(&net->ipv6.addrconf_hash_lock);
return err;
}
@@ -1078,7 +1096,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
goto out;
}
- ifa = kzalloc(sizeof(*ifa), gfp_flags);
+ ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT);
if (!ifa) {
err = -ENOBUFS;
goto out;
@@ -1091,10 +1109,6 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
goto out;
}
- if (net->ipv6.devconf_all->disable_policy ||
- idev->cnf.disable_policy)
- f6i->dst_nopolicy = true;
-
neigh_parms_data_state_setall(idev->nd_parms);
ifa->addr = *cfg->pfx;
@@ -1108,6 +1122,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
ifa->prefix_len = cfg->plen;
ifa->rt_priority = cfg->rt_priority;
ifa->flags = cfg->ifa_flags;
+ ifa->ifa_proto = cfg->ifa_proto;
/* No need to add the TENTATIVE flag for addresses with NODAD */
if (!(cfg->ifa_flags & IFA_F_NODAD))
ifa->flags |= IFA_F_TENTATIVE;
@@ -1236,7 +1251,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
ifp->idev->dev, 0, RTF_DEFAULT, true);
if (f6i) {
if (del_rt)
- ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
else {
if (!(f6i->fib6_flags & RTF_EXPIRES))
fib6_set_expires(f6i, expires);
@@ -1250,9 +1265,10 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
static void ipv6_del_addr(struct inet6_ifaddr *ifp)
{
- int state;
enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
+ struct net *net = dev_net(ifp->idev->dev);
unsigned long expires;
+ int state;
ASSERT_RTNL();
@@ -1264,9 +1280,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (state == INET6_IFADDR_STATE_DEAD)
goto out;
- spin_lock_bh(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
hlist_del_init_rcu(&ifp->addr_lst);
- spin_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
write_lock_bh(&ifp->idev->lock);
@@ -1304,29 +1320,21 @@ out:
in6_ifa_put(ifp);
}
-static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
- struct inet6_ifaddr *ift,
- bool block)
+static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block)
{
struct inet6_dev *idev = ifp->idev;
- struct in6_addr addr, *tmpaddr;
unsigned long tmp_tstamp, age;
unsigned long regen_advance;
- struct ifa6_config cfg;
- int ret = 0;
unsigned long now = jiffies;
- long max_desync_factor;
s32 cnf_temp_preferred_lft;
+ struct inet6_ifaddr *ift;
+ struct ifa6_config cfg;
+ long max_desync_factor;
+ struct in6_addr addr;
+ int ret = 0;
write_lock_bh(&idev->lock);
- if (ift) {
- spin_lock_bh(&ift->lock);
- memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
- spin_unlock_bh(&ift->lock);
- tmpaddr = &addr;
- } else {
- tmpaddr = NULL;
- }
+
retry:
in6_dev_hold(idev);
if (idev->cnf.use_tempaddr <= 0) {
@@ -1349,13 +1357,13 @@ retry:
}
in6_ifa_hold(ifp);
memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
- ipv6_try_regen_rndid(idev, tmpaddr);
- memcpy(&addr.s6_addr[8], idev->rndid, 8);
+ ipv6_gen_rnd_iid(&addr);
+
age = (now - ifp->tstamp) / HZ;
regen_advance = idev->cnf.regen_max_retry *
idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
/* recalculate max_desync_factor each time and update
* idev->desync_factor if it's larger
@@ -1415,7 +1423,6 @@ retry:
in6_ifa_put(ifp);
in6_dev_put(idev);
pr_info("%s: retry temporary address regeneration\n", __func__);
- tmpaddr = &addr;
write_lock_bh(&idev->lock);
goto retry;
}
@@ -1837,8 +1844,8 @@ out:
}
EXPORT_SYMBOL(ipv6_dev_get_saddr);
-int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
- u32 banned_flags)
+static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
+ u32 banned_flags)
{
struct inet6_ifaddr *ifp;
int err = -EADDRNOTAVAIL;
@@ -1902,12 +1909,13 @@ EXPORT_SYMBOL(ipv6_chk_addr);
* 2. does the address exist on the specific device
* (skip_dev_check = false)
*/
-int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
- const struct net_device *dev, bool skip_dev_check,
- int strict, u32 banned_flags)
+static struct net_device *
+__ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, bool skip_dev_check,
+ int strict, u32 banned_flags)
{
unsigned int hash = inet6_addr_hash(net, addr);
- const struct net_device *l3mdev;
+ struct net_device *l3mdev, *ndev;
struct inet6_ifaddr *ifp;
u32 ifp_flags;
@@ -1917,11 +1925,10 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
if (skip_dev_check)
dev = NULL;
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+ ndev = ifp->idev->dev;
- if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
+ if (l3mdev_master_dev_rcu(ndev) != l3mdev)
continue;
/* Decouple optimistic from tentative for evaluation here.
@@ -1932,15 +1939,23 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
: ifp->flags;
if (ipv6_addr_equal(&ifp->addr, addr) &&
!(ifp_flags&banned_flags) &&
- (!dev || ifp->idev->dev == dev ||
+ (!dev || ndev == dev ||
!(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
rcu_read_unlock();
- return 1;
+ return ndev;
}
}
rcu_read_unlock();
- return 0;
+ return NULL;
+}
+
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, bool skip_dev_check,
+ int strict, u32 banned_flags)
+{
+ return __ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check,
+ strict, banned_flags) ? 1 : 0;
}
EXPORT_SYMBOL(ipv6_chk_addr_and_flags);
@@ -1992,6 +2007,22 @@ int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
}
EXPORT_SYMBOL(ipv6_chk_prefix);
+/**
+ * ipv6_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @dev: used to find the L3 domain of interest
+ *
+ * The caller should be protected by RCU, or RTNL.
+ */
+struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ return __ipv6_chk_addr_and_flags(net, addr, dev, !dev, 1,
+ IFA_F_TENTATIVE);
+}
+EXPORT_SYMBOL(ipv6_dev_find);
+
struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
struct net_device *dev, int strict)
{
@@ -1999,9 +2030,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
struct inet6_ifaddr *ifp, *result = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
if (!dev || ifp->idev->dev == dev ||
!(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
@@ -2030,7 +2059,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
if (ifpub) {
in6_ifa_hold(ifpub);
spin_unlock_bh(&ifp->lock);
- ipv6_create_tempaddr(ifpub, ifp, true);
+ ipv6_create_tempaddr(ifpub, true);
in6_ifa_put(ifpub);
} else {
spin_unlock_bh(&ifp->lock);
@@ -2068,7 +2097,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
- struct net *net = dev_net(ifp->idev->dev);
+ struct net *net = dev_net(idev->dev);
if (addrconf_dad_end(ifp)) {
in6_ifa_put(ifp);
@@ -2211,12 +2240,12 @@ static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
{
- union fwnet_hwaddr *ha;
+ const union fwnet_hwaddr *ha;
if (dev->addr_len != FWNET_ALEN)
return -1;
- ha = (union fwnet_hwaddr *)dev->dev_addr;
+ ha = (const union fwnet_hwaddr *)dev->dev_addr;
memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id));
eui[0] ^= 2;
@@ -2327,40 +2356,38 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
return err;
}
-/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
-static void ipv6_regen_rndid(struct inet6_dev *idev)
+/* Generation of a randomized Interface Identifier
+ * draft-ietf-6man-rfc4941bis, Section 3.3.1
+ */
+
+static void ipv6_gen_rnd_iid(struct in6_addr *addr)
{
regen:
- get_random_bytes(idev->rndid, sizeof(idev->rndid));
- idev->rndid[0] &= ~0x02;
+ get_random_bytes(&addr->s6_addr[8], 8);
- /*
- * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
- * check if generated address is not inappropriate
+ /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1:
+ * check if generated address is not inappropriate:
*
- * - Reserved subnet anycast (RFC 2526)
- * 11111101 11....11 1xxxxxxx
- * - ISATAP (RFC4214) 6.1
- * 00-00-5E-FE-xx-xx-xx-xx
- * - value 0
- * - XXX: already assigned to an address on the device
+ * - Reserved IPv6 Interface Identifiers
+ * - XXX: already assigned to an address on the device
*/
- if (idev->rndid[0] == 0xfd &&
- (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff &&
- (idev->rndid[7]&0x80))
+
+ /* Subnet-router anycast: 0000:0000:0000:0000 */
+ if (!(addr->s6_addr32[2] | addr->s6_addr32[3]))
goto regen;
- if ((idev->rndid[0]|idev->rndid[1]) == 0) {
- if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
- goto regen;
- if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
- goto regen;
- }
-}
-static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
-{
- if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
- ipv6_regen_rndid(idev);
+ /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212
+ * Proxy Mobile IPv6: 0200:5EFF:FE00:5213
+ * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF
+ */
+ if (ntohl(addr->s6_addr32[2]) == 0x02005eff &&
+ (ntohl(addr->s6_addr32[3]) & 0Xff000000) == 0xfe000000)
+ goto regen;
+
+ /* Reserved subnet anycast addresses */
+ if (ntohl(addr->s6_addr32[2]) == 0xfdffffff &&
+ ntohl(addr->s6_addr32[3]) >= 0Xffffff80)
+ goto regen;
}
/*
@@ -2452,8 +2479,9 @@ static void addrconf_add_mroute(struct net_device *dev)
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
.fc_flags = RTF_UP,
- .fc_type = RTN_UNICAST,
+ .fc_type = RTN_MULTICAST,
.fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
};
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
@@ -2542,7 +2570,7 @@ static void manage_tempaddrs(struct inet6_dev *idev,
* no temporary address currently exists.
*/
read_unlock_bh(&idev->lock);
- ipv6_create_tempaddr(ifp, NULL, false);
+ ipv6_create_tempaddr(ifp, false);
} else {
read_unlock_bh(&idev->lock);
}
@@ -2573,6 +2601,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
.valid_lft = valid_lft,
.preferred_lft = prefered_lft,
.scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+ .ifa_proto = IFAPROT_KERNEL_RA
};
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
@@ -2648,7 +2677,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
create, now);
in6_ifa_put(ifp);
- addrconf_verify();
+ addrconf_verify(net);
}
return 0;
@@ -2729,7 +2758,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
if (rt) {
/* Autoconf prefix route */
if (valid_lft == 0) {
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
rt = NULL;
} else if (addrconf_finite_timeout(rt_expires)) {
/* not infinity */
@@ -2807,6 +2836,33 @@ put:
in6_dev_put(in6_dev);
}
+static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev,
+ struct in6_ifreq *ireq)
+{
+ struct ip_tunnel_parm p = { };
+ int err;
+
+ if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4))
+ return -EADDRNOTAVAIL;
+
+ p.iph.daddr = ireq->ifr6_addr.s6_addr32[3];
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPV6;
+ p.iph.ttl = 64;
+
+ if (!dev->netdev_ops->ndo_tunnel_ctl)
+ return -EOPNOTSUPP;
+ err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL);
+ if (err)
+ return err;
+
+ dev = __dev_get_by_name(net, p.name);
+ if (!dev)
+ return -ENOBUFS;
+ return dev_open(dev, NULL);
+}
+
/*
* Set destination address.
* Special case for SIT interfaces where we create a new "virtual"
@@ -2814,61 +2870,19 @@ put:
*/
int addrconf_set_dstaddr(struct net *net, void __user *arg)
{
- struct in6_ifreq ireq;
struct net_device *dev;
- int err = -EINVAL;
-
- rtnl_lock();
+ struct in6_ifreq ireq;
+ int err = -ENODEV;
- err = -EFAULT;
+ if (!IS_ENABLED(CONFIG_IPV6_SIT))
+ return -ENODEV;
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
- goto err_exit;
+ return -EFAULT;
+ rtnl_lock();
dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
-
- err = -ENODEV;
- if (!dev)
- goto err_exit;
-
-#if IS_ENABLED(CONFIG_IPV6_SIT)
- if (dev->type == ARPHRD_SIT) {
- const struct net_device_ops *ops = dev->netdev_ops;
- struct ifreq ifr;
- struct ip_tunnel_parm p;
-
- err = -EADDRNOTAVAIL;
- if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4))
- goto err_exit;
-
- memset(&p, 0, sizeof(p));
- p.iph.daddr = ireq.ifr6_addr.s6_addr32[3];
- p.iph.saddr = 0;
- p.iph.version = 4;
- p.iph.ihl = 5;
- p.iph.protocol = IPPROTO_IPV6;
- p.iph.ttl = 64;
- ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
-
- if (ops->ndo_do_ioctl) {
- mm_segment_t oldfs = get_fs();
-
- set_fs(KERNEL_DS);
- err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
- set_fs(oldfs);
- } else
- err = -EOPNOTSUPP;
-
- if (err == 0) {
- err = -ENOBUFS;
- dev = __dev_get_by_name(net, p.name);
- if (!dev)
- goto err_exit;
- err = dev_open(dev, NULL);
- }
- }
-#endif
-
-err_exit:
+ if (dev && dev->type == ARPHRD_SIT)
+ err = addrconf_set_sit_dstaddr(net, dev, &ireq);
rtnl_unlock();
return err;
}
@@ -2975,7 +2989,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
manage_tempaddrs(idev, ifp, cfg->valid_lft,
cfg->preferred_lft, true, jiffies);
in6_ifa_put(ifp);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
return 0;
} else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
@@ -3015,7 +3029,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
manage_tempaddrs(idev, ifp, 0, 0, false,
jiffies);
ipv6_del_addr(ifp);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
if (ipv6_addr_is_multicast(pfx)) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk,
false, pfx, dev->ifindex);
@@ -3072,7 +3086,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg)
}
static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
- int plen, int scope)
+ int plen, int scope, u8 proto)
{
struct inet6_ifaddr *ifp;
struct ifa6_config cfg = {
@@ -3081,7 +3095,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
.ifa_flags = IFA_F_PERMANENT,
.valid_lft = INFINITY_LIFE_TIME,
.preferred_lft = INFINITY_LIFE_TIME,
- .scope = scope
+ .scope = scope,
+ .ifa_proto = proto
};
ifp = ipv6_add_addr(idev, &cfg, true, NULL);
@@ -3095,21 +3110,27 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
}
}
-#if IS_ENABLED(CONFIG_IPV6_SIT)
-static void sit_add_v4_addrs(struct inet6_dev *idev)
+#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
+static void add_v4_addrs(struct inet6_dev *idev)
{
struct in6_addr addr;
struct net_device *dev;
struct net *net = dev_net(idev->dev);
- int scope, plen;
+ int scope, plen, offset = 0;
u32 pflags = 0;
ASSERT_RTNL();
memset(&addr, 0, sizeof(struct in6_addr));
- memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);
+ /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */
+ if (idev->dev->addr_len == sizeof(struct in6_addr))
+ offset = sizeof(struct in6_addr) - 4;
+ memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4);
if (idev->dev->flags&IFF_POINTOPOINT) {
+ if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE)
+ return;
+
addr.s6_addr32[0] = htonl(0xfe800000);
scope = IFA_LINK;
plen = 64;
@@ -3120,7 +3141,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
}
if (addr.s6_addr32[3]) {
- add_addr(idev, &addr, plen, scope);
+ add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC);
addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
GFP_KERNEL);
return;
@@ -3143,7 +3164,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
flag |= IFA_HOST;
}
- add_addr(idev, &addr, plen, flag);
+ add_addr(idev, &addr, plen, flag,
+ IFAPROT_UNSPEC);
addrconf_prefix_route(&addr, plen, 0, idev->dev,
0, pflags, GFP_KERNEL);
}
@@ -3166,7 +3188,7 @@ static void init_loopback(struct net_device *dev)
return;
}
- add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
+ add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFAPROT_KERNEL_LO);
}
void addrconf_add_linklocal(struct inet6_dev *idev,
@@ -3178,7 +3200,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
.ifa_flags = flags | IFA_F_PERMANENT,
.valid_lft = INFINITY_LIFE_TIME,
.preferred_lft = INFINITY_LIFE_TIME,
- .scope = IFA_LINK
+ .scope = IFA_LINK,
+ .ifa_proto = IFAPROT_KERNEL_LL
};
struct inet6_ifaddr *ifp;
@@ -3220,11 +3243,11 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
const struct inet6_dev *idev)
{
static DEFINE_SPINLOCK(lock);
- static __u32 digest[SHA_DIGEST_WORDS];
- static __u32 workspace[SHA_WORKSPACE_WORDS];
+ static __u32 digest[SHA1_DIGEST_WORDS];
+ static __u32 workspace[SHA1_WORKSPACE_WORDS];
static union {
- char __data[SHA_MESSAGE_BYTES];
+ char __data[SHA1_BLOCK_SIZE];
struct {
struct in6_addr secret;
__be32 prefix[2];
@@ -3249,7 +3272,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
retry:
spin_lock_bh(&lock);
- sha_init(digest);
+ sha1_init(digest);
memset(&data, 0, sizeof(data));
memset(workspace, 0, sizeof(workspace));
memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
@@ -3258,7 +3281,7 @@ retry:
data.secret = secret;
data.dad_count = dad_count;
- sha_transform(digest, data.__data, workspace);
+ sha1_transform(digest, data.__data, workspace);
temp = *address;
temp.s6_addr32[2] = (__force __be32)digest[0];
@@ -3296,12 +3319,16 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (netif_is_l3_master(idev->dev))
return;
+ /* no link local addresses on devices flagged as slaves */
+ if (idev->dev->flags & IFF_SLAVE)
+ return;
+
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
switch (idev->cnf.addr_gen_mode) {
case IN6_ADDR_GEN_MODE_RANDOM:
ipv6_gen_mode_random_init(idev);
- /* fallthrough */
+ fallthrough;
case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
if (!ipv6_generate_stable_address(&addr, 0, idev))
addrconf_add_linklocal(idev, &addr,
@@ -3341,8 +3368,6 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_IEEE1394) &&
(dev->type != ARPHRD_TUNNEL6) &&
(dev->type != ARPHRD_6LOWPAN) &&
- (dev->type != ARPHRD_IP6GRE) &&
- (dev->type != ARPHRD_IPGRE) &&
(dev->type != ARPHRD_TUNNEL) &&
(dev->type != ARPHRD_NONE) &&
(dev->type != ARPHRD_RAWIP)) {
@@ -3390,14 +3415,14 @@ static void addrconf_sit_config(struct net_device *dev)
return;
}
- sit_add_v4_addrs(idev);
+ add_v4_addrs(idev);
if (dev->flags&IFF_POINTOPOINT)
addrconf_add_mroute(dev);
}
#endif
-#if IS_ENABLED(CONFIG_NET_IPGRE)
+#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
static void addrconf_gre_config(struct net_device *dev)
{
struct inet6_dev *idev;
@@ -3410,7 +3435,13 @@ static void addrconf_gre_config(struct net_device *dev)
return;
}
- addrconf_addr_gen(idev, true);
+ if (dev->type == ARPHRD_ETHER) {
+ addrconf_addr_gen(idev, true);
+ return;
+ }
+
+ add_v4_addrs(idev);
+
if (dev->flags & IFF_POINTOPOINT)
addrconf_add_mroute(dev);
}
@@ -3523,16 +3554,18 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
run_pending = 1;
-
- /* fall through */
-
+ fallthrough;
case NETDEV_UP:
case NETDEV_CHANGE:
- if (dev->flags & IFF_SLAVE)
+ if (idev && idev->cnf.disable_ipv6)
break;
- if (idev && idev->cnf.disable_ipv6)
+ if (dev->flags & IFF_SLAVE) {
+ if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) &&
+ dev->flags & IFF_UP && dev->flags & IFF_MULTICAST)
+ ipv6_mc_up(idev);
break;
+ }
if (event == NETDEV_UP) {
/* restore routes for permanent addresses */
@@ -3588,7 +3621,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
addrconf_sit_config(dev);
break;
#endif
-#if IS_ENABLED(CONFIG_NET_IPGRE)
+#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
+ case ARPHRD_IP6GRE:
case ARPHRD_IPGRE:
addrconf_gre_config(dev);
break;
@@ -3667,7 +3701,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
* an L3 master device (e.g., VRF)
*/
if (info->upper_dev && netif_is_l3_master(info->upper_dev))
- addrconf_ifdown(dev, 0);
+ addrconf_ifdown(dev, false);
}
return NOTIFY_OK;
@@ -3700,13 +3734,15 @@ static bool addr_is_local(const struct in6_addr *addr)
(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
-static int addrconf_ifdown(struct net_device *dev, int how)
+static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
- unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN;
+ unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa, *tmp;
+ struct inet6_ifaddr *ifa;
+ LIST_HEAD(tmp_addr_list);
bool keep_addr = false;
+ bool was_ready;
int state, i;
ASSERT_RTNL();
@@ -3721,7 +3757,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
* Step 1: remove reference to ipv6 device from parent device.
* Do not dev_put!
*/
- if (how) {
+ if (unregister) {
idev->dead = 1;
/* protected by rtnl_lock */
@@ -3735,7 +3771,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
/* combine the user config with event to determine if permanent
* addresses are to be removed from address hash table
*/
- if (!how && !idev->cnf.disable_ipv6) {
+ if (!unregister && !idev->cnf.disable_ipv6) {
/* aggregate the system setting and interface setting */
int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
@@ -3747,9 +3783,9 @@ static int addrconf_ifdown(struct net_device *dev, int how)
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
- struct hlist_head *h = &inet6_addr_lst[i];
+ struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
- spin_lock_bh(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
restart:
hlist_for_each_entry_rcu(ifa, h, addr_lst) {
if (ifa->idev == idev) {
@@ -3765,15 +3801,18 @@ restart:
}
}
}
- spin_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
}
write_lock_bh(&idev->lock);
addrconf_del_rs_timer(idev);
- /* Step 2: clear flags for stateless addrconf */
- if (!how)
+ /* Step 2: clear flags for stateless addrconf, repeated down
+ * detection
+ */
+ was_ready = idev->if_flags & IF_READY;
+ if (!unregister)
idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
/* Step 3: clear tempaddr list */
@@ -3793,16 +3832,23 @@ restart:
write_lock_bh(&idev->lock);
}
- list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+ list_for_each_entry(ifa, &idev->addr_list, if_list)
+ list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
+ write_unlock_bh(&idev->lock);
+
+ while (!list_empty(&tmp_addr_list)) {
struct fib6_info *rt = NULL;
bool keep;
+ ifa = list_first_entry(&tmp_addr_list,
+ struct inet6_ifaddr, if_list_aux);
+ list_del(&ifa->if_list_aux);
+
addrconf_del_dad_work(ifa);
keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
!addr_is_local(&ifa->addr);
- write_unlock_bh(&idev->lock);
spin_lock_bh(&ifa->lock);
if (keep) {
@@ -3822,7 +3868,7 @@ restart:
spin_unlock_bh(&ifa->lock);
if (rt)
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
@@ -3833,27 +3879,27 @@ restart:
addrconf_leave_solict(ifa->idev, &ifa->addr);
}
- write_lock_bh(&idev->lock);
if (!keep) {
+ write_lock_bh(&idev->lock);
list_del_rcu(&ifa->if_list);
+ write_unlock_bh(&idev->lock);
in6_ifa_put(ifa);
}
}
- write_unlock_bh(&idev->lock);
-
/* Step 5: Discard anycast and multicast list */
- if (how) {
+ if (unregister) {
ipv6_ac_destroy_dev(idev);
ipv6_mc_destroy_dev(idev);
- } else {
+ } else if (was_ready) {
ipv6_mc_down(idev);
}
idev->tstamp = jiffies;
+ idev->ra_mtu = 0;
/* Last: Shot the device (if unregistered) */
- if (how) {
+ if (unregister) {
addrconf_sysctl_unregister(idev);
neigh_parms_release(&nd_tbl, idev->nd_parms);
neigh_ifdown(&nd_tbl, dev);
@@ -3921,7 +3967,7 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
if (ifp->flags & IFA_F_OPTIMISTIC)
rand_num = 0;
else
- rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+ rand_num = prandom_u32_max(idev->cnf.rtr_solicit_delay ?: 1);
nonce = 0;
if (idev->cnf.enhanced_dad ||
@@ -3944,8 +3990,6 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
addrconf_join_solict(dev, &ifp->addr);
- prandom_seed((__force u32) ifp->addr.s6_addr32[3]);
-
read_lock_bh(&idev->lock);
spin_lock(&ifp->lock);
if (ifp->state == INET6_IFADDR_STATE_DEAD)
@@ -4075,7 +4119,7 @@ static void addrconf_dad_work(struct work_struct *w)
in6_ifa_hold(ifp);
addrconf_dad_stop(ifp, 1);
if (disable_ipv6)
- addrconf_ifdown(idev->dev, 0);
+ addrconf_ifdown(idev->dev, false);
goto out;
}
@@ -4117,7 +4161,8 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_probes--;
addrconf_mod_dad_work(ifp,
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME));
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
+ HZ/100));
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
@@ -4172,7 +4217,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
send_rs = send_mld &&
ipv6_accept_ra(ifp->idev) &&
ifp->idev->cnf.rtr_solicits != 0 &&
- (dev->flags&IFF_LOOPBACK) == 0;
+ (dev->flags & IFF_LOOPBACK) == 0 &&
+ (dev->type != ARPHRD_TUNNEL);
read_unlock_bh(&ifp->idev->lock);
/* While dad is in progress mld report's source address is in6_addrany.
@@ -4219,7 +4265,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
* before this temporary address becomes deprecated.
*/
if (ifp->flags & IFA_F_TEMPORARY)
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(dev_net(dev));
}
static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
@@ -4261,10 +4307,8 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
}
for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
- hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket],
+ hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket],
addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
/* sync with offset */
if (p < state->offset) {
p++;
@@ -4287,8 +4331,6 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
state->offset++;
return ifa;
}
@@ -4296,9 +4338,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
state->offset = 0;
while (++state->bucket < IN6_ADDR_HSIZE) {
hlist_for_each_entry_rcu(ifa,
- &inet6_addr_lst[state->bucket], addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
+ &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
return ifa;
}
}
@@ -4386,9 +4426,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
int ret = 0;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr) &&
(ifp->flags & IFA_F_HOMEADDRESS)) {
ret = 1;
@@ -4400,11 +4438,62 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
}
#endif
+/* RFC6554 has some algorithm to avoid loops in segment routing by
+ * checking if the segments contains any of a local interface address.
+ *
+ * Quote:
+ *
+ * To detect loops in the SRH, a router MUST determine if the SRH
+ * includes multiple addresses assigned to any interface on that router.
+ * If such addresses appear more than once and are separated by at least
+ * one address not assigned to that router.
+ */
+int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
+ unsigned char nsegs)
+{
+ const struct in6_addr *addr;
+ int i, ret = 0, found = 0;
+ struct inet6_ifaddr *ifp;
+ bool separated = false;
+ unsigned int hash;
+ bool hash_found;
+
+ rcu_read_lock();
+ for (i = 0; i < nsegs; i++) {
+ addr = &segs[i];
+ hash = inet6_addr_hash(net, addr);
+
+ hash_found = false;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ hash_found = true;
+ break;
+ }
+ }
+
+ if (hash_found) {
+ if (found > 1 && separated) {
+ ret = 1;
+ break;
+ }
+
+ separated = false;
+ found++;
+ } else {
+ separated = true;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* Periodic address status verification
*/
-static void addrconf_verify_rtnl(void)
+static void addrconf_verify_rtnl(struct net *net)
{
unsigned long now, next, next_sec, next_sched;
struct inet6_ifaddr *ifp;
@@ -4416,11 +4505,11 @@ static void addrconf_verify_rtnl(void)
now = jiffies;
next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
- cancel_delayed_work(&addr_chk_work);
+ cancel_delayed_work(&net->ipv6.addr_chk_work);
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
restart:
- hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) {
+ hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
unsigned long age;
/* When setting preferred_lft to a value not zero or
@@ -4435,11 +4524,46 @@ restart:
/* We try to batch several events at once. */
age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+ if ((ifp->flags&IFA_F_TEMPORARY) &&
+ !(ifp->flags&IFA_F_TENTATIVE) &&
+ ifp->prefered_lft != INFINITY_LIFE_TIME &&
+ !ifp->regen_count && ifp->ifpub) {
+ /* This is a non-regenerated temporary addr. */
+
+ unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
+ ifp->idev->cnf.dad_transmits *
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
+
+ if (age + regen_advance >= ifp->prefered_lft) {
+ struct inet6_ifaddr *ifpub = ifp->ifpub;
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+
+ ifp->regen_count++;
+ in6_ifa_hold(ifp);
+ in6_ifa_hold(ifpub);
+ spin_unlock(&ifp->lock);
+
+ spin_lock(&ifpub->lock);
+ ifpub->regen_count = 0;
+ spin_unlock(&ifpub->lock);
+ rcu_read_unlock_bh();
+ ipv6_create_tempaddr(ifpub, true);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
+ rcu_read_lock_bh();
+ goto restart;
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+ }
+
if (ifp->valid_lft != INFINITY_LIFE_TIME &&
age >= ifp->valid_lft) {
spin_unlock(&ifp->lock);
in6_ifa_hold(ifp);
+ rcu_read_unlock_bh();
ipv6_del_addr(ifp);
+ rcu_read_lock_bh();
goto restart;
} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
spin_unlock(&ifp->lock);
@@ -4466,35 +4590,6 @@ restart:
in6_ifa_put(ifp);
goto restart;
}
- } else if ((ifp->flags&IFA_F_TEMPORARY) &&
- !(ifp->flags&IFA_F_TENTATIVE)) {
- unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
- ifp->idev->cnf.dad_transmits *
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ;
-
- if (age >= ifp->prefered_lft - regen_advance) {
- struct inet6_ifaddr *ifpub = ifp->ifpub;
- if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ;
- if (!ifp->regen_count && ifpub) {
- ifp->regen_count++;
- in6_ifa_hold(ifp);
- in6_ifa_hold(ifpub);
- spin_unlock(&ifp->lock);
-
- spin_lock(&ifpub->lock);
- ifpub->regen_count = 0;
- spin_unlock(&ifpub->lock);
- rcu_read_unlock_bh();
- ipv6_create_tempaddr(ifpub, ifp, true);
- in6_ifa_put(ifpub);
- in6_ifa_put(ifp);
- rcu_read_lock_bh();
- goto restart;
- }
- } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
- spin_unlock(&ifp->lock);
} else {
/* ifp->prefered_lft <= ifp->valid_lft */
if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
@@ -4517,20 +4612,23 @@ restart:
pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
now, next, next_sec, next_sched);
- mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
rcu_read_unlock_bh();
}
static void addrconf_verify_work(struct work_struct *w)
{
+ struct net *net = container_of(to_delayed_work(w), struct net,
+ ipv6.addr_chk_work);
+
rtnl_lock();
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
rtnl_unlock();
}
-static void addrconf_verify(void)
+static void addrconf_verify(struct net *net)
{
- mod_delayed_work(addrconf_wq, &addr_chk_work, 0);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
}
static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
@@ -4559,6 +4657,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_FLAGS] = { .len = sizeof(u32) },
[IFA_RT_PRIORITY] = { .len = sizeof(u32) },
[IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFA_PROTO] = { .type = NLA_U8 },
};
static int
@@ -4607,7 +4706,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
if (f6i->fib6_metric != prio) {
/* delete old one */
- ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
/* add new one */
addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
@@ -4626,7 +4725,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
return 0;
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
+static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
+ struct ifa6_config *cfg)
{
u32 flags;
clock_t expires;
@@ -4683,6 +4783,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
ifp->tstamp = jiffies;
ifp->valid_lft = cfg->valid_lft;
ifp->prefered_lft = cfg->preferred_lft;
+ ifp->ifa_proto = cfg->ifa_proto;
if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
ifp->rt_priority = cfg->rt_priority;
@@ -4740,7 +4841,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
jiffies);
}
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
return 0;
}
@@ -4776,6 +4877,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[IFA_RT_PRIORITY])
cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+ if (tb[IFA_PROTO])
+ cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+
cfg.valid_lft = INFINITY_LIFE_TIME;
cfg.preferred_lft = INFINITY_LIFE_TIME;
@@ -4827,7 +4931,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
!(nlh->nlmsg_flags & NLM_F_REPLACE))
err = -EEXIST;
else
- err = inet6_addr_modify(ifa, &cfg);
+ err = inet6_addr_modify(net, ifa, &cfg);
in6_ifa_put(ifa);
@@ -4879,6 +4983,7 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(16) /* IFA_ADDRESS */
+ nla_total_size(sizeof(struct ifa_cacheinfo))
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(1) /* IFA_PROTO */
+ nla_total_size(4) /* IFA_RT_PRIORITY */;
}
@@ -4916,6 +5021,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
goto error;
+ spin_lock_bh(&ifa->lock);
if (!((ifa->flags&IFA_F_PERMANENT) &&
(ifa->prefered_lft == INFINITY_LIFE_TIME))) {
preferred = ifa->prefered_lft;
@@ -4937,6 +5043,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
preferred = INFINITY_LIFE_TIME;
valid = INFINITY_LIFE_TIME;
}
+ spin_unlock_bh(&ifa->lock);
if (!ipv6_addr_any(&ifa->peer_addr)) {
if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 ||
@@ -4956,6 +5063,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0)
goto error;
+ if (ifa->ifa_proto &&
+ nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto))
+ goto error;
+
nlmsg_end(skb, nlh);
return 0;
@@ -4980,8 +5091,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
return -EMSGSIZE;
if (args->netnsid >= 0 &&
- nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+ }
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
@@ -5012,8 +5125,10 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
return -EMSGSIZE;
if (args->netnsid >= 0 &&
- nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+ }
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
@@ -5057,17 +5172,20 @@ next:
break;
}
case MULTICAST_ADDR:
+ read_unlock_bh(&idev->lock);
fillargs->event = RTM_GETMULTICAST;
/* multicast address */
- for (ifmca = idev->mc_list; ifmca;
- ifmca = ifmca->next, ip_idx++) {
+ for (ifmca = rtnl_dereference(idev->mc_list);
+ ifmca;
+ ifmca = rtnl_dereference(ifmca->next), ip_idx++) {
if (ip_idx < s_ip_idx)
continue;
err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
if (err < 0)
break;
}
+ read_lock_bh(&idev->lock);
break;
case ANYCAST_ADDR:
fillargs->event = RTM_GETANYCAST;
@@ -5156,8 +5274,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
.netnsid = -1,
.type = type,
};
- struct net *net = sock_net(skb->sk);
- struct net *tgt_net = net;
+ struct net *tgt_net = sock_net(skb->sk);
int idx, s_idx, s_ip_idx;
int h, s_h;
struct net_device *dev;
@@ -5296,7 +5413,7 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(in_skb->sk);
+ struct net *tgt_net = sock_net(in_skb->sk);
struct inet6_fill_args fillargs = {
.portid = NETLINK_CB(in_skb).portid,
.seq = nlh->nlmsg_seq,
@@ -5304,7 +5421,6 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
.flags = 0,
.netnsid = -1,
};
- struct net *tgt_net = net;
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
struct in6_addr *addr = NULL, *peer;
@@ -5357,8 +5473,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
errout_ifa:
in6_ifa_put(ifa);
errout:
- if (dev)
- dev_put(dev);
+ dev_put(dev);
if (fillargs.netnsid >= 0)
put_net(tgt_net);
@@ -5428,6 +5543,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+ array[DEVCONF_RA_DEFRTR_METRIC] = cnf->ra_defrtr_metric;
array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -5446,7 +5562,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
#endif
#ifdef CONFIG_IPV6_MROUTE
- array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+ array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
#endif
array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
@@ -5469,6 +5585,12 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy;
array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass;
+ array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled;
+ array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled;
+ array[DEVCONF_IOAM6_ID] = cnf->ioam6_id;
+ array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide;
+ array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier;
+ array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na;
}
static inline size_t inet6_ifla6_size(void)
@@ -5480,6 +5602,7 @@ static inline size_t inet6_ifla6_size(void)
+ nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
+ nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
+ nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
+ + nla_total_size(4) /* IFLA_INET6_RA_MTU */
+ 0;
}
@@ -5588,6 +5711,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
goto nla_put_failure;
+ if (idev->ra_mtu &&
+ nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -5617,7 +5744,8 @@ static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
return 0;
}
-static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
+static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token,
+ struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct net_device *dev = idev->dev;
@@ -5628,12 +5756,29 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
if (!token)
return -EINVAL;
- if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
+
+ if (dev->flags & IFF_LOOPBACK) {
+ NL_SET_ERR_MSG_MOD(extack, "Device is loopback");
return -EINVAL;
- if (!ipv6_accept_ra(idev))
+ }
+
+ if (dev->flags & IFF_NOARP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Device does not do neighbour discovery");
return -EINVAL;
- if (idev->cnf.rtr_solicits == 0)
+ }
+
+ if (!ipv6_accept_ra(idev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Router advertisement is disabled on device");
return -EINVAL;
+ }
+
+ if (idev->cnf.rtr_solicits == 0) {
+ NL_SET_ERR_MSG(extack,
+ "Router solicitation is disabled on device");
+ return -EINVAL;
+ }
write_lock_bh(&idev->lock);
@@ -5679,13 +5824,16 @@ update_lft:
write_unlock_bh(&idev->lock);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(dev_net(dev));
return 0;
}
static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
[IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 },
[IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) },
+ [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT,
+ .reject_message =
+ "IFLA_INET6_RA_MTU can not be set" },
};
static int check_addr_gen_mode(int mode)
@@ -5709,7 +5857,8 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
}
static int inet6_validate_link_af(const struct net_device *dev,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[IFLA_INET6_MAX + 1];
struct inet6_dev *idev = NULL;
@@ -5722,7 +5871,7 @@ static int inet6_validate_link_af(const struct net_device *dev,
}
err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla,
- inet6_af_policy, NULL);
+ inet6_af_policy, extack);
if (err)
return err;
@@ -5741,7 +5890,8 @@ static int inet6_validate_link_af(const struct net_device *dev,
return 0;
}
-static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
+static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
{
struct inet6_dev *idev = __in6_dev_get(dev);
struct nlattr *tb[IFLA_INET6_MAX + 1];
@@ -5751,10 +5901,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
return -EAFNOSUPPORT;
if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0)
- BUG();
+ return -EINVAL;
if (tb[IFLA_INET6_TOKEN]) {
- err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]));
+ err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]),
+ extack);
if (err)
return err;
}
@@ -6027,10 +6178,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
ifp->idev->dev, 0, 0,
false);
if (rt)
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
}
if (ifp->rt) {
- ip6_del_rt(net, ifp->rt);
+ ip6_del_rt(net, ifp->rt, false);
ifp->rt = NULL;
}
rt_genid_bump_ipv6(net);
@@ -6041,17 +6192,14 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
- rcu_read_lock_bh();
if (likely(ifp->idev->dead == 0))
__ipv6_ifa_notify(event, ifp);
- rcu_read_unlock_bh();
}
#ifdef CONFIG_SYSCTL
-static
-int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -6075,9 +6223,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
return ret;
}
-static
-int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct inet6_dev *idev = ctl->extra1;
int min_mtu = IPV6_MIN_MTU;
@@ -6147,9 +6294,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
return 0;
}
-static
-int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -6173,9 +6319,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
return ret;
}
-static
-int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int ret;
@@ -6216,7 +6361,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
}
static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = 0;
@@ -6278,7 +6423,7 @@ out:
}
static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int err;
@@ -6345,8 +6490,7 @@ out:
static
int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
- int write,
- void __user *buffer,
+ int write, void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -6446,10 +6590,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val)
return 0;
}
-static
-int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -6472,6 +6614,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
static int minus_one = -1;
static const int two_five_five = 255;
+static u32 ioam6_if_id_max = U16_MAX;
static const struct ctl_table addrconf_sysctl[] = {
{
@@ -6627,6 +6770,14 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ra_defrtr_metric",
+ .data = &ipv6_devconf.ra_defrtr_metric,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)SYSCTL_ONE,
+ },
+ {
.procname = "accept_ra_min_hop_limit",
.data = &ipv6_devconf.accept_ra_min_hop_limit,
.maxlen = sizeof(int),
@@ -6827,10 +6978,10 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "addr_gen_mode",
- .data = &ipv6_devconf.addr_gen_mode,
- .maxlen = sizeof(int),
- .mode = 0644,
+ .procname = "addr_gen_mode",
+ .data = &ipv6_devconf.addr_gen_mode,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = addrconf_sysctl_addr_gen_mode,
},
{
@@ -6850,6 +7001,56 @@ static const struct ctl_table addrconf_sysctl[] = {
.extra2 = (void *)&two_five_five,
},
{
+ .procname = "rpl_seg_enabled",
+ .data = &ipv6_devconf.rpl_seg_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ioam6_enabled",
+ .data = &ipv6_devconf.ioam6_enabled,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "ioam6_id",
+ .data = &ipv6_devconf.ioam6_id,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)&ioam6_if_id_max,
+ },
+ {
+ .procname = "ioam6_id_wide",
+ .data = &ipv6_devconf.ioam6_id_wide,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "ndisc_evict_nocarrier",
+ .data = &ipv6_devconf.ndisc_evict_nocarrier,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "accept_untracked_na",
+ .data = &ipv6_devconf.accept_untracked_na,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
/* sentinel */
}
};
@@ -6861,7 +7062,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
struct ctl_table *table;
char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
- table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL);
+ table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT);
if (!table)
goto out;
@@ -6949,6 +7150,14 @@ static int __net_init addrconf_init_net(struct net *net)
int err = -ENOMEM;
struct ipv6_devconf *all, *dflt;
+ spin_lock_init(&net->ipv6.addrconf_hash_lock);
+ INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);
+ net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->ipv6.inet6_addr_lst)
+ goto err_alloc_addr;
+
all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
if (!all)
goto err_alloc_all;
@@ -6957,10 +7166,26 @@ static int __net_init addrconf_init_net(struct net *net)
if (!dflt)
goto err_alloc_dflt;
- if (IS_ENABLED(CONFIG_SYSCTL) &&
- sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) {
- memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf));
- memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt));
+ if (!net_eq(net, &init_net)) {
+ switch (net_inherit_devconf()) {
+ case 1: /* copy from init_net */
+ memcpy(all, init_net.ipv6.devconf_all,
+ sizeof(ipv6_devconf));
+ memcpy(dflt, init_net.ipv6.devconf_dflt,
+ sizeof(ipv6_devconf_dflt));
+ break;
+ case 3: /* copy from the current netns */
+ memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all,
+ sizeof(ipv6_devconf));
+ memcpy(dflt,
+ current->nsproxy->net_ns->ipv6.devconf_dflt,
+ sizeof(ipv6_devconf_dflt));
+ break;
+ case 0:
+ case 2:
+ /* use compiled values */
+ break;
+ }
}
/* these will be inherited by all namespaces */
@@ -6989,15 +7214,21 @@ err_reg_dflt:
__addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
kfree(dflt);
+ net->ipv6.devconf_dflt = NULL;
#endif
err_alloc_dflt:
kfree(all);
+ net->ipv6.devconf_all = NULL;
err_alloc_all:
+ kfree(net->ipv6.inet6_addr_lst);
+err_alloc_addr:
return err;
}
static void __net_exit addrconf_exit_net(struct net *net)
{
+ int i;
+
#ifdef CONFIG_SYSCTL
__addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
NETCONFA_IFINDEX_DEFAULT);
@@ -7005,7 +7236,19 @@ static void __net_exit addrconf_exit_net(struct net *net)
NETCONFA_IFINDEX_ALL);
#endif
kfree(net->ipv6.devconf_dflt);
+ net->ipv6.devconf_dflt = NULL;
kfree(net->ipv6.devconf_all);
+ net->ipv6.devconf_all = NULL;
+
+ cancel_delayed_work_sync(&net->ipv6.addr_chk_work);
+ /*
+ * Check hash table, then free it.
+ */
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON_ONCE(!hlist_empty(&net->ipv6.inet6_addr_lst[i]));
+
+ kfree(net->ipv6.inet6_addr_lst);
+ net->ipv6.inet6_addr_lst = NULL;
}
static struct pernet_operations addrconf_ops = {
@@ -7028,7 +7271,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
int __init addrconf_init(void)
{
struct inet6_dev *idev;
- int i, err;
+ int err;
err = ipv6_addr_label_init();
if (err < 0) {
@@ -7047,26 +7290,8 @@ int __init addrconf_init(void)
goto out_nowq;
}
- /* The addrconf netdev notifier requires that loopback_dev
- * has it's ipv6 private information allocated and setup
- * before it can bring up and give link-local addresses
- * to other devices which are up.
- *
- * Unfortunately, loopback_dev is not necessarily the first
- * entry in the global dev_base list of net devices. In fact,
- * it is likely to be the very last entry on that list.
- * So this causes the notifier registry below to try and
- * give link-local addresses to all devices besides loopback_dev
- * first, then loopback_dev, which cases all the non-loopback_dev
- * devices to fail to get a link-local address.
- *
- * So, as a temporary fix, allocate the ipv6 structure for
- * loopback_dev first by hand.
- * Longer term, all of the dependencies ipv6 has upon the loopback
- * device and it being up should be removed.
- */
rtnl_lock();
- idev = ipv6_add_dev(init_net.loopback_dev);
+ idev = ipv6_add_dev(blackhole_netdev);
rtnl_unlock();
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
@@ -7075,12 +7300,9 @@ int __init addrconf_init(void)
ip6_route_init_special_entries();
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
- INIT_HLIST_HEAD(&inet6_addr_lst[i]);
-
register_netdevice_notifier(&ipv6_dev_notf);
- addrconf_verify();
+ addrconf_verify(&init_net);
rtnl_af_register(&inet6_ops);
@@ -7138,7 +7360,6 @@ out:
void addrconf_cleanup(void)
{
struct net_device *dev;
- int i;
unregister_netdevice_notifier(&ipv6_dev_notf);
unregister_pernet_subsys(&addrconf_ops);
@@ -7152,18 +7373,10 @@ void addrconf_cleanup(void)
for_each_netdev(&init_net, dev) {
if (__in6_dev_get(dev) == NULL)
continue;
- addrconf_ifdown(dev, 1);
+ addrconf_ifdown(dev, true);
}
- addrconf_ifdown(init_net.loopback_dev, 2);
+ addrconf_ifdown(init_net.loopback_dev, true);
- /*
- * Check hash table.
- */
- spin_lock_bh(&addrconf_hash_lock);
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
- WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
- spin_unlock_bh(&addrconf_hash_lock);
- cancel_delayed_work(&addr_chk_work);
rtnl_unlock();
destroy_workqueue(addrconf_wq);
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index ea00ce3d4117..507a8353a6bd 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -185,11 +185,25 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
return -EAFNOSUPPORT;
}
-static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
+ bool skip_notify)
{
return -EAFNOSUPPORT;
}
+static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ kfree_skb(skb);
+ return -EAFNOSUPPORT;
+}
+
+static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
.ipv6_route_input = eafnosupport_ipv6_route_input,
@@ -200,6 +214,8 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
.fib6_nh_init = eafnosupport_fib6_nh_init,
.ip6_del_rt = eafnosupport_ip6_del_rt,
+ .ipv6_fragment = eafnosupport_ipv6_fragment,
+ .ipv6_dev_find = eafnosupport_ipv6_dev_find,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
@@ -241,13 +257,13 @@ void in6_dev_finish_destroy(struct inet6_dev *idev)
struct net_device *dev = idev->dev;
WARN_ON(!list_empty(&idev->addr_list));
- WARN_ON(idev->mc_list);
+ WARN_ON(rcu_access_pointer(idev->mc_list));
WARN_ON(timer_pending(&idev->rs_timer));
#ifdef NET_REFCNT_DEBUG
pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
#endif
- dev_put(dev);
+ netdev_put(dev, &idev->dev_tracker);
if (!idev->dead) {
pr_warn("Freeing alive inet6 device %p\n", idev);
return;
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 642fc6ac13d2..17ac45aa7194 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -306,7 +306,9 @@ static int ip6addrlbl_del(struct net *net,
/* add default label */
static int __net_init ip6addrlbl_net_init(struct net *net)
{
- int err = 0;
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+ int err;
int i;
ADDRLABEL(KERN_DEBUG "%s\n", __func__);
@@ -315,14 +317,20 @@ static int __net_init ip6addrlbl_net_init(struct net *net)
INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);
for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
- int ret = ip6addrlbl_add(net,
- ip6addrlbl_init_table[i].prefix,
- ip6addrlbl_init_table[i].prefixlen,
- 0,
- ip6addrlbl_init_table[i].label, 0);
- /* XXX: should we free all rules when we catch an error? */
- if (ret && (!err || err != -ENOMEM))
- err = ret;
+ err = ip6addrlbl_add(net,
+ ip6addrlbl_init_table[i].prefix,
+ ip6addrlbl_init_table[i].prefixlen,
+ 0,
+ ip6addrlbl_init_table[i].label, 0);
+ if (err)
+ goto err_ip6addrlbl_add;
+ }
+ return 0;
+
+err_ip6addrlbl_add:
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ hlist_del_rcu(&p->list);
+ kfree_rcu(p, rcu);
}
return err;
}
@@ -429,6 +437,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
{
struct ifaddrlblmsg *ifal = nlmsg_data(nlh);
ifal->ifal_family = AF_INET6;
+ ifal->__ifal_reserved = 0;
ifal->ifal_prefixlen = prefixlen;
ifal->ifal_flags = 0;
ifal->ifal_index = ifindex;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d727c3b41495..024191004982 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -59,6 +59,11 @@
#endif
#include <net/calipso.h>
#include <net/seg6.h>
+#include <net/rpl.h>
+#include <net/compat.h>
+#include <net/xfrm.h>
+#include <net/ioam6.h>
+#include <net/rawv6.h>
#include <linux/uaccess.h>
#include <linux/mroute6.h>
@@ -104,6 +109,12 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
+void inet6_sock_destruct(struct sock *sk)
+{
+ inet6_cleanup_sock(sk);
+ inet_sock_destruct(sk);
+}
+
static int inet6_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
@@ -196,7 +207,7 @@ lookup_protocol:
inet->hdrincl = 1;
}
- sk->sk_destruct = inet_sock_destruct;
+ sk->sk_destruct = inet6_sock_destruct;
sk->sk_family = PF_INET6;
sk->sk_protocol = protocol;
@@ -219,10 +230,10 @@ lookup_protocol:
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_index = 0;
- inet->mc_list = NULL;
+ RCU_INIT_POINTER(inet->mc_list, NULL);
inet->rcv_tos = 0;
- if (net->ipv4.sysctl_ip_no_pmtu_disc)
+ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
@@ -272,7 +283,7 @@ out_rcu_unlock:
}
static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
- bool force_bind_address_no_port, bool with_lock)
+ u32 flags)
{
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -292,11 +303,12 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
return -EINVAL;
snum = ntohs(addr->sin6_port);
- if (snum && inet_port_requires_bind_service(net, snum) &&
+ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+ snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
- if (with_lock)
+ if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@@ -313,7 +325,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Binding to v4-mapped address on a v6-only socket
* makes no sense
*/
- if (sk->sk_ipv6only) {
+ if (ipv6_only_sock(sk)) {
err = -EINVAL;
goto out;
}
@@ -332,11 +344,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr);
rcu_read_unlock();
- if (!inet_can_nonlocal_bind(net, inet) &&
- v4addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST) {
+ if (!inet_addr_valid_or_nonlocal(net, inet, v4addr,
+ chk_addr_ret)) {
err = -EADDRNOTAVAIL;
goto out;
}
@@ -399,18 +408,22 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if (snum || !(inet->bind_address_no_port ||
- force_bind_address_no_port)) {
+ (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
if (sk->sk_prot->get_port(sk, snum)) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
err = -EADDRINUSE;
goto out;
}
- err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
- if (err) {
- sk->sk_ipv6only = saved_ipv6only;
- inet_reset_saddr(sk);
- goto out;
+ if (!(flags & BIND_FROM_BPF)) {
+ err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
+ if (err) {
+ sk->sk_ipv6only = saved_ipv6only;
+ inet_reset_saddr(sk);
+ if (sk->sk_prot->put_port)
+ sk->sk_prot->put_port(sk);
+ goto out;
+ }
}
}
@@ -422,7 +435,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
inet->inet_dport = 0;
inet->inet_daddr = 0;
out:
- if (with_lock)
+ if (flags & BIND_WITH_LOCK)
release_sock(sk);
return err;
out_unlock:
@@ -434,11 +447,15 @@ out_unlock:
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
+ u32 flags = BIND_WITH_LOCK;
+ const struct proto *prot;
int err = 0;
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
/* If the socket has its own bind function then use it. */
- if (sk->sk_prot->bind)
- return sk->sk_prot->bind(sk, uaddr, addr_len);
+ if (prot->bind)
+ return prot->bind(sk, uaddr, addr_len);
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -446,11 +463,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+ CGROUP_INET6_BIND, &flags);
if (err)
return err;
- return __inet6_bind(sk, uaddr, addr_len, false, true);
+ return __inet6_bind(sk, uaddr, addr_len, flags);
}
EXPORT_SYMBOL(inet6_bind);
@@ -498,12 +516,17 @@ void inet6_destroy_sock(struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+void inet6_cleanup_sock(struct sock *sk)
+{
+ inet6_destroy_sock(sk);
+}
+EXPORT_SYMBOL_GPL(inet6_cleanup_sock);
+
/*
* This does both peername and sockname.
*/
-
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
struct sock *sk = sock->sk;
@@ -513,85 +536,153 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
sin->sin6_scope_id = 0;
+ lock_sock(sk);
if (peer) {
- if (!inet->inet_dport)
- return -ENOTCONN;
- if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
- peer == 1)
+ if (!inet->inet_dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1)) {
+ release_sock(sk);
return -ENOTCONN;
+ }
sin->sin6_port = inet->inet_dport;
sin->sin6_addr = sk->sk_v6_daddr;
if (np->sndflow)
sin->sin6_flowinfo = np->flow_label;
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+ CGROUP_INET6_GETPEERNAME);
} else {
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
sin->sin6_addr = np->saddr;
else
sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
sin->sin6_port = inet->inet_sport;
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+ CGROUP_INET6_GETSOCKNAME);
}
sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
sk->sk_bound_dev_if);
+ release_sock(sk);
return sizeof(*sin);
}
EXPORT_SYMBOL(inet6_getname);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = (void __user *)arg;
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
+ const struct proto *prot;
switch (cmd) {
case SIOCADDRT:
- case SIOCDELRT:
-
- return ipv6_route_ioctl(net, cmd, (void __user *)arg);
+ case SIOCDELRT: {
+ struct in6_rtmsg rtmsg;
+ if (copy_from_user(&rtmsg, argp, sizeof(rtmsg)))
+ return -EFAULT;
+ return ipv6_route_ioctl(net, cmd, &rtmsg);
+ }
case SIOCSIFADDR:
- return addrconf_add_ifaddr(net, (void __user *) arg);
+ return addrconf_add_ifaddr(net, argp);
case SIOCDIFADDR:
- return addrconf_del_ifaddr(net, (void __user *) arg);
+ return addrconf_del_ifaddr(net, argp);
case SIOCSIFDSTADDR:
- return addrconf_set_dstaddr(net, (void __user *) arg);
+ return addrconf_set_dstaddr(net, argp);
default:
- if (!sk->sk_prot->ioctl)
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ if (!prot->ioctl)
return -ENOIOCTLCMD;
- return sk->sk_prot->ioctl(sk, cmd, arg);
+ return prot->ioctl(sk, cmd, arg);
}
/*NOTREACHED*/
return 0;
}
EXPORT_SYMBOL(inet6_ioctl);
+#ifdef CONFIG_COMPAT
+struct compat_in6_rtmsg {
+ struct in6_addr rtmsg_dst;
+ struct in6_addr rtmsg_src;
+ struct in6_addr rtmsg_gateway;
+ u32 rtmsg_type;
+ u16 rtmsg_dst_len;
+ u16 rtmsg_src_len;
+ u32 rtmsg_metric;
+ u32 rtmsg_info;
+ u32 rtmsg_flags;
+ s32 rtmsg_ifindex;
+};
+
+static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_in6_rtmsg __user *ur)
+{
+ struct in6_rtmsg rt;
+
+ if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst,
+ 3 * sizeof(struct in6_addr)) ||
+ get_user(rt.rtmsg_type, &ur->rtmsg_type) ||
+ get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) ||
+ get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) ||
+ get_user(rt.rtmsg_metric, &ur->rtmsg_metric) ||
+ get_user(rt.rtmsg_info, &ur->rtmsg_info) ||
+ get_user(rt.rtmsg_flags, &ur->rtmsg_flags) ||
+ get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex))
+ return -EFAULT;
+
+
+ return ipv6_route_ioctl(sock_net(sk), cmd, &rt);
+}
+
+int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return inet6_compat_routing_ioctl(sk, cmd, argp);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+EXPORT_SYMBOL_GPL(inet6_compat_ioctl);
+#endif /* CONFIG_COMPAT */
+
INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
size_t));
int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
+ const struct proto *prot;
if (unlikely(inet_send_prepare(sk)))
return -EAGAIN;
- return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
sk, msg, size);
}
INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *,
- size_t, int, int, int *));
+ size_t, int, int *));
int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
struct sock *sk = sock->sk;
+ const struct proto *prot;
int addr_len = 0;
int err;
if (likely(!(flags & MSG_ERRQUEUE)))
sock_rps_record_flow(sk);
- err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
- sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
+ sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -623,10 +714,10 @@ const struct proto_ops inet6_stream_ops = {
.sendpage_locked = tcp_sendpage_locked,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
+ .read_skb = tcp_read_skb,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
@@ -649,12 +740,12 @@ const struct proto_ops inet6_dgram_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet6_sendmsg, /* retpoline's sake */
.recvmsg = inet6_recvmsg, /* retpoline's sake */
+ .read_skb = udp_read_skb,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
};
@@ -758,7 +849,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
rcu_read_lock();
final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
@@ -893,8 +984,12 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
+ net->ipv6.sysctl.fib_notify_on_flag_change = 0;
atomic_set(&net->ipv6.fib6_sernum, 1);
+ net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID;
+ net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE;
+
err = ipv6_init_mibs(net);
if (err)
return err;
@@ -955,17 +1050,27 @@ static const struct ipv6_stub ipv6_stub_impl = {
.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
.fib6_nh_init = fib6_nh_init,
.fib6_nh_release = fib6_nh_release,
+ .fib6_nh_release_dsts = fib6_nh_release_dsts,
.fib6_update_sernum = fib6_update_sernum_stub,
.fib6_rt_update = fib6_rt_update,
.ip6_del_rt = ip6_del_rt,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
+#if IS_ENABLED(CONFIG_XFRM)
+ .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu,
+ .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv,
+ .xfrm6_rcv_encap = xfrm6_rcv_encap,
+#endif
.nd_tbl = &nd_tbl,
+ .ipv6_fragment = ip6_fragment,
+ .ipv6_dev_find = ipv6_dev_find,
};
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
.inet6_bind = __inet6_bind,
.udp6_lib_lookup = __udp6_lib_lookup,
+ .ipv6_setsockopt = do_ipv6_setsockopt,
+ .ipv6_getsockopt = do_ipv6_getsockopt,
};
static int __init inet6_init(void)
@@ -979,6 +1084,8 @@ static int __init inet6_init(void)
for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
+ raw_hashinfo_init(&raw_v6_hashinfo);
+
if (disable_ipv6_mod) {
pr_info("Loaded, but administratively disabled, reboot required to enable\n");
goto out;
@@ -1114,6 +1221,14 @@ static int __init inet6_init(void)
if (err)
goto seg6_fail;
+ err = rpl_init();
+ if (err)
+ goto rpl_fail;
+
+ err = ioam6_init();
+ if (err)
+ goto ioam6_fail;
+
err = igmp6_late_init();
if (err)
goto igmp6_late_err;
@@ -1136,6 +1251,10 @@ sysctl_fail:
igmp6_late_cleanup();
#endif
igmp6_late_err:
+ ioam6_exit();
+ioam6_fail:
+ rpl_exit();
+rpl_fail:
seg6_exit();
seg6_fail:
calipso_exit();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 95835e8d99aa..5228d2716289 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -36,7 +36,7 @@ struct tmp_ext {
struct in6_addr saddr;
#endif
struct in6_addr daddr;
- char hdrs[0];
+ char hdrs[];
};
struct ah_skb_cb {
@@ -175,7 +175,6 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
* See 11.3.2 of RFC 3775 for details.
*/
if (opt[off] == IPV6_TLV_HAO) {
- struct in6_addr final_addr;
struct ipv6_destopt_hao *hao;
hao = (struct ipv6_destopt_hao *)&opt[off];
@@ -184,9 +183,7 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
hao->length);
goto bad;
}
- final_addr = hao->addr;
- hao->addr = iph->saddr;
- iph->saddr = final_addr;
+ swap(hao->addr, iph->saddr);
}
break;
}
@@ -259,7 +256,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
case NEXTHDR_DEST:
if (dir == XFRM_POLICY_OUT)
ipv6_rearrange_destopt(iph, exthdr.opth);
- /* fall through */
+ fallthrough;
case NEXTHDR_HOP:
if (!zero_out_mutable_opts(exthdr.opth)) {
net_dbg_ratelimited("overrun %sopts\n",
@@ -316,7 +313,7 @@ static void ah6_output_done(struct crypto_async_request *base, int err)
}
kfree(AH_SKB_CB(skb)->tmp);
- xfrm_output_resume(skb, err);
+ xfrm_output_resume(skb->sk, skb, err);
}
static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
@@ -588,7 +585,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
memset(ah->auth_data, 0, ahp->icv_trunc_len);
- if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN))
+ err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN);
+ if (err)
goto out_free;
ip6h->priority = 0;
@@ -668,30 +666,38 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
-static int ah6_init_state(struct xfrm_state *x)
+static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
struct crypto_ahash *ahash;
- if (!x->aalg)
+ if (!x->aalg) {
+ NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
goto error;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
goto error;
+ }
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
if (!ahp)
return -ENOMEM;
ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
- if (IS_ERR(ahash))
+ if (IS_ERR(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
ahp->ahash = ahash;
if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
- (x->aalg->alg_key_len + 7) / 8))
+ (x->aalg->alg_key_len + 7) / 8)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
/*
* Lookup the algorithm description maintained by xfrm_algo,
@@ -704,9 +710,7 @@ static int ah6_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_ahash_digestsize(ahash)) {
- pr_info("AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_ahash_digestsize(ahash),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
}
@@ -723,6 +727,7 @@ static int ah6_init_state(struct xfrm_state *x)
x->props.header_len += sizeof(struct ipv6hdr);
break;
default:
+ NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET");
goto error;
}
x->data = ahp;
@@ -754,7 +759,6 @@ static int ah6_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type ah6_type = {
- .description = "AH6",
.owner = THIS_MODULE,
.proto = IPPROTO_AH,
.flags = XFRM_TYPE_REPLAY_PROT,
@@ -762,11 +766,11 @@ static const struct xfrm_type ah6_type = {
.destructor = ah6_destroy,
.input = ah6_input,
.output = ah6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
static struct xfrm6_protocol ah6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
.cb_handler = ah6_rcv_cb,
.err_handler = ah6_err,
.priority = 0,
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index fed91ab7ec46..dacdea7fcb62 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -183,7 +183,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
return 0;
}
-void ipv6_sock_ac_close(struct sock *sk)
+void __ipv6_sock_ac_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net_device *dev = NULL;
@@ -191,10 +191,7 @@ void ipv6_sock_ac_close(struct sock *sk)
struct net *net = sock_net(sk);
int prev_index;
- if (!np->ipv6_ac_list)
- return;
-
- rtnl_lock();
+ ASSERT_RTNL();
pac = np->ipv6_ac_list;
np->ipv6_ac_list = NULL;
@@ -211,6 +208,16 @@ void ipv6_sock_ac_close(struct sock *sk)
sock_kfree_s(sk, pac, sizeof(*pac));
pac = next;
}
+}
+
+void ipv6_sock_ac_close(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (!np->ipv6_ac_list)
+ return;
+ rtnl_lock();
+ __ipv6_sock_ac_close(sk);
rtnl_unlock();
}
@@ -364,7 +371,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
ipv6_del_acaddr_hash(aca);
addrconf_leave_solict(idev, &aca->aca_addr);
- ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
aca_put(aca);
return 0;
@@ -393,7 +400,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
addrconf_leave_solict(idev, &aca->aca_addr);
- ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
aca_put(aca);
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 221c81f85cbf..1578ed9e97d8 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -83,6 +83,9 @@ struct calipso_map_cache_entry {
static struct calipso_map_cache_bkt *calipso_cache;
+static void calipso_cache_invalidate(void);
+static void calipso_doi_putdef(struct calipso_doi *doi_def);
+
/* Label Mapping Cache Functions
*/
@@ -423,7 +426,7 @@ static void calipso_doi_free_rcu(struct rcu_head *entry)
/**
* calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
@@ -444,15 +447,10 @@ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
ret_val = -ENOENT;
goto doi_remove_return;
}
- if (!refcount_dec_and_test(&doi_def->refcount)) {
- spin_unlock(&calipso_doi_list_lock);
- ret_val = -EBUSY;
- goto doi_remove_return;
- }
list_del_rcu(&doi_def->list);
spin_unlock(&calipso_doi_list_lock);
- call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+ calipso_doi_putdef(doi_def);
ret_val = 0;
doi_remove_return:
@@ -508,10 +506,8 @@ static void calipso_doi_putdef(struct calipso_doi *doi_def)
if (!refcount_dec_and_test(&doi_def->refcount))
return;
- spin_lock(&calipso_doi_list_lock);
- list_del_rcu(&doi_def->list);
- spin_unlock(&calipso_doi_list_lock);
+ calipso_cache_invalidate();
call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
}
@@ -761,7 +757,7 @@ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
calipso[1] = len - 2;
*(__be32 *)(calipso + 2) = htonl(doi_def->doi);
calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
- calipso[7] = secattr->attr.mls.lvl,
+ calipso[7] = secattr->attr.mls.lvl;
crc = ~crc_ccitt(0xffff, calipso, len);
calipso[8] = crc & 0xff;
calipso[9] = (crc >> 8) & 0xff;
@@ -1047,7 +1043,8 @@ static int calipso_opt_getattr(const unsigned char *calipso,
goto getattr_return;
}
- secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ if (secattr->attr.mls.cat)
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
}
secattr->type = NETLBL_NLTYPE_CALIPSO;
@@ -1225,7 +1222,7 @@ static int calipso_req_setattr(struct request_sock *req,
/**
* calipso_req_delattr - Delete the CALIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
*
* Description:
* Removes the CALIPSO option from a request socket, if present.
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 390bedde21a5..5ecb56522f9d 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -19,6 +19,7 @@
#include <linux/route.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/icmp.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
@@ -59,7 +60,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
fl6->flowi6_oif = np->mcast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
@@ -144,7 +145,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
int err;
if (usin->sin6_family == AF_INET) {
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -EAFNOSUPPORT;
err = __ip4_datagram_connect(sk, uaddr, addr_len);
goto ipv4_connected;
@@ -177,7 +178,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
if (addr_type & IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
- if (__ipv6_only_sock(sk)) {
+ if (ipv6_only_sock(sk)) {
err = -ENETUNREACH;
goto out;
}
@@ -217,11 +218,11 @@ ipv4_connected:
err = -EINVAL;
goto out;
}
- sk->sk_bound_dev_if = usin->sin6_scope_id;
+ WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id);
}
if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST))
- sk->sk_bound_dev_if = np->mcast_oif;
+ WRITE_ONCE(sk->sk_bound_dev_if, np->mcast_oif);
/* Connect to link-local address requires an interface */
if (!sk->sk_bound_dev_if) {
@@ -255,7 +256,7 @@ ipv4_connected:
goto out;
}
- reuseport_has_conns(sk, true);
+ reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
out:
@@ -284,6 +285,17 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
}
EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only);
+static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out)
+{
+ switch (icmp6_hdr(skb)->icmp6_type) {
+ case ICMPV6_TIME_EXCEED:
+ case ICMPV6_DEST_UNREACH:
+ ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr),
+ icmp6_hdr(skb)->icmp6_datagram_len * 8);
+ }
+}
+
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
@@ -313,6 +325,10 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
serr->port = port;
__skb_pull(skb, payload - skb->data);
+
+ if (inet6_sk(sk)->recverr_rfc4884)
+ ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb))
@@ -782,7 +798,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
if (src_idx) {
if (fl6->flowi6_oif &&
src_idx != fl6->flowi6_oif &&
- (sk->sk_bound_dev_if != fl6->flowi6_oif ||
+ (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif ||
!sk_dev_equal_l3scope(sk, src_idx)))
return -EINVAL;
fl6->flowi6_oif = src_idx;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index a3b403ba8f8f..14ed868680c6 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -26,11 +26,16 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <net/ip6_checksum.h>
#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
+#include <net/udp.h>
#include <linux/icmpv6.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+#include <net/inet6_hashtables.h>
#include <linux/highmem.h>
@@ -39,6 +44,11 @@ struct esp_skb_cb {
void *tmp;
};
+struct esp_output_extra {
+ __be32 seqhi;
+ u32 esphoff;
+};
+
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
/*
@@ -72,9 +82,9 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
return kmalloc(len, GFP_ATOMIC);
}
-static inline __be32 *esp_tmp_seqhi(void *tmp)
+static inline void *esp_tmp_extra(void *tmp)
{
- return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+ return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
}
static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
@@ -105,15 +115,15 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
{
struct crypto_aead *aead = x->data;
- int seqhilen = 0;
+ int extralen = 0;
u8 *iv;
struct aead_request *req;
struct scatterlist *sg;
if (x->props.flags & XFRM_STATE_ESN)
- seqhilen += sizeof(__be32);
+ extralen += sizeof(struct esp_output_extra);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
+ iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
/* Unref skb_frag_pages in the src scatterlist if necessary.
@@ -124,6 +134,150 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
put_page(sg_page(sg));
}
+#ifdef CONFIG_INET6_ESPINTCP
+struct esp_tcp_sk {
+ struct sock *sk;
+ struct rcu_head rcu;
+};
+
+static void esp_free_tcp_sk(struct rcu_head *head)
+{
+ struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu);
+
+ sock_put(esk->sk);
+ kfree(esk);
+}
+
+static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
+ struct esp_tcp_sk *esk;
+ __be16 sport, dport;
+ struct sock *nsk;
+ struct sock *sk;
+
+ sk = rcu_dereference(x->encap_sk);
+ if (sk && sk->sk_state == TCP_ESTABLISHED)
+ return sk;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ nsk = rcu_dereference_protected(x->encap_sk,
+ lockdep_is_held(&x->lock));
+ if (sk && sk == nsk) {
+ esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
+ if (!esk) {
+ spin_unlock_bh(&x->lock);
+ return ERR_PTR(-ENOMEM);
+ }
+ RCU_INIT_POINTER(x->encap_sk, NULL);
+ esk->sk = sk;
+ call_rcu(&esk->rcu, esp_free_tcp_sk);
+ }
+ spin_unlock_bh(&x->lock);
+
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6,
+ dport, &x->props.saddr.in6, ntohs(sport), 0, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk);
+ return ERR_PTR(-EINVAL);
+ }
+
+ spin_lock_bh(&x->lock);
+ nsk = rcu_dereference_protected(x->encap_sk,
+ lockdep_is_held(&x->lock));
+ if (encap->encap_sport != sport ||
+ encap->encap_dport != dport) {
+ sock_put(sk);
+ sk = nsk ?: ERR_PTR(-EREMCHG);
+ } else if (sk == nsk) {
+ sock_put(sk);
+ } else {
+ rcu_assign_pointer(x->encap_sk, sk);
+ }
+ spin_unlock_bh(&x->lock);
+
+ return sk;
+}
+
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ sk = esp6_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk);
+ if (err)
+ goto out;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+
+ return esp_output_tcp_finish(x, skb);
+}
+
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+
+ return -EOPNOTSUPP;
+}
+#endif
+
+static void esp_output_encap_csum(struct sk_buff *skb)
+{
+ /* UDP encap with IPv6 requires a valid checksum */
+ if (*skb_mac_header(skb) == IPPROTO_UDP) {
+ struct udphdr *uh = udp_hdr(skb);
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int len = ntohs(uh->len);
+ unsigned int offset = skb_transport_offset(skb);
+ __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+}
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -143,6 +297,8 @@ static void esp_output_done(struct crypto_async_request *base, int err)
esp_ssg_unref(x, tmp);
kfree(tmp);
+ esp_output_encap_csum(skb);
+
if (xo && (xo->flags & XFRM_DEV_RESUME)) {
if (err) {
XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
@@ -154,7 +310,11 @@ static void esp_output_done(struct crypto_async_request *base, int err)
secpath_reset(skb);
xfrm_dev_resume(skb);
} else {
- xfrm_output_resume(skb, err);
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb->sk, skb, err);
}
}
@@ -163,7 +323,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
{
struct ip_esp_hdr *esph = (void *)(skb->data + offset);
void *tmp = ESP_SKB_CB(skb)->tmp;
- __be32 *seqhi = esp_tmp_seqhi(tmp);
+ __be32 *seqhi = esp_tmp_extra(tmp);
esph->seq_no = esph->spi;
esph->spi = *seqhi;
@@ -171,27 +331,36 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
static void esp_output_restore_header(struct sk_buff *skb)
{
- esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+ esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+ sizeof(__be32));
}
static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
struct xfrm_state *x,
struct ip_esp_hdr *esph,
- __be32 *seqhi)
+ struct esp_output_extra *extra)
{
/* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
+ * accommodate the high bits. We will move it back after
* encryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
+ __u32 seqhi;
struct xfrm_offload *xo = xfrm_offload(skb);
- esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
- *seqhi = esph->spi;
if (xo)
- esph->seq_no = htonl(xo->seq.hi);
+ seqhi = xo->seq.hi;
else
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(seqhi);
}
esph->spi = x->id.spi;
@@ -207,31 +376,125 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
+{
+ struct udphdr *uh;
+ __be32 *udpdata32;
+ unsigned int len;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > U16_MAX)
+ return ERR_PTR(-EMSGSIZE);
+
+ uh = (struct udphdr *)esp->esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ *skb_mac_header(skb) = IPPROTO_UDP;
+
+ if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
+ udpdata32 = (__be32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ return (struct ip_esp_hdr *)(udpdata32 + 2);
+ }
+
+ return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET6_ESPINTCP
+static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp6_find_tcp_sk(x);
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk);
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport);
+ break;
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp6_output_tcp_encap(x, skb, esp);
+ break;
}
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
+
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
+ esp->esph = esph;
+
+ return 0;
}
int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
- u8 *vaddr;
int nfrags;
+ int esph_offset;
struct page *page;
struct sk_buff *trailer;
int tailen = esp->tailen;
+ if (x->encap) {
+ int err = esp6_output_encap(x, skb, esp);
+
+ if (err < 0)
+ return err;
+ }
+
+ if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+ ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ goto cow;
+
if (!skb_cloned(skb)) {
if (tailen <= skb_tailroom(skb)) {
nfrags = 1;
@@ -259,14 +522,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
page = pfrag->page;
get_page(page);
- vaddr = kmap_atomic(page);
-
- tail = vaddr + pfrag->offset;
+ tail = page_address(page) + pfrag->offset;
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
- kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
@@ -290,10 +549,13 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
}
cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
nfrags = skb_cow_data(skb, tailen, &trailer);
if (nfrags < 0)
goto out;
tail = skb_tail_pointer(trailer);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
skip_cow:
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
@@ -311,20 +573,20 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
void *tmp;
int ivlen;
int assoclen;
- int seqhilen;
- __be32 *seqhi;
+ int extralen;
struct page *page;
struct ip_esp_hdr *esph;
struct aead_request *req;
struct crypto_aead *aead;
struct scatterlist *sg, *dsg;
+ struct esp_output_extra *extra;
int err = -ENOMEM;
assoclen = sizeof(struct ip_esp_hdr);
- seqhilen = 0;
+ extralen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- seqhilen += sizeof(__be32);
+ extralen += sizeof(*extra);
assoclen += sizeof(__be32);
}
@@ -332,12 +594,12 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
alen = crypto_aead_authsize(aead);
ivlen = crypto_aead_ivsize(aead);
- tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen);
+ tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
if (!tmp)
goto error;
- seqhi = esp_tmp_seqhi(tmp);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
@@ -346,7 +608,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
else
dsg = &sg[esp->nfrags];
- esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi);
+ esph = esp_output_set_esn(skb, x, esp->esph, extra);
+ esp->esph = esph;
sg_init_table(sg, esp->nfrags);
err = skb_to_sgvec(skb, sg,
@@ -410,11 +673,15 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
case 0:
if ((x->props.flags & XFRM_STATE_ESN))
esp_output_restore_header(skb);
+ esp_output_encap_csum(skb);
}
if (sg != dsg)
esp_ssg_unref(x, tmp);
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
+
error_free:
kfree(tmp);
error:
@@ -454,11 +721,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
esp.plen = esp.clen - skb->len - esp.tfclen;
esp.tailen = esp.tfclen + esp.plen + alen;
+ esp.esph = ip_esp_hdr(skb);
+
esp.nfrags = esp6_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
- esph = ip_esp_hdr(skb);
+ esph = esp.esph;
esph->spi = x->id.spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
@@ -473,7 +742,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
static inline int esp_remove_trailer(struct sk_buff *skb)
{
struct xfrm_state *x = xfrm_input_state(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
int alen, hlen, elen;
int padlen, trimlen;
@@ -485,11 +753,6 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
elen = skb->len - hlen;
- if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
- ret = xo->proto;
- goto out;
- }
-
ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2);
BUG_ON(ret);
@@ -523,7 +786,7 @@ int esp6_input_done2(struct sk_buff *skb, int err)
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int hdr_len = skb_network_header_len(skb);
- if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+ if (!xo || !(xo->flags & CRYPTO_DONE))
kfree(ESP_SKB_CB(skb)->tmp);
if (unlikely(err))
@@ -533,6 +796,72 @@ int esp6_input_done2(struct sk_buff *skb, int err)
if (unlikely(err < 0))
goto out;
+ if (x->encap) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int offset = skb_network_offset(skb) + sizeof(*ip6h);
+ struct xfrm_encap_tmpl *encap = x->encap;
+ u8 nexthdr = ip6h->nexthdr;
+ __be16 frag_off, source;
+ struct udphdr *uh;
+ struct tcphdr *th;
+
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+ if (offset == -1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ uh = (void *)(skb->data + offset);
+ th = (void *)(skb->data + offset);
+ hdr_len += offset;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertize the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) ||
+ source != encap->encap_sport) {
+ xfrm_address_t ipaddr;
+
+ memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6));
+ km_new_mapping(x, &ipaddr, source);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
skb_pull_rcsum(skb, hlen);
@@ -568,7 +897,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
struct xfrm_state *x = xfrm_input_state(skb);
/* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
+ * accommodate the high bits. We will move it back after
* decryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -648,7 +977,7 @@ skip_cow:
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- seqhi = esp_tmp_seqhi(tmp);
+ seqhi = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
@@ -722,16 +1051,17 @@ static void esp6_destroy(struct xfrm_state *x)
crypto_free_aead(aead);
}
-static int esp_init_aead(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
- x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
- goto error;
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ return -ENAMETOOLONG;
+ }
aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
@@ -749,11 +1079,15 @@ static int esp_init_aead(struct xfrm_state *x)
if (err)
goto error;
+ return 0;
+
error:
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
return err;
}
-static int esp_init_authenc(struct xfrm_state *x)
+static int esp_init_authenc(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
@@ -764,10 +1098,6 @@ static int esp_init_authenc(struct xfrm_state *x)
unsigned int keylen;
int err;
- err = -EINVAL;
- if (!x->ealg)
- goto error;
-
err = -ENAMETOOLONG;
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -776,22 +1106,28 @@ static int esp_init_authenc(struct xfrm_state *x)
x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
x->ealg->alg_name,
- x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
"%s%sauthenc(%s,%s)%s",
x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
x->ealg->alg_name,
- x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
}
aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
- if (IS_ERR(aead))
+ if (IS_ERR(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
x->data = aead;
@@ -821,17 +1157,16 @@ static int esp_init_authenc(struct xfrm_state *x)
err = -EINVAL;
if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
crypto_aead_authsize(aead)) {
- pr_info("ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_aead_authsize(aead),
- aalg_desc->uinfo.auth.icv_fullbits / 8);
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto free_key;
}
err = crypto_aead_setauthsize(
aead, x->aalg->alg_trunc_len / 8);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto free_key;
+ }
}
param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
@@ -846,21 +1181,22 @@ error:
return err;
}
-static int esp6_init_state(struct xfrm_state *x)
+static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct crypto_aead *aead;
u32 align;
int err;
- if (x->encap)
- return -EINVAL;
-
x->data = NULL;
- if (x->aead)
- err = esp_init_aead(x);
- else
- err = esp_init_authenc(x);
+ if (x->aead) {
+ err = esp_init_aead(x, extack);
+ } else if (x->ealg) {
+ err = esp_init_authenc(x, extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+ err = -EINVAL;
+ }
if (err)
goto error;
@@ -883,6 +1219,31 @@ static int esp6_init_state(struct xfrm_state *x)
break;
}
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
+ err = -EINVAL;
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+ break;
+#ifdef CONFIG_INET6_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
+ break;
+#endif
+ }
+ }
+
align = ALIGN(crypto_aead_blocksize(aead), 4);
x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
@@ -896,7 +1257,6 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type esp6_type = {
- .description = "ESP6",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
.flags = XFRM_TYPE_REPLAY_PROT,
@@ -904,11 +1264,11 @@ static const struct xfrm_type esp6_type = {
.destructor = esp6_destroy,
.input = esp6_input,
.output = esp6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
static struct xfrm6_protocol esp6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
.cb_handler = esp6_rcv_cb,
.err_handler = esp6_err,
.priority = 0,
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index fd535053245b..79d43548279c 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -16,6 +16,7 @@
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <net/gro.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
@@ -85,10 +86,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
sp->olen++;
xo = xfrm_offload(skb);
- if (!xo) {
- xfrm_state_put(x);
+ if (!xo)
goto out_reset;
- }
}
xo->flags |= XFRM_GRO;
@@ -123,9 +122,16 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
struct ip_esp_hdr *esph;
struct ipv6hdr *iph = ipv6_hdr(skb);
struct xfrm_offload *xo = xfrm_offload(skb);
- int proto = iph->nexthdr;
+ u8 proto = iph->nexthdr;
skb_push(skb, -skb_network_offset(skb));
+
+ if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) {
+ __be16 frag;
+
+ ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag);
+ }
+
esph = ip_esp_hdr(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
@@ -139,8 +145,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
{
- __skb_push(skb, skb->mac_len);
- return skb_mac_gso_segment(skb, features);
+ __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP)
+ : htons(ETH_P_IPV6);
+
+ return skb_eth_gso_segment(skb, features, type);
}
static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
@@ -159,6 +167,51 @@ static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
return segs;
}
+static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ u8 proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (x->sel.family != AF_INET6) {
+ skb->transport_header -=
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph =
+ (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ }
+
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ } else {
+ __be16 frag;
+
+ skb->transport_header +=
+ ipv6_skip_exthdr(skb, 0, &proto, &frag);
+ }
+
+ if (proto == IPPROTO_IPIP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
@@ -168,6 +221,8 @@ static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
return xfrm6_tunnel_gso_segment(x, skb, features);
case XFRM_MODE_TRANSPORT:
return xfrm6_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm6_beet_gso_segment(x, skb, features);
}
return ERR_PTR(-EOPNOTSUPP);
@@ -205,9 +260,11 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
- esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+ esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
- esp_features = features & ~NETIF_F_CSUM_MASK;
+ esp_features = features & ~(NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
xo->flags |= XFRM_GSO_SEGMENT;
@@ -235,7 +292,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
int alen;
int blksize;
struct xfrm_offload *xo;
- struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct esp_info esp;
bool hw_offload = true;
@@ -268,7 +324,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
esp.plen = esp.clen - skb->len - esp.tfclen;
esp.tailen = esp.tfclen + esp.plen + alen;
- if (!hw_offload || (hw_offload && !skb_is_gso(skb))) {
+ if (!hw_offload || !skb_is_gso(skb)) {
esp.nfrags = esp6_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
@@ -276,13 +332,13 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
seq = xo->seq.low;
- esph = ip_esp_hdr(skb);
- esph->spi = x->id.spi;
+ esp.esph = ip_esp_hdr(skb);
+ esp.esph->spi = x->id.spi;
skb_push(skb, -skb_network_offset(skb));
if (xo->flags & XFRM_GSO_SEGMENT) {
- esph->seq_no = htonl(seq);
+ esp.esph->seq_no = htonl(seq);
if (!skb_is_gso(skb))
xo->seq.low++;
@@ -298,8 +354,17 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
ipv6_hdr(skb)->payload_len = htons(len);
- if (hw_offload)
+ if (hw_offload) {
+ if (!skb_ext_add(skb, SKB_EXT_SEC_PATH))
+ return -ENOMEM;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ return -EINVAL;
+
+ xo->flags |= XFRM_XMIT;
return 0;
+ }
err = esp6_output_tail(x, skb, &esp);
if (err)
@@ -318,7 +383,6 @@ static const struct net_offload esp6_offload = {
};
static const struct xfrm_type_offload esp6_type_offload = {
- .description = "ESP6 OFFLOAD",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
.input_tail = esp6_input_tail,
@@ -347,3 +411,4 @@ module_exit(esp6_offload_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP);
+MODULE_DESCRIPTION("IPV6 GSO/GRO offload support");
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index ab5add0fe6b4..a8d961d3a477 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -48,22 +48,13 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <net/rpl.h>
+#include <linux/ioam6.h>
+#include <net/ioam6.h>
+#include <net/dst_metadata.h>
#include <linux/uaccess.h>
-/*
- * Parsing tlv encoded headers.
- *
- * Parsing function "func" returns true, if parsing succeed
- * and false, if it failed.
- * It MUST NOT touch skb->h.
- */
-
-struct tlvtype_proc {
- int type;
- bool (*func)(struct sk_buff *skb, int offset);
-};
-
/*********************
Generic functions
*********************/
@@ -97,27 +88,35 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff,
*/
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
break;
- /* fall through */
+ fallthrough;
case 2: /* send ICMP PARM PROB regardless and drop packet */
- icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
+ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff,
+ SKB_DROP_REASON_UNHANDLED_PROTO);
return false;
}
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
return false;
}
+static bool ipv6_hop_ra(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static bool ipv6_dest_hao(struct sk_buff *skb, int optoff);
+#endif
+
/* Parse tlv encoded option header (hop-by-hop or destination) */
-static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
+static bool ip6_parse_tlv(bool hopbyhop,
struct sk_buff *skb,
int max_count)
{
int len = (skb_transport_header(skb)[1] + 1) << 3;
const unsigned char *nh = skb_network_header(skb);
int off = skb_network_header_len(skb);
- const struct tlvtype_proc *curr;
bool disallow_unknowns = false;
int tlv_count = 0;
int padlen = 0;
@@ -134,18 +133,23 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
len -= 2;
while (len > 0) {
- int optlen = nh[off + 1] + 2;
- int i;
+ int optlen, i;
- switch (nh[off]) {
- case IPV6_TLV_PAD1:
- optlen = 1;
+ if (nh[off] == IPV6_TLV_PAD1) {
padlen++;
if (padlen > 7)
goto bad;
- break;
+ off++;
+ len--;
+ continue;
+ }
+ if (len < 2)
+ goto bad;
+ optlen = nh[off + 1] + 2;
+ if (optlen > len)
+ goto bad;
- case IPV6_TLV_PADN:
+ if (nh[off] == IPV6_TLV_PADN) {
/* RFC 2460 states that the purpose of PadN is
* to align the containing header to multiples
* of 8. 7 is therefore the highest valid value.
@@ -162,32 +166,51 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
if (nh[off + i] != 0)
goto bad;
}
- break;
-
- default: /* Other TLV code so scan list */
- if (optlen > len)
- goto bad;
-
+ } else {
tlv_count++;
if (tlv_count > max_count)
goto bad;
- for (curr = procs; curr->type >= 0; curr++) {
- if (curr->type == nh[off]) {
- /* type specific length/alignment
- checks will be performed in the
- func(). */
- if (curr->func(skb, off) == false)
+ if (hopbyhop) {
+ switch (nh[off]) {
+ case IPV6_TLV_ROUTERALERT:
+ if (!ipv6_hop_ra(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_IOAM:
+ if (!ipv6_hop_ioam(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_JUMBO:
+ if (!ipv6_hop_jumbo(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_CALIPSO:
+ if (!ipv6_hop_calipso(skb, off))
+ return false;
+ break;
+ default:
+ if (!ip6_tlvopt_unknown(skb, off,
+ disallow_unknowns))
+ return false;
+ break;
+ }
+ } else {
+ switch (nh[off]) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_TLV_HAO:
+ if (!ipv6_dest_hao(skb, off))
+ return false;
+ break;
+#endif
+ default:
+ if (!ip6_tlvopt_unknown(skb, off,
+ disallow_unknowns))
return false;
break;
}
}
- if (curr->type < 0 &&
- !ip6_tlvopt_unknown(skb, off, disallow_unknowns))
- return false;
-
padlen = 0;
- break;
}
off += optlen;
len -= optlen;
@@ -196,7 +219,7 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
if (len == 0)
return true;
bad:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return false;
}
@@ -210,6 +233,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
struct ipv6_destopt_hao *hao;
struct inet6_skb_parm *opt = IP6CB(skb);
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ SKB_DR(reason);
int ret;
if (opt->dsthao) {
@@ -224,19 +248,23 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
if (hao->length != 16) {
net_dbg_ratelimited("hao invalid option length = %d\n",
hao->length);
+ SKB_DR_SET(reason, IP_INHDR);
goto discard;
}
if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
net_dbg_ratelimited("hao is not an unicast addr: %pI6\n",
&hao->addr);
+ SKB_DR_SET(reason, INVALID_PROTO);
goto discard;
}
ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
(xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
- if (unlikely(ret < 0))
+ if (unlikely(ret < 0)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto discard;
+ }
if (skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
@@ -259,21 +287,11 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
return true;
discard:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return false;
}
#endif
-static const struct tlvtype_proc tlvprocdestopt_lst[] = {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- {
- .type = IPV6_TLV_HAO,
- .func = ipv6_dest_hao,
- },
-#endif
- {-1, NULL}
-};
-
static int ipv6_destopt_rcv(struct sk_buff *skb)
{
struct inet6_dev *idev = __in6_dev_get(skb->dev);
@@ -304,8 +322,7 @@ fail_and_free:
dstbuf = opt->dst1;
#endif
- if (ip6_parse_tlv(tlvprocdestopt_lst, skb,
- init_net.ipv6.sysctl.max_dst_opts_cnt)) {
+ if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) {
skb->transport_header += extlen;
opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
@@ -380,7 +397,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
looped_back:
if (hdr->segments_left == 0) {
- if (hdr->nexthdr == NEXTHDR_IPV6) {
+ if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) {
int offset = (hdr->hdrlen + 1) << 3;
skb_postpull_rcsum(skb, skb_network_header(skb),
@@ -396,7 +413,8 @@ looped_back:
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->encapsulation = 0;
-
+ if (hdr->nexthdr == NEXTHDR_IPV4)
+ skb->protocol = htons(ETH_P_IP);
__skb_tunnel_rx(skb, skb->dev, net);
netif_rx(skb);
@@ -468,6 +486,192 @@ looped_back:
return -1;
}
+static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
+{
+ struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct inet6_dev *idev;
+ struct ipv6hdr *oldhdr;
+ unsigned char *buf;
+ int accept_rpl_seg;
+ int i, err;
+ u64 n = 0;
+ u32 r;
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled;
+ if (accept_rpl_seg > idev->cnf.rpl_seg_enabled)
+ accept_rpl_seg = idev->cnf.rpl_seg_enabled;
+
+ if (!accept_rpl_seg) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+looped_back:
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+
+ if (!pskb_pull(skb, offset)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(*hdr))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre);
+ r = do_div(n, (16 - hdr->cmpri));
+ /* checks if calculation was without remainder and n fits into
+ * unsigned char which is segments_left field. Should not be
+ * higher than that.
+ */
+ if (r || (n + 1) > 255) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (hdr->segments_left > n + 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE, 0,
+ GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+ } else {
+ err = skb_cow_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ }
+
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri,
+ hdr->cmpre))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ hdr->segments_left--;
+ i = n - hdr->segments_left;
+
+ buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC);
+ if (unlikely(!buf)) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ ohdr = (struct ipv6_rpl_sr_hdr *)buf;
+ ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n);
+ chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3));
+
+ if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) ||
+ (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) {
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1);
+ if (err) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]);
+
+ ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n);
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, ((hdr->hdrlen + 1) << 3));
+ skb_postpull_rcsum(skb, oldhdr,
+ sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3));
+ skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr));
+ memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3);
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_postpush_rcsum(skb, ipv6_hdr(skb),
+ sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3));
+
+ kfree(buf);
+
+ skb_dst_drop(skb);
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
/********************************
Routing header.
********************************/
@@ -485,7 +689,6 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
int accept_source_route = net->ipv6.devconf_all->accept_source_route;
- idev = __in6_dev_get(skb->dev);
if (idev && accept_source_route > idev->cnf.accept_source_route)
accept_source_route = idev->cnf.accept_source_route;
@@ -506,9 +709,16 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
return -1;
}
- /* segment routing */
- if (hdr->type == IPV6_SRCRT_TYPE_4)
+ switch (hdr->type) {
+ case IPV6_SRCRT_TYPE_4:
+ /* segment routing */
return ipv6_srh_rcv(skb);
+ case IPV6_SRCRT_TYPE_3:
+ /* rpl segment routing */
+ return ipv6_rpl_srh_rcv(skb);
+ default:
+ break;
+ }
looped_back:
if (hdr->segments_left == 0) {
@@ -709,11 +919,6 @@ void ipv6_exthdrs_exit(void)
/*
* Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input().
*/
-static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb)
-{
- return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev);
-}
-
static inline struct net *ipv6_skb_net(struct sk_buff *skb)
{
return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev);
@@ -732,7 +937,61 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
}
net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n",
nh[optoff + 1]);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ return false;
+}
+
+/* IOAM */
+
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
+{
+ struct ioam6_trace_hdr *trace;
+ struct ioam6_namespace *ns;
+ struct ioam6_hdr *hdr;
+
+ /* Bad alignment (must be 4n-aligned) */
+ if (optoff & 3)
+ goto drop;
+
+ /* Ignore if IOAM is not enabled on ingress */
+ if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled)
+ goto ignore;
+
+ /* Truncated Option header */
+ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
+ if (hdr->opt_len < 2)
+ goto drop;
+
+ switch (hdr->type) {
+ case IOAM6_TYPE_PREALLOC:
+ /* Truncated Pre-allocated Trace header */
+ if (hdr->opt_len < 2 + sizeof(*trace))
+ goto drop;
+
+ /* Malformed Pre-allocated Trace header */
+ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));
+ if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4)
+ goto drop;
+
+ /* Ignore if the IOAM namespace is unknown */
+ ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id);
+ if (!ns)
+ goto ignore;
+
+ if (!skb_valid_dst(skb))
+ ip6_route_input(skb);
+
+ ioam6_fill_trace_data(skb, ns, trace, true);
+ break;
+ default:
+ break;
+ }
+
+ignore:
+ return true;
+
+drop:
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return false;
}
@@ -741,31 +1000,30 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
{
const unsigned char *nh = skb_network_header(skb);
- struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
- struct net *net = ipv6_skb_net(skb);
+ SKB_DR(reason);
u32 pkt_len;
if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
nh[optoff+1]);
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ SKB_DR_SET(reason, IP_INHDR);
goto drop;
}
pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
+ icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2,
+ SKB_DROP_REASON_IP_INHDR);
return false;
}
if (ipv6_hdr(skb)->payload_len) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
+ icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff,
+ SKB_DROP_REASON_IP_INHDR);
return false;
}
if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ SKB_DR_SET(reason, PKT_TOO_SMALL);
goto drop;
}
@@ -776,7 +1034,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
return true;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return false;
}
@@ -798,26 +1056,10 @@ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
return true;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return false;
}
-static const struct tlvtype_proc tlvprochopopt_lst[] = {
- {
- .type = IPV6_TLV_ROUTERALERT,
- .func = ipv6_hop_ra,
- },
- {
- .type = IPV6_TLV_JUMBO,
- .func = ipv6_hop_jumbo,
- },
- {
- .type = IPV6_TLV_CALIPSO,
- .func = ipv6_hop_calipso,
- },
- { -1, }
-};
-
int ipv6_parse_hopopts(struct sk_buff *skb)
{
struct inet6_skb_parm *opt = IP6CB(skb);
@@ -843,8 +1085,7 @@ fail_and_free:
goto fail_and_free;
opt->flags |= IP6SKB_HOPBYHOP;
- if (ip6_parse_tlv(tlvprochopopt_lst, skb,
- init_net.ipv6.sysctl.max_hbh_opts_cnt)) {
+ if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) {
skb->transport_header += extlen;
opt = IP6CB(skb);
opt->nhoff = sizeof(struct ipv6hdr);
@@ -1035,7 +1276,6 @@ static void ipv6_renew_option(int renewtype,
* @opt: original options
* @newtype: option type to replace in @opt
* @newopt: new option of type @newtype to replace (user-mem)
- * @newoptlen: length of @newopt
*
* Returns a new set of options which is a copy of @opt with the
* option type @newtype replaced with @newopt.
@@ -1106,14 +1346,14 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return opt2;
}
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
- struct ipv6_txoptions *opt)
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
{
/*
* ignore the dest before srcrt unless srcrt is being included.
* --yoshfuji
*/
- if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt->dst0opt && !opt->srcrt) {
if (opt_space != opt) {
memcpy(opt_space, opt, sizeof(*opt_space));
opt = opt_space;
@@ -1124,7 +1364,7 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
return opt;
}
-EXPORT_SYMBOL_GPL(ipv6_fixup_options);
+EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
/**
* fl6_update_dst - update flowi destination address with info given
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index fafe556d21e0..7c2003833010 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -13,8 +13,10 @@
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/export.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/fib_rules.h>
+#include <net/inet_dscp.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
@@ -24,14 +26,14 @@ struct fib6_rule {
struct fib_rule common;
struct rt6key src;
struct rt6key dst;
- u8 tclass;
+ dscp_t dscp;
};
static bool fib6_rule_matchall(const struct fib_rule *rule)
{
struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
- if (r->dst.plen || r->src.plen || r->tclass)
+ if (r->dst.plen || r->src.plen || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -111,11 +113,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
} else {
struct rt6_info *rt;
- rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
return &rt->dst;
ip6_rt_put_flags(rt, flags);
- rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error != -EAGAIN)
return &rt->dst;
ip6_rt_put_flags(rt, flags);
@@ -226,7 +230,8 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
- rt = lookup(net, table, flp6, arg->lookup_data, flags);
+ rt = pol_lookup_func(lookup,
+ net, table, flp6, arg->lookup_data, flags);
if (rt != net->ipv6.ip6_null_entry) {
err = fib6_rule_saddr(net, rule, flags, flp6,
ip6_dst_idev(&rt->dst)->dev);
@@ -252,8 +257,9 @@ out:
return err;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule,
+ struct flowi *flp, int flags,
+ struct fib_lookup_arg *arg)
{
if (arg->lookup_ptr == fib6_table_lookup)
return fib6_rule_action_alt(rule, flp, flags, arg);
@@ -261,7 +267,9 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
return __fib6_rule_action(rule, flp, flags, arg);
}
-static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
+ int flags,
+ struct fib_lookup_arg *arg)
{
struct fib6_result *res = arg->result;
struct rt6_info *rt = res->rt6;
@@ -288,12 +296,12 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
return false;
suppress_route:
- if (!(arg->flags & FIB_LOOKUP_NOREF))
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
return true;
}
-static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
+ struct flowi *fl, int flags)
{
struct fib6_rule *r = (struct fib6_rule *) rule;
struct flowi6 *fl6 = &fl->u.ip6;
@@ -316,7 +324,7 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
return 0;
}
- if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
+ if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
@@ -333,10 +341,6 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
return 1;
}
-static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
- FRA_GENERIC_POLICY,
-};
-
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
@@ -346,6 +350,13 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ rule6->dscp = inet_dsfield_to_dscp(frh->tos);
+
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid table");
@@ -366,7 +377,6 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule6->src.plen = frh->src_len;
rule6->dst.plen = frh->dst_len;
- rule6->tclass = frh->tos;
if (fib_rule_requires_fldissect(rule))
net->ipv6.fib6_rules_require_fldissect++;
@@ -399,7 +409,7 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
return 0;
- if (frh->tos && (rule6->tclass != frh->tos))
+ if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos)
return 0;
if (frh->src_len &&
@@ -420,7 +430,7 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule6->dst.plen;
frh->src_len = rule6->src.plen;
- frh->tos = rule6->tclass;
+ frh->tos = inet_dscp_to_dsfield(rule6->dscp);
if ((rule6->dst.plen &&
nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
@@ -452,7 +462,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
.fill = fib6_rule_fill,
.nlmsg_payload = fib6_rule_nlmsg_payload,
.nlgroup = RTNLGRP_IPV6_RULE,
- .policy = fib6_rule_policy,
.owner = THIS_MODULE,
.fro_net = &init_net,
};
@@ -460,7 +469,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
static int __net_init fib6_rules_net_init(struct net *net)
{
struct fib_rules_ops *ops;
- int err = -ENOMEM;
+ int err;
ops = fib_rules_register(&fib6_rules_ops_template, net);
if (IS_ERR(ops))
@@ -484,16 +493,21 @@ out_fib6_rules_ops:
goto out;
}
-static void __net_exit fib6_rules_net_exit(struct net *net)
+static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ list_for_each_entry(net, net_list, exit_list) {
+ fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ cond_resched();
+ }
rtnl_unlock();
}
static struct pernet_operations fib6_rules_net_ops = {
.init = fib6_rules_net_init,
- .exit = fib6_rules_net_exit,
+ .exit_batch = fib6_rules_net_exit_batch,
};
int __init fib6_rules_init(void)
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
index 091f94184dc1..430518ae26fa 100644
--- a/net/ipv6/fou6.c
+++ b/net/ipv6/fou6.c
@@ -224,3 +224,4 @@ module_init(fou6_init);
module_exit(fou6_fini);
MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Foo over UDP (IPv6)");
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ef408a5090a2..9d92d51c4757 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -57,6 +57,7 @@
#include <net/protocol.h>
#include <net/raw.h>
#include <net/rawv6.h>
+#include <net/seg6.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
@@ -68,17 +69,7 @@
#include <linux/uaccess.h>
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmpv6_sk(struct net *net)
-{
- return this_cpu_read(*net->ipv6.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);
static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
@@ -109,11 +100,11 @@ static const struct inet6_protocol icmpv6_protocol = {
};
/* Called with BH disabled */
-static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
+static struct sock *icmpv6_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmpv6_sk(net);
+ sk = this_cpu_read(ipv6_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path (f.e. SIT or
* ip6ip6 tunnel) signals dst_link_failure() for an
@@ -121,11 +112,13 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
-static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
+static void icmpv6_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
@@ -158,7 +151,13 @@ static bool is_ineligible(const struct sk_buff *skb)
tp = skb_header_pointer(skb,
ptr+offsetof(struct icmp6hdr, icmp6_type),
sizeof(_type), &_type);
- if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
+
+ /* Based on RFC 8200, Section 4.5 Fragment Header, return
+ * false if this is a fragment packet with no icmp header info.
+ */
+ if (!tp && frag_off != 0)
+ return false;
+ else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
return true;
}
return false;
@@ -229,6 +228,25 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
return res;
}
+static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
+ struct flowi6 *fl6)
+{
+ struct net *net = sock_net(sk);
+ struct dst_entry *dst;
+ bool res = false;
+
+ dst = ip6_route_output(net, sk, fl6);
+ if (!dst->error) {
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ struct in6_addr prefsrc;
+
+ rt6_get_prefsrc(rt, &prefsrc);
+ res = !ipv6_addr_any(&prefsrc);
+ }
+ dst_release(dst);
+ return res;
+}
+
/*
* an inline helper for the "simple" if statement below
* checks if parameter problem report is caused by an
@@ -295,10 +313,10 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
{
struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
struct sk_buff *org_skb = msg->skb;
- __wsum csum = 0;
+ __wsum csum;
csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
- to, len, csum);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (!(msg->type & ICMPV6_INFOMSG_MASK))
nf_ct_attach(skb, org_skb);
@@ -306,10 +324,9 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
}
#if IS_ENABLED(CONFIG_IPV6_MIP6)
-static void mip6_addr_swap(struct sk_buff *skb)
+static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- struct inet6_skb_parm *opt = IP6CB(skb);
struct ipv6_destopt_hao *hao;
struct in6_addr tmp;
int off;
@@ -326,7 +343,7 @@ static void mip6_addr_swap(struct sk_buff *skb)
}
}
#else
-static inline void mip6_addr_swap(struct sk_buff *skb) {}
+static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
#endif
static struct dst_entry *icmpv6_route_lookup(struct net *net,
@@ -420,8 +437,9 @@ static int icmp6_iif(const struct sk_buff *skb)
/*
* Send an ICMP message in response to a packet in error
*/
-static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
- const struct in6_addr *force_saddr)
+void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct in6_addr *force_saddr,
+ const struct inet6_skb_parm *parm)
{
struct inet6_dev *idev = NULL;
struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -482,8 +500,11 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (__ipv6_addr_needs_scope_id(addr_type)) {
iif = icmp6_iif(skb);
} else {
- dst = skb_dst(skb);
- iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+ /*
+ * The source device is used for looking up which routing table
+ * to use for sending an ICMP error.
+ */
+ iif = l3mdev_master_ifindex(skb->dev);
}
/*
@@ -514,7 +535,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
goto out_bh_enable;
- mip6_addr_swap(skb);
+ mip6_addr_swap(skb, parm);
sk = icmpv6_xmit_lock(net);
if (!sk)
@@ -527,11 +548,11 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
saddr = force_saddr;
if (saddr) {
fl6.saddr = *saddr;
- } else {
+ } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) {
/* select a more meaningful saddr from input if */
struct net_device *in_netdev;
- in_netdev = dev_get_by_index(net, IP6CB(skb)->iif);
+ in_netdev = dev_get_by_index(net, parm->iif);
if (in_netdev) {
ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
inet6_sk(sk)->srcprefs,
@@ -545,9 +566,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.fl6_icmp_code = code;
fl6.flowi6_uid = sock_net_uid(net, NULL);
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
- sk->sk_mark = mark;
np = inet6_sk(sk);
if (!icmpv6_xrlim_allow(sk, type, &fl6))
@@ -564,6 +584,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.flowi6_oif = np->ucast_oif;
ipcm6_init_sk(&ipc6, np);
+ ipc6.sockc.mark = mark;
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
@@ -606,13 +627,15 @@ out:
out_bh_enable:
local_bh_enable();
}
+EXPORT_SYMBOL(icmp6_send);
-/* Slightly more convenient version of icmp6_send.
+/* Slightly more convenient version of icmp6_send with drop reasons.
*/
-void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
+void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
+ enum skb_drop_reason reason)
{
- icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
- kfree_skb(skb);
+ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
+ kfree_skb_reason(skb, reason);
}
/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
@@ -668,10 +691,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
}
if (type == ICMP_TIME_EXCEEDED)
icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
- info, &temp_saddr);
+ info, &temp_saddr, IP6CB(skb2));
else
icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
- info, &temp_saddr);
+ info, &temp_saddr, IP6CB(skb2));
if (rt)
ip6_rt_put(rt);
@@ -696,6 +719,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
struct ipcm6_cookie ipc6;
u32 mark = IP6_REPLY_MARK(net, skb->mark);
bool acast;
+ u8 type;
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) &&
net->ipv6.sysctl.icmpv6_echo_ignore_multicast)
@@ -711,8 +735,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
!(net->ipv6.sysctl.anycast_src_echo_reply && acast))
saddr = NULL;
+ if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
+ type = ICMPV6_EXT_ECHO_REPLY;
+ else
+ type = ICMPV6_ECHO_REPLY;
+
memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
- tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
+ tmp_hdr.icmp6_type = type;
memset(&fl6, 0, sizeof(fl6));
if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
@@ -723,16 +752,15 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (saddr)
fl6.saddr = *saddr;
fl6.flowi6_oif = icmp6_iif(skb);
- fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+ fl6.fl6_icmp_type = type;
fl6.flowi6_mark = mark;
fl6.flowi6_uid = sock_net_uid(net, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
local_bh_disable();
sk = icmpv6_xmit_lock(net);
if (!sk)
goto out_bh_enable;
- sk->sk_mark = mark;
np = inet6_sk(sk);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
@@ -755,11 +783,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
msg.skb = skb;
msg.offset = 0;
- msg.type = ICMPV6_ECHO_REPLY;
+ msg.type = type;
ipcm6_init_sk(&ipc6, np);
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ ipc6.sockc.mark = mark;
+
+ if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
+ if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr))
+ goto out_dst_release;
if (ip6_append_data(sk, icmpv6_getfrag, &msg,
skb->len + sizeof(struct icmp6hdr),
@@ -781,6 +814,7 @@ out_bh_enable:
void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
{
+ struct inet6_skb_parm *opt = IP6CB(skb);
const struct inet6_protocol *ipprot;
int inner_offset;
__be16 frag_off;
@@ -790,6 +824,8 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto out;
+ seg6_icmp_srh(skb, opt);
+
nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
if (ipv6_ext_hdr(nexthdr)) {
/* now skip over extension headers */
@@ -814,7 +850,7 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
ipprot = rcu_dereference(inet6_protos[nexthdr]);
if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
+ ipprot->err_handler(skb, opt, type, code, inner_offset, info);
raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
return;
@@ -829,21 +865,23 @@ out:
static int icmpv6_rcv(struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct net *net = dev_net(skb->dev);
struct net_device *dev = icmp6_dev(skb);
struct inet6_dev *idev = __in6_dev_get(dev);
const struct in6_addr *saddr, *daddr;
struct icmp6hdr *hdr;
u8 type;
- bool success = false;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
int nh;
if (!(sp && sp->xvec[sp->len - 1]->props.flags &
- XFRM_STATE_ICMP))
+ XFRM_STATE_ICMP)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop_no_count;
+ }
if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
goto drop_no_count;
@@ -851,8 +889,11 @@ static int icmpv6_rcv(struct sk_buff *skb)
nh = skb_network_offset(skb);
skb_set_network_header(skb, sizeof(*hdr));
- if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN,
+ skb)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop_no_count;
+ }
skb_set_network_header(skb, nh);
}
@@ -882,9 +923,18 @@ static int icmpv6_rcv(struct sk_buff *skb)
if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
icmpv6_echo_reply(skb);
break;
+ case ICMPV6_EXT_ECHO_REQUEST:
+ if (!net->ipv6.sysctl.icmpv6_echo_ignore_all &&
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
+ icmpv6_echo_reply(skb);
+ break;
case ICMPV6_ECHO_REPLY:
- success = ping_rcv(skb);
+ reason = ping_rcv(skb);
+ break;
+
+ case ICMPV6_EXT_ECHO_REPLY:
+ reason = ping_rcv(skb);
break;
case ICMPV6_PKT_TOOBIG:
@@ -898,7 +948,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
hdr = icmp6_hdr(skb);
/* to notify */
- /* fall through */
+ fallthrough;
case ICMPV6_DEST_UNREACH:
case ICMPV6_TIME_EXCEED:
case ICMPV6_PARAMPROB:
@@ -915,11 +965,11 @@ static int icmpv6_rcv(struct sk_buff *skb)
case ICMPV6_MGM_QUERY:
igmp6_event_query(skb);
- break;
+ return 0;
case ICMPV6_MGM_REPORT:
igmp6_event_report(skb);
- break;
+ return 0;
case ICMPV6_MGM_REDUCTION:
case ICMPV6_NI_QUERY:
@@ -950,19 +1000,20 @@ static int icmpv6_rcv(struct sk_buff *skb)
/* until the v6 path can be better sorted assume failure and
* preserve the status quo behaviour for the rest of the paths to here
*/
- if (success)
- consume_skb(skb);
+ if (reason)
+ kfree_skb_reason(skb, reason);
else
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
csum_error:
+ reason = SKB_DROP_REASON_ICMP_CSUM;
__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
}
@@ -979,62 +1030,30 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
fl6->fl6_icmp_type = type;
fl6->fl6_icmp_code = 0;
fl6->flowi6_oif = oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
-}
-
-static void __net_exit icmpv6_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv6.icmp_sk, i));
- free_percpu(net->ipv6.icmp_sk);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
-static int __net_init icmpv6_sk_init(struct net *net)
+int __init icmpv6_init(void)
{
struct sock *sk;
int err, i;
- net->ipv6.icmp_sk = alloc_percpu(struct sock *);
- if (!net->ipv6.icmp_sk)
- return -ENOMEM;
-
for_each_possible_cpu(i) {
err = inet_ctl_sock_create(&sk, PF_INET6,
- SOCK_RAW, IPPROTO_ICMPV6, net);
+ SOCK_RAW, IPPROTO_ICMPV6, &init_net);
if (err < 0) {
pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
err);
- goto fail;
+ return err;
}
- *per_cpu_ptr(net->ipv6.icmp_sk, i) = sk;
+ per_cpu(ipv6_icmp_sk, i) = sk;
/* Enough space for 2 64K ICMP packets, including
* sk_buff struct overhead.
*/
sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
}
- return 0;
-
- fail:
- icmpv6_sk_exit(net);
- return err;
-}
-
-static struct pernet_operations icmpv6_sk_ops = {
- .init = icmpv6_sk_init,
- .exit = icmpv6_sk_exit,
-};
-
-int __init icmpv6_init(void)
-{
- int err;
-
- err = register_pernet_subsys(&icmpv6_sk_ops);
- if (err < 0)
- return err;
err = -EAGAIN;
if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
@@ -1049,14 +1068,12 @@ sender_reg_err:
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
fail:
pr_err("Failed to register ICMP6 protocol\n");
- unregister_pernet_subsys(&icmpv6_sk_ops);
return err;
}
void icmpv6_cleanup(void)
{
inet6_unregister_icmp_sender(icmp6_send);
- unregister_pernet_subsys(&icmpv6_sk_ops);
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}
@@ -1140,23 +1157,23 @@ static struct ctl_table ipv6_icmp_table_template[] = {
{
.procname = "echo_ignore_all",
.data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "echo_ignore_multicast",
.data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "echo_ignore_anycast",
.data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "ratemask",
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index bb6fc0d54dae..ad5f6f6ba333 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -68,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr)
return (struct ila_addr *)addr;
}
-static inline bool ila_addr_is_ila(struct ila_addr *iaddr)
-{
- return (iaddr->ident.type != ILA_ATYPE_IID);
-}
-
struct ila_params {
struct ila_locator locator;
struct ila_locator locator_match;
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 422dcc691f71..8c1ce78956ba 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -125,7 +125,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
};
-static int ila_build_state(struct nlattr *nla,
+static int ila_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
index 257d2b681246..3faf62530d6a 100644
--- a/net/ipv6/ila/ila_main.c
+++ b/net/ipv6/ila/ila_main.c
@@ -55,6 +55,7 @@ struct genl_family ila_nl_family __ro_after_init = {
.module = THIS_MODULE,
.ops = ila_nl_ops,
.n_ops = ARRAY_SIZE(ila_nl_ops),
+ .resv_start_op = ILA_CMD_FLUSH + 1,
};
static __net_init int ila_init_net(struct net *net)
@@ -120,3 +121,4 @@ module_init(ila_init);
module_exit(ila_fini);
MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 5fc1f4e0c0cf..47447f0241df 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -601,8 +601,6 @@ out_ret:
return ret;
}
-#define ILA_HASH_TABLE_SIZE 1024
-
int ila_xlat_init_net(struct net *net)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -612,7 +610,11 @@ int ila_xlat_init_net(struct net *net)
if (err)
return err;
- rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
+ err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
+ if (err) {
+ free_bucket_spinlocks(ilan->xlat.locks);
+ return err;
+ }
return 0;
}
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e315526fa244..5a9f4d722f35 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -46,7 +46,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
fl6->flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(dst))
@@ -95,7 +95,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
fl6->flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
rcu_read_lock();
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index fbe9d4295eac..b64b49012655 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -69,12 +69,12 @@ begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
continue;
- if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))
+ if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
continue;
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
goto out;
- if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) {
+ if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) {
sock_gen_put(sk);
goto begin;
}
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(__inet6_lookup_established);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum,
const struct in6_addr *daddr,
- const int dif, const int sdif, bool exact_dif)
+ const int dif, const int sdif)
{
int score = -1;
@@ -104,13 +104,30 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
return -1;
- score = 1;
+ score = sk->sk_bound_dev_if ? 2 : 1;
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
}
return score;
}
+static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned short hnum)
+{
+ struct sock *reuse_sk = NULL;
+ u32 phash;
+
+ if (sk->sk_reuseport) {
+ phash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+ reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+ }
+ return reuse_sk;
+}
+
/* called with rcu_read_lock() */
static struct sock *inet6_lhash2_lookup(struct net *net,
struct inet_listen_hashbucket *ilb2,
@@ -119,25 +136,18 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif, const int sdif)
{
- bool exact_dif = inet6_exact_dif_match(net, skb);
- struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
+ struct hlist_nulls_node *node;
int score, hiscore = 0;
- u32 phash = 0;
- inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
- sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr, dif, sdif,
- exact_dif);
+ sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum);
+ if (result)
+ return result;
+
result = sk;
hiscore = score;
}
@@ -146,6 +156,31 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
return result;
}
+static inline struct sock *inet6_lookup_run_bpf(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum, const int dif)
+{
+ struct sock *sk, *reuse_sk;
+ bool no_reuseport;
+
+ if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
+ return NULL; /* only TCP is supported */
+
+ no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport,
+ daddr, hnum, dif, &sk);
+ if (no_reuseport || IS_ERR_OR_NULL(sk))
+ return sk;
+
+ reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+ if (reuse_sk)
+ sk = reuse_sk;
+ return sk;
+}
+
struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -157,6 +192,14 @@ struct sock *inet6_lookup_listener(struct net *net,
struct sock *result = NULL;
unsigned int hash2;
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+ result = inet6_lookup_run_bpf(net, hashinfo, skb, doff,
+ saddr, sport, daddr, hnum, dif);
+ if (result)
+ goto done;
+ }
+
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
@@ -223,7 +266,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
if (sk2->sk_hash != hash)
continue;
- if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports,
+ if (likely(inet6_match(net, sk2, saddr, daddr, ports,
dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
@@ -262,7 +305,7 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static u32 inet6_sk_port_offset(const struct sock *sk)
+static u64 inet6_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -274,7 +317,7 @@ static u32 inet6_sk_port_offset(const struct sock *sk)
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- u32 port_offset = 0;
+ u64 port_offset = 0;
if (!inet_sk(sk)->inet_num)
port_offset = inet6_sk_port_offset(sk);
@@ -287,11 +330,8 @@ int inet6_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
new file mode 100644
index 000000000000..571f0e4d9cf3
--- /dev/null
+++ b/net/ipv6/ioam6.c
@@ -0,0 +1,979 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IPv6 IOAM implementation
+ *
+ * Author:
+ * Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+
+#include <net/addrconf.h>
+#include <net/genetlink.h>
+#include <net/ioam6.h>
+#include <net/sch_generic.h>
+
+static void ioam6_ns_release(struct ioam6_namespace *ns)
+{
+ kfree_rcu(ns, rcu);
+}
+
+static void ioam6_sc_release(struct ioam6_schema *sc)
+{
+ kfree_rcu(sc, rcu);
+}
+
+static void ioam6_free_ns(void *ptr, void *arg)
+{
+ struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr;
+
+ if (ns)
+ ioam6_ns_release(ns);
+}
+
+static void ioam6_free_sc(void *ptr, void *arg)
+{
+ struct ioam6_schema *sc = (struct ioam6_schema *)ptr;
+
+ if (sc)
+ ioam6_sc_release(sc);
+}
+
+static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct ioam6_namespace *ns = obj;
+
+ return (ns->id != *(__be16 *)arg->key);
+}
+
+static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct ioam6_schema *sc = obj;
+
+ return (sc->id != *(u32 *)arg->key);
+}
+
+static const struct rhashtable_params rht_ns_params = {
+ .key_len = sizeof(__be16),
+ .key_offset = offsetof(struct ioam6_namespace, id),
+ .head_offset = offsetof(struct ioam6_namespace, head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = ioam6_ns_cmpfn,
+};
+
+static const struct rhashtable_params rht_sc_params = {
+ .key_len = sizeof(u32),
+ .key_offset = offsetof(struct ioam6_schema, id),
+ .head_offset = offsetof(struct ioam6_schema, head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = ioam6_sc_cmpfn,
+};
+
+static struct genl_family ioam6_genl_family;
+
+static const struct nla_policy ioam6_genl_policy_addns[] = {
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+ [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 },
+ [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 },
+};
+
+static const struct nla_policy ioam6_genl_policy_delns[] = {
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+};
+
+static const struct nla_policy ioam6_genl_policy_addsc[] = {
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+ [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY,
+ .len = IOAM6_MAX_SCHEMA_DATA_LEN },
+};
+
+static const struct nla_policy ioam6_genl_policy_delsc[] = {
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ioam6_genl_policy_ns_sc[] = {
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+ [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG },
+};
+
+static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ struct ioam6_namespace *ns;
+ u64 data64;
+ u32 data32;
+ __be16 id;
+ int err;
+
+ if (!info->attrs[IOAM6_ATTR_NS_ID])
+ return -EINVAL;
+
+ id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+ if (ns) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ns->id = id;
+
+ if (!info->attrs[IOAM6_ATTR_NS_DATA])
+ data32 = IOAM6_U32_UNAVAILABLE;
+ else
+ data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]);
+
+ if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE])
+ data64 = IOAM6_U64_UNAVAILABLE;
+ else
+ data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]);
+
+ ns->data = cpu_to_be32(data32);
+ ns->data_wide = cpu_to_be64(data64);
+
+ err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head,
+ rht_ns_params);
+ if (err)
+ kfree(ns);
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ struct ioam6_namespace *ns;
+ struct ioam6_schema *sc;
+ __be16 id;
+ int err;
+
+ if (!info->attrs[IOAM6_ATTR_NS_ID])
+ return -EINVAL;
+
+ id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+ if (!ns) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ sc = rcu_dereference_protected(ns->schema,
+ lockdep_is_held(&nsdata->lock));
+
+ err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head,
+ rht_ns_params);
+ if (err)
+ goto out_unlock;
+
+ if (sc)
+ rcu_assign_pointer(sc->ns, NULL);
+
+ ioam6_ns_release(ns);
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns,
+ u32 portid,
+ u32 seq,
+ u32 flags,
+ struct sk_buff *skb,
+ u8 cmd)
+{
+ struct ioam6_schema *sc;
+ u64 data64;
+ u32 data32;
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ data32 = be32_to_cpu(ns->data);
+ data64 = be64_to_cpu(ns->data_wide);
+
+ if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) ||
+ (data32 != IOAM6_U32_UNAVAILABLE &&
+ nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) ||
+ (data64 != IOAM6_U64_UNAVAILABLE &&
+ nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE,
+ data64, IOAM6_ATTR_PAD)))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+
+ sc = rcu_dereference(ns->schema);
+ if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+
+ rcu_read_unlock();
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpns_start(struct netlink_callback *cb)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&nsdata->namespaces, iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpns_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+ kfree(iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter;
+ struct ioam6_namespace *ns;
+ int err;
+
+ iter = (struct rhashtable_iter *)cb->args[0];
+ rhashtable_walk_start(iter);
+
+ for (;;) {
+ ns = rhashtable_walk_next(iter);
+
+ if (IS_ERR(ns)) {
+ if (PTR_ERR(ns) == -EAGAIN)
+ continue;
+ err = PTR_ERR(ns);
+ goto done;
+ } else if (!ns) {
+ break;
+ }
+
+ err = __ioam6_genl_dumpns_element(ns,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb,
+ IOAM6_CMD_DUMP_NAMESPACES);
+ if (err)
+ goto done;
+ }
+
+ err = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return err;
+}
+
+static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ int len, len_aligned, err;
+ struct ioam6_schema *sc;
+ u32 id;
+
+ if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA])
+ return -EINVAL;
+
+ id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+ if (sc) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]);
+ len_aligned = ALIGN(len, 4);
+
+ sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL);
+ if (!sc) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ sc->id = id;
+ sc->len = len_aligned;
+ sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24));
+ nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len);
+
+ err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head,
+ rht_sc_params);
+ if (err)
+ goto free_sc;
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+free_sc:
+ kfree(sc);
+ goto out_unlock;
+}
+
+static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_pernet_data *nsdata;
+ struct ioam6_namespace *ns;
+ struct ioam6_schema *sc;
+ int err;
+ u32 id;
+
+ if (!info->attrs[IOAM6_ATTR_SC_ID])
+ return -EINVAL;
+
+ id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+ if (!sc) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock));
+
+ err = rhashtable_remove_fast(&nsdata->schemas, &sc->head,
+ rht_sc_params);
+ if (err)
+ goto out_unlock;
+
+ if (ns)
+ rcu_assign_pointer(ns->schema, NULL);
+
+ ioam6_sc_release(sc);
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ struct ioam6_namespace *ns;
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) ||
+ nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+
+ ns = rcu_dereference(sc->ns);
+ if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+
+ rcu_read_unlock();
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpsc_start(struct netlink_callback *cb)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&nsdata->schemas, iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpsc_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter);
+ kfree(iter);
+
+ return 0;
+}
+
+static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter;
+ struct ioam6_schema *sc;
+ int err;
+
+ iter = (struct rhashtable_iter *)cb->args[0];
+ rhashtable_walk_start(iter);
+
+ for (;;) {
+ sc = rhashtable_walk_next(iter);
+
+ if (IS_ERR(sc)) {
+ if (PTR_ERR(sc) == -EAGAIN)
+ continue;
+ err = PTR_ERR(sc);
+ goto done;
+ } else if (!sc) {
+ break;
+ }
+
+ err = __ioam6_genl_dumpsc_element(sc,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ skb,
+ IOAM6_CMD_DUMP_SCHEMAS);
+ if (err)
+ goto done;
+ }
+
+ err = skb->len;
+
+done:
+ rhashtable_walk_stop(iter);
+ return err;
+}
+
+static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ioam6_namespace *ns, *ns_ref;
+ struct ioam6_schema *sc, *sc_ref;
+ struct ioam6_pernet_data *nsdata;
+ __be16 ns_id;
+ u32 sc_id;
+ int err;
+
+ if (!info->attrs[IOAM6_ATTR_NS_ID] ||
+ (!info->attrs[IOAM6_ATTR_SC_ID] &&
+ !info->attrs[IOAM6_ATTR_SC_NONE]))
+ return -EINVAL;
+
+ ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+ nsdata = ioam6_pernet(genl_info_net(info));
+
+ mutex_lock(&nsdata->lock);
+
+ ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params);
+ if (!ns) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (info->attrs[IOAM6_ATTR_SC_NONE]) {
+ sc = NULL;
+ } else {
+ sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+ sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id,
+ rht_sc_params);
+ if (!sc) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+ }
+
+ sc_ref = rcu_dereference_protected(ns->schema,
+ lockdep_is_held(&nsdata->lock));
+ if (sc_ref)
+ rcu_assign_pointer(sc_ref->ns, NULL);
+ rcu_assign_pointer(ns->schema, sc);
+
+ if (sc) {
+ ns_ref = rcu_dereference_protected(sc->ns,
+ lockdep_is_held(&nsdata->lock));
+ if (ns_ref)
+ rcu_assign_pointer(ns_ref->schema, NULL);
+ rcu_assign_pointer(sc->ns, ns);
+ }
+
+ err = 0;
+
+out_unlock:
+ mutex_unlock(&nsdata->lock);
+ return err;
+}
+
+static const struct genl_ops ioam6_genl_ops[] = {
+ {
+ .cmd = IOAM6_CMD_ADD_NAMESPACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_addns,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_addns,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DEL_NAMESPACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_delns,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_delns,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DUMP_NAMESPACES,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = ioam6_genl_dumpns_start,
+ .dumpit = ioam6_genl_dumpns,
+ .done = ioam6_genl_dumpns_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IOAM6_CMD_ADD_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_addsc,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_addsc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DEL_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_delsc,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_delsc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DUMP_SCHEMAS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = ioam6_genl_dumpsc_start,
+ .dumpit = ioam6_genl_dumpsc,
+ .done = ioam6_genl_dumpsc_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IOAM6_CMD_NS_SET_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_ns_set_schema,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_ns_sc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1,
+ },
+};
+
+static struct genl_family ioam6_genl_family __ro_after_init = {
+ .name = IOAM6_GENL_NAME,
+ .version = IOAM6_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ioam6_genl_ops,
+ .n_ops = ARRAY_SIZE(ioam6_genl_ops),
+ .resv_start_op = IOAM6_CMD_NS_SET_SCHEMA + 1,
+ .module = THIS_MODULE,
+};
+
+struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+ return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+}
+
+static void __ioam6_fill_trace_data(struct sk_buff *skb,
+ struct ioam6_namespace *ns,
+ struct ioam6_trace_hdr *trace,
+ struct ioam6_schema *sc,
+ u8 sclen, bool is_input)
+{
+ struct timespec64 ts;
+ ktime_t tstamp;
+ u64 raw64;
+ u32 raw32;
+ u16 raw16;
+ u8 *data;
+ u8 byte;
+
+ data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4;
+
+ /* hop_lim and node_id */
+ if (trace->type.bit0) {
+ byte = ipv6_hdr(skb)->hop_limit;
+ if (is_input)
+ byte--;
+
+ raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id;
+
+ *(__be32 *)data = cpu_to_be32((byte << 24) | raw32);
+ data += sizeof(__be32);
+ }
+
+ /* ingress_if_id and egress_if_id */
+ if (trace->type.bit1) {
+ if (!skb->dev)
+ raw16 = IOAM6_U16_UNAVAILABLE;
+ else
+ raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id;
+
+ *(__be16 *)data = cpu_to_be16(raw16);
+ data += sizeof(__be16);
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK)
+ raw16 = IOAM6_U16_UNAVAILABLE;
+ else
+ raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id;
+
+ *(__be16 *)data = cpu_to_be16(raw16);
+ data += sizeof(__be16);
+ }
+
+ /* timestamp seconds */
+ if (trace->type.bit2) {
+ if (!skb->dev) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ tstamp = skb_tstamp_cond(skb, true);
+ ts = ktime_to_timespec64(tstamp);
+
+ *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec);
+ }
+ data += sizeof(__be32);
+ }
+
+ /* timestamp subseconds */
+ if (trace->type.bit3) {
+ if (!skb->dev) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ if (!trace->type.bit2) {
+ tstamp = skb_tstamp_cond(skb, true);
+ ts = ktime_to_timespec64(tstamp);
+ }
+
+ *(__be32 *)data = cpu_to_be32((u32)(ts.tv_nsec / NSEC_PER_USEC));
+ }
+ data += sizeof(__be32);
+ }
+
+ /* transit delay */
+ if (trace->type.bit4) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* namespace data */
+ if (trace->type.bit5) {
+ *(__be32 *)data = ns->data;
+ data += sizeof(__be32);
+ }
+
+ /* queue depth */
+ if (trace->type.bit6) {
+ struct netdev_queue *queue;
+ struct Qdisc *qdisc;
+ __u32 qlen, backlog;
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ queue = skb_get_tx_queue(skb_dst(skb)->dev, skb);
+ qdisc = rcu_dereference(queue->qdisc);
+ qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog);
+
+ *(__be32 *)data = cpu_to_be32(backlog);
+ }
+ data += sizeof(__be32);
+ }
+
+ /* checksum complement */
+ if (trace->type.bit7) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* hop_lim and node_id (wide) */
+ if (trace->type.bit8) {
+ byte = ipv6_hdr(skb)->hop_limit;
+ if (is_input)
+ byte--;
+
+ raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide;
+
+ *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64);
+ data += sizeof(__be64);
+ }
+
+ /* ingress_if_id and egress_if_id (wide) */
+ if (trace->type.bit9) {
+ if (!skb->dev)
+ raw32 = IOAM6_U32_UNAVAILABLE;
+ else
+ raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide;
+
+ *(__be32 *)data = cpu_to_be32(raw32);
+ data += sizeof(__be32);
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK)
+ raw32 = IOAM6_U32_UNAVAILABLE;
+ else
+ raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide;
+
+ *(__be32 *)data = cpu_to_be32(raw32);
+ data += sizeof(__be32);
+ }
+
+ /* namespace data (wide) */
+ if (trace->type.bit10) {
+ *(__be64 *)data = ns->data_wide;
+ data += sizeof(__be64);
+ }
+
+ /* buffer occupancy */
+ if (trace->type.bit11) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit12 undefined: filled with empty value */
+ if (trace->type.bit12) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit13 undefined: filled with empty value */
+ if (trace->type.bit13) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit14 undefined: filled with empty value */
+ if (trace->type.bit14) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit15 undefined: filled with empty value */
+ if (trace->type.bit15) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit16 undefined: filled with empty value */
+ if (trace->type.bit16) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit17 undefined: filled with empty value */
+ if (trace->type.bit17) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit18 undefined: filled with empty value */
+ if (trace->type.bit18) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit19 undefined: filled with empty value */
+ if (trace->type.bit19) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit20 undefined: filled with empty value */
+ if (trace->type.bit20) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit21 undefined: filled with empty value */
+ if (trace->type.bit21) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* opaque state snapshot */
+ if (trace->type.bit22) {
+ if (!sc) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8);
+ } else {
+ *(__be32 *)data = sc->hdr;
+ data += sizeof(__be32);
+
+ memcpy(data, sc->data, sc->len);
+ }
+ }
+}
+
+/* called with rcu_read_lock() */
+void ioam6_fill_trace_data(struct sk_buff *skb,
+ struct ioam6_namespace *ns,
+ struct ioam6_trace_hdr *trace,
+ bool is_input)
+{
+ struct ioam6_schema *sc;
+ u8 sclen = 0;
+
+ /* Skip if Overflow flag is set
+ */
+ if (trace->overflow)
+ return;
+
+ /* NodeLen does not include Opaque State Snapshot length. We need to
+ * take it into account if the corresponding bit is set (bit 22) and
+ * if the current IOAM namespace has an active schema attached to it
+ */
+ sc = rcu_dereference(ns->schema);
+ if (trace->type.bit22) {
+ sclen = sizeof_field(struct ioam6_schema, hdr) / 4;
+
+ if (sc)
+ sclen += sc->len / 4;
+ }
+
+ /* If there is no space remaining, we set the Overflow flag and we
+ * skip without filling the trace
+ */
+ if (!trace->remlen || trace->remlen < trace->nodelen + sclen) {
+ trace->overflow = 1;
+ return;
+ }
+
+ __ioam6_fill_trace_data(skb, ns, trace, sc, sclen, is_input);
+ trace->remlen -= trace->nodelen + sclen;
+}
+
+static int __net_init ioam6_net_init(struct net *net)
+{
+ struct ioam6_pernet_data *nsdata;
+ int err = -ENOMEM;
+
+ nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL);
+ if (!nsdata)
+ goto out;
+
+ mutex_init(&nsdata->lock);
+ net->ipv6.ioam6_data = nsdata;
+
+ err = rhashtable_init(&nsdata->namespaces, &rht_ns_params);
+ if (err)
+ goto free_nsdata;
+
+ err = rhashtable_init(&nsdata->schemas, &rht_sc_params);
+ if (err)
+ goto free_rht_ns;
+
+out:
+ return err;
+free_rht_ns:
+ rhashtable_destroy(&nsdata->namespaces);
+free_nsdata:
+ kfree(nsdata);
+ net->ipv6.ioam6_data = NULL;
+ goto out;
+}
+
+static void __net_exit ioam6_net_exit(struct net *net)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+ rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL);
+ rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL);
+
+ kfree(nsdata);
+}
+
+static struct pernet_operations ioam6_net_ops = {
+ .init = ioam6_net_init,
+ .exit = ioam6_net_exit,
+};
+
+int __init ioam6_init(void)
+{
+ int err = register_pernet_subsys(&ioam6_net_ops);
+ if (err)
+ goto out;
+
+ err = genl_register_family(&ioam6_genl_family);
+ if (err)
+ goto out_unregister_pernet_subsys;
+
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+ err = ioam6_iptunnel_init();
+ if (err)
+ goto out_unregister_genl;
+#endif
+
+ pr_info("In-situ OAM (IOAM) with IPv6\n");
+
+out:
+ return err;
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+out_unregister_genl:
+ genl_unregister_family(&ioam6_genl_family);
+#endif
+out_unregister_pernet_subsys:
+ unregister_pernet_subsys(&ioam6_net_ops);
+ goto out;
+}
+
+void ioam6_exit(void)
+{
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+ ioam6_iptunnel_exit();
+#endif
+ genl_unregister_family(&ioam6_genl_family);
+ unregister_pernet_subsys(&ioam6_net_ops);
+}
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
new file mode 100644
index 000000000000..f6f5b83dd954
--- /dev/null
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -0,0 +1,477 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IPv6 IOAM Lightweight Tunnel implementation
+ *
+ * Author:
+ * Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_iptunnel.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/lwtunnel.h>
+#include <net/ioam6.h>
+#include <net/netlink.h>
+#include <net/ipv6.h>
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#define IOAM6_MASK_SHORT_FIELDS 0xff100000
+#define IOAM6_MASK_WIDE_FIELDS 0xe00000
+
+struct ioam6_lwt_encap {
+ struct ipv6_hopopt_hdr eh;
+ u8 pad[2]; /* 2-octet padding for 4n-alignment */
+ struct ioam6_hdr ioamh;
+ struct ioam6_trace_hdr traceh;
+} __packed;
+
+struct ioam6_lwt_freq {
+ u32 k;
+ u32 n;
+};
+
+struct ioam6_lwt {
+ struct dst_cache cache;
+ struct ioam6_lwt_freq freq;
+ atomic_t pkt_cnt;
+ u8 mode;
+ struct in6_addr tundst;
+ struct ioam6_lwt_encap tuninfo;
+};
+
+static struct netlink_range_validation freq_range = {
+ .min = IOAM6_IPTUNNEL_FREQ_MIN,
+ .max = IOAM6_IPTUNNEL_FREQ_MAX,
+};
+
+static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
+{
+ return (struct ioam6_lwt *)lwt->data;
+}
+
+static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
+{
+ return &ioam6_lwt_state(lwt)->tuninfo;
+}
+
+static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
+{
+ return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
+}
+
+static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
+ [IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
+ [IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
+ [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
+ IOAM6_IPTUNNEL_MODE_MIN,
+ IOAM6_IPTUNNEL_MODE_MAX),
+ [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)),
+};
+
+static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
+{
+ u32 fields;
+
+ if (!trace->type_be32 || !trace->remlen ||
+ trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
+ trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
+ trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
+ trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
+ trace->type.bit21)
+ return false;
+
+ trace->nodelen = 0;
+ fields = be32_to_cpu(trace->type_be32);
+
+ trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
+ * (sizeof(__be32) / 4);
+ trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
+ * (sizeof(__be64) / 4);
+
+ return true;
+}
+
+static int ioam6_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
+ struct ioam6_lwt_encap *tuninfo;
+ struct ioam6_trace_hdr *trace;
+ struct lwtunnel_state *lwt;
+ struct ioam6_lwt *ilwt;
+ int len_aligned, err;
+ u32 freq_k, freq_n;
+ u8 mode;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
+ ioam6_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
+ (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
+ NL_SET_ERR_MSG(extack, "freq: missing parameter");
+ return -EINVAL;
+ } else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
+ freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
+ freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
+ } else {
+ freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
+ freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);
+
+ if (freq_k > freq_n) {
+ NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
+ return -EINVAL;
+ }
+ }
+
+ if (!tb[IOAM6_IPTUNNEL_MODE])
+ mode = IOAM6_IPTUNNEL_MODE_INLINE;
+ else
+ mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]);
+
+ if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+ NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
+ return -EINVAL;
+ }
+
+ if (!tb[IOAM6_IPTUNNEL_TRACE]) {
+ NL_SET_ERR_MSG(extack, "missing trace");
+ return -EINVAL;
+ }
+
+ trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
+ if (!ioam6_validate_trace_hdr(trace)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
+ "invalid trace validation");
+ return -EINVAL;
+ }
+
+ len_aligned = ALIGN(trace->remlen * 4, 8);
+ lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
+ if (!lwt)
+ return -ENOMEM;
+
+ ilwt = ioam6_lwt_state(lwt);
+ err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
+ if (err) {
+ kfree(lwt);
+ return err;
+ }
+
+ atomic_set(&ilwt->pkt_cnt, 0);
+ ilwt->freq.k = freq_k;
+ ilwt->freq.n = freq_n;
+
+ ilwt->mode = mode;
+ if (tb[IOAM6_IPTUNNEL_DST])
+ ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);
+
+ tuninfo = ioam6_lwt_info(lwt);
+ tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
+ tuninfo->pad[0] = IPV6_TLV_PADN;
+ tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
+ tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
+ tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
+ + trace->remlen * 4;
+
+ memcpy(&tuninfo->traceh, trace, sizeof(*trace));
+
+ if (len_aligned - trace->remlen * 4) {
+ tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
+ tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
+ }
+
+ lwt->type = LWTUNNEL_ENCAP_IOAM6;
+ lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = lwt;
+
+ return 0;
+}
+
+static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
+{
+ struct ioam6_trace_hdr *trace;
+ struct ioam6_namespace *ns;
+
+ trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
+ + sizeof(struct ipv6_hopopt_hdr) + 2
+ + sizeof(struct ioam6_hdr));
+
+ ns = ioam6_namespace(net, trace->namespace_id);
+ if (ns)
+ ioam6_fill_trace_data(skb, ns, trace, false);
+
+ return 0;
+}
+
+static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
+ struct ioam6_lwt_encap *tuninfo)
+{
+ struct ipv6hdr *oldhdr, *hdr;
+ int hdrlen, err;
+
+ hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+
+ err = skb_cow_head(skb, hdrlen + skb->mac_len);
+ if (unlikely(err))
+ return err;
+
+ oldhdr = ipv6_hdr(skb);
+ skb_pull(skb, sizeof(*oldhdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));
+
+ skb_push(skb, sizeof(*oldhdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ memmove(hdr, oldhdr, sizeof(*oldhdr));
+ tuninfo->eh.nexthdr = hdr->nexthdr;
+
+ skb_set_transport_header(skb, sizeof(*hdr));
+ skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);
+
+ memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
+
+ hdr->nexthdr = NEXTHDR_HOP;
+ hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+
+ return ioam6_do_fill(net, skb);
+}
+
+static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
+ struct ioam6_lwt_encap *tuninfo,
+ struct in6_addr *tundst)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *hdr, *inner_hdr;
+ int hdrlen, len, err;
+
+ hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+ len = sizeof(*hdr) + hdrlen;
+
+ err = skb_cow_head(skb, len + skb->mac_len);
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+
+ skb_push(skb, len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(*hdr));
+
+ tuninfo->eh.nexthdr = NEXTHDR_IPV6;
+ memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
+
+ hdr = ipv6_hdr(skb);
+ memcpy(hdr, inner_hdr, sizeof(*hdr));
+
+ hdr->nexthdr = NEXTHDR_HOP;
+ hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+ hdr->daddr = *tundst;
+ ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr,
+ IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);
+
+ skb_postpush_rcsum(skb, hdr, len);
+
+ return ioam6_do_fill(net, skb);
+}
+
+static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct in6_addr orig_daddr;
+ struct ioam6_lwt *ilwt;
+ int err = -EINVAL;
+ u32 pkt_cnt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ ilwt = ioam6_lwt_state(dst->lwtstate);
+
+ /* Check for insertion frequency (i.e., "k over n" insertions) */
+ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
+ if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
+ goto out;
+
+ orig_daddr = ipv6_hdr(skb)->daddr;
+
+ switch (ilwt->mode) {
+ case IOAM6_IPTUNNEL_MODE_INLINE:
+do_inline:
+ /* Direct insertion - if there is no Hop-by-Hop yet */
+ if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
+ goto out;
+
+ err = ioam6_do_inline(net, skb, &ilwt->tuninfo);
+ if (unlikely(err))
+ goto drop;
+
+ break;
+ case IOAM6_IPTUNNEL_MODE_ENCAP:
+do_encap:
+ /* Encapsulation (ip6ip6) */
+ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst);
+ if (unlikely(err))
+ goto drop;
+
+ break;
+ case IOAM6_IPTUNNEL_MODE_AUTO:
+ /* Automatic (RFC8200 compliant):
+ * - local packets -> INLINE mode
+ * - in-transit packets -> ENCAP mode
+ */
+ if (!skb->dev)
+ goto do_inline;
+
+ goto do_encap;
+ default:
+ goto drop;
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ goto drop;
+
+ if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
+ preempt_disable();
+ dst = dst_cache_get(&ilwt->cache);
+ preempt_enable();
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ dst_release(dst);
+ goto drop;
+ }
+
+ preempt_disable();
+ dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
+ preempt_enable();
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ return dst_output(net, sk, skb);
+ }
+out:
+ return dst->lwtstate->orig_output(net, sk, skb);
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static void ioam6_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
+}
+
+static int ioam6_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
+ int err;
+
+ err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
+ if (err)
+ goto ret;
+
+ err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
+ if (err)
+ goto ret;
+
+ err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
+ if (err)
+ goto ret;
+
+ if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+ err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
+ if (err)
+ goto ret;
+ }
+
+ err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
+ &ilwt->tuninfo.traceh);
+ret:
+ return err;
+}
+
+static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
+ int nlsize;
+
+ nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
+ nla_total_size(sizeof(ilwt->freq.n)) +
+ nla_total_size(sizeof(ilwt->mode)) +
+ nla_total_size(sizeof(ilwt->tuninfo.traceh));
+
+ if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE)
+ nlsize += nla_total_size(sizeof(ilwt->tundst));
+
+ return nlsize;
+}
+
+static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
+ struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
+ struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
+ struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);
+
+ return (ilwt_a->freq.k != ilwt_b->freq.k ||
+ ilwt_a->freq.n != ilwt_b->freq.n ||
+ ilwt_a->mode != ilwt_b->mode ||
+ (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
+ !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
+ trace_a->namespace_id != trace_b->namespace_id);
+}
+
+static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
+ .build_state = ioam6_build_state,
+ .destroy_state = ioam6_destroy_state,
+ .output = ioam6_output,
+ .fill_encap = ioam6_fill_encap_info,
+ .get_encap_size = ioam6_encap_nlsize,
+ .cmp_encap = ioam6_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init ioam6_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+}
+
+void ioam6_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 72abf892302f..413f66781e50 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -15,6 +15,7 @@
#define pr_fmt(fmt) "IPv6: " fmt
+#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
@@ -32,6 +33,7 @@
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>
+#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -110,7 +112,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
fn = rcu_dereference_protected(f6i->fib6_node,
lockdep_is_held(&f6i->fib6_table->tb6_lock));
if (fn)
- fn->fn_sernum = fib6_new_sernum(net);
+ WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}
/*
@@ -314,7 +316,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
{
struct rt6_info *rt;
- rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error == -EAGAIN) {
ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
@@ -498,7 +501,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb,
hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
err = fib6_table_dump(net, tb, w);
- if (err < 0)
+ if (err)
goto out;
}
}
@@ -506,7 +509,8 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb,
out:
kfree(w);
- return err;
+ /* The tree traversal function should never return a positive value. */
+ return err > 0 ? -EINVAL : err;
}
static int fib6_dump_node(struct fib6_walker *w)
@@ -586,12 +590,13 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
spin_unlock_bh(&table->tb6_lock);
if (res > 0) {
cb->args[4] = 1;
- cb->args[5] = w->root->fn_sernum;
+ cb->args[5] = READ_ONCE(w->root->fn_sernum);
}
} else {
- if (cb->args[5] != w->root->fn_sernum) {
+ int sernum = READ_ONCE(w->root->fn_sernum);
+ if (cb->args[5] != sernum) {
/* Begin at the root if the tree changed */
- cb->args[5] = w->root->fn_sernum;
+ cb->args[5] = sernum;
w->state = FWS_INIT;
w->node = w->root;
w->skip = w->count;
@@ -664,7 +669,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (arg.filter.table_id) {
tb = fib6_get_table(net, arg.filter.table_id);
if (!tb) {
- if (arg.filter.dump_all_families)
+ if (rtnl_msg_family(cb->nlh) != PF_INET6)
goto out;
NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
@@ -1024,6 +1029,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
{
struct fib6_table *table = rt->fib6_table;
+ /* Flush all cached dst in exception table */
+ rt6_flush_exceptions(rt);
fib6_drop_pcpu_from(rt, table);
if (rt->nh && !list_empty(&rt->nh_list))
@@ -1336,10 +1343,10 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
lockdep_is_held(&rt->fib6_table->tb6_lock));
- /* paired with smp_rmb() in rt6_get_cookie_safe() */
+ /* paired with smp_rmb() in fib6_get_cookie_safe() */
smp_wmb();
while (fn) {
- fn->fn_sernum = sernum;
+ WRITE_ONCE(fn->fn_sernum, sernum);
fn = rcu_dereference_protected(fn->parent,
lockdep_is_held(&rt->fib6_table->tb6_lock));
}
@@ -1373,7 +1380,6 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
int err = -ENOMEM;
int allow_create = 1;
int replace_required = 0;
- int sernum = fib6_new_sernum(info->nl_net);
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1473,7 +1479,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
if (!err) {
if (rt->nh)
list_add(&rt->nh_list, &rt->nh->f6i_list);
- __fib6_update_sernum_upto_root(rt, sernum);
+ __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
fib6_start_gc(info->nl_net, rt);
}
@@ -1811,10 +1817,14 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
children = 0;
child = NULL;
- if (fn_r)
- child = fn_r, children |= 1;
- if (fn_l)
- child = fn_l, children |= 2;
+ if (fn_r) {
+ child = fn_r;
+ children |= 1;
+ }
+ if (fn_l) {
+ child = fn_l;
+ children |= 2;
+ }
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
@@ -1922,9 +1932,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
- /* Flush all cached dst in exception table */
- rt6_flush_exceptions(rt);
-
/* Reset round-robin state, if necessary */
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
@@ -1992,14 +1999,19 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
- lockdep_is_held(&rt->fib6_table->tb6_lock));
- struct fib6_table *table = rt->fib6_table;
struct net *net = info->nl_net;
struct fib6_info __rcu **rtp;
struct fib6_info __rcu **rtp_next;
+ struct fib6_table *table;
+ struct fib6_node *fn;
+
+ if (rt == net->ipv6.fib6_null_entry)
+ return -ENOENT;
- if (!fn || rt == net->ipv6.fib6_null_entry)
+ table = rt->fib6_table;
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (!fn)
return -ENOENT;
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
@@ -2068,8 +2080,8 @@ static int fib6_walk_continue(struct fib6_walker *w)
continue;
}
w->state = FWS_L;
+ fallthrough;
#endif
- /* fall through */
case FWS_L:
left = rcu_dereference_protected(fn->left, 1);
if (left) {
@@ -2078,7 +2090,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
continue;
}
w->state = FWS_R;
- /* fall through */
+ fallthrough;
case FWS_R:
right = rcu_dereference_protected(fn->right, 1);
if (right) {
@@ -2088,7 +2100,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
}
w->state = FWS_C;
w->leaf = rcu_dereference_protected(fn->leaf, 1);
- /* fall through */
+ fallthrough;
case FWS_C:
if (w->leaf && fn->fn_flags & RTN_RTINFO) {
int err;
@@ -2107,7 +2119,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
}
skip:
w->state = FWS_U;
- /* fall through */
+ fallthrough;
case FWS_U:
if (fn == w->root)
return 0;
@@ -2163,8 +2175,8 @@ static int fib6_clean_node(struct fib6_walker *w)
};
if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
- w->node->fn_sernum != c->sernum)
- w->node->fn_sernum = c->sernum;
+ READ_ONCE(w->node->fn_sernum) != c->sernum)
+ WRITE_ONCE(w->node->fn_sernum, c->sernum);
if (!c->func) {
WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
@@ -2345,6 +2357,10 @@ static int __net_init fib6_net_init(struct net *net)
if (err)
return err;
+ /* Default to 3-tuple */
+ net->ipv6.sysctl.multipath_hash_fields =
+ FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+
spin_lock_init(&net->ipv6.fib6_gc_lock);
rwlock_init(&net->ipv6.fib6_walker_lock);
INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
@@ -2352,7 +2368,7 @@ static int __net_init fib6_net_init(struct net *net)
net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
if (!net->ipv6.rt6_stats)
- goto out_timer;
+ goto out_notifier;
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
@@ -2397,7 +2413,7 @@ out_fib_table_hash:
kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
kfree(net->ipv6.rt6_stats);
-out_timer:
+out_notifier:
fib6_notifier_exit(net);
return -ENOMEM;
}
@@ -2434,8 +2450,8 @@ int __init fib6_init(void)
int ret = -ENOMEM;
fib6_node_kmem = kmem_cache_create("fib6_nodes",
- sizeof(struct fib6_node),
- 0, SLAB_HWCACHE_ALIGN,
+ sizeof(struct fib6_node), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
NULL);
if (!fib6_node_kmem)
goto out;
@@ -2467,7 +2483,7 @@ void fib6_gc_cleanup(void)
}
#ifdef CONFIG_PROC_FS
-static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
@@ -2476,7 +2492,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
const struct net_device *dev;
if (rt->nh)
- fib6_nh = nexthop_fib6_nh(rt->nh);
+ fib6_nh = nexthop_fib6_nh_bh(rt->nh);
seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
@@ -2528,7 +2544,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
iter->w.state = FWS_INIT;
iter->w.node = iter->w.root;
iter->w.args = iter;
- iter->sernum = iter->w.root->fn_sernum;
+ iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
INIT_LIST_HEAD(&iter->w.lh);
fib6_walker_link(net, &iter->w);
}
@@ -2556,8 +2572,10 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
{
- if (iter->sernum != iter->w.root->fn_sernum) {
- iter->sernum = iter->w.root->fn_sernum;
+ int sernum = READ_ONCE(iter->w.root->fn_sernum);
+
+ if (iter->sernum != sernum) {
+ iter->sernum = sernum;
iter->w.state = FWS_INIT;
iter->w.node = iter->w.root;
WARN_ON(iter->w.skip);
@@ -2612,8 +2630,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
iter->skip = *pos;
if (iter->tbl) {
+ loff_t p = 0;
+
ipv6_route_seq_setup_walk(iter, net);
- return ipv6_route_seq_next(seq, NULL, pos);
+ return ipv6_route_seq_next(seq, NULL, &p);
} else {
return NULL;
}
@@ -2625,7 +2645,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
return w->node && !(w->state == FWS_U && w->node == w->root);
}
-static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
__releases(RCU_BH)
{
struct net *net = seq_file_net(seq);
@@ -2637,6 +2657,62 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock_bh();
}
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
+ struct bpf_iter_meta *meta,
+ void *v)
+{
+ struct bpf_iter__ipv6_route ctx;
+
+ ctx.meta = meta;
+ ctx.rt = v;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct ipv6_route_iter *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (!prog)
+ return ipv6_route_native_seq_show(seq, v);
+
+ ret = ipv6_route_prog_seq_show(prog, &meta, v);
+ iter->w.leaf = NULL;
+
+ return ret;
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)ipv6_route_prog_seq_show(prog, &meta, v);
+ }
+
+ ipv6_route_native_seq_stop(seq, v);
+}
+#else
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ return ipv6_route_native_seq_show(seq, v);
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+ ipv6_route_native_seq_stop(seq, v);
+}
+#endif
+
const struct seq_operations ipv6_route_seq_ops = {
.start = ipv6_route_seq_start,
.next = ipv6_route_seq_next,
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index d64b83e85642..18481eb76a0a 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -220,7 +220,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
spin_lock_bh(&ip6_fl_lock);
if (label == 0) {
for (;;) {
- fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK;
+ fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
if (fl->label) {
lfl = __fl_lookup(net, fl->label);
if (!lfl)
@@ -371,7 +371,7 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo
static struct ip6_flowlabel *
fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
- char __user *optval, int optlen, int *err_p)
+ sockptr_t optval, int optlen, int *err_p)
{
struct ip6_flowlabel *fl = NULL;
int olen;
@@ -401,7 +401,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
memset(fl->opt, 0, sizeof(*fl->opt));
fl->opt->tot_len = sizeof(*fl->opt) + olen;
err = -EFAULT;
- if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen))
+ if (copy_from_sockptr_offset(fl->opt + 1, optval,
+ CMSG_ALIGN(sizeof(*freq)), olen))
goto done;
msg.msg_controllen = olen;
@@ -449,8 +450,10 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
err = -EINVAL;
goto done;
}
- if (fl_shared_exclusive(fl) || fl->opt)
+ if (fl_shared_exclusive(fl) || fl->opt) {
+ WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1);
static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
+ }
return fl;
done:
@@ -533,187 +536,212 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
return -ENOENT;
}
-int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
+#define socklist_dereference(__sflp) \
+ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock))
+
+static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
{
- int uninitialized_var(err);
- struct net *net = sock_net(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_flowlabel_req freq;
- struct ipv6_fl_socklist *sfl1 = NULL;
- struct ipv6_fl_socklist *sfl;
struct ipv6_fl_socklist __rcu **sflp;
- struct ip6_flowlabel *fl, *fl1 = NULL;
+ struct ipv6_fl_socklist *sfl;
+ if (freq->flr_flags & IPV6_FL_F_REFLECT) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ if (!np->repflow)
+ return -ESRCH;
+ np->flow_label = 0;
+ np->repflow = 0;
+ return 0;
+ }
- if (optlen < sizeof(freq))
- return -EINVAL;
+ spin_lock_bh(&ip6_sk_fl_lock);
+ for (sflp = &np->ipv6_fl_list;
+ (sfl = socklist_dereference(*sflp)) != NULL;
+ sflp = &sfl->next) {
+ if (sfl->fl->label == freq->flr_label)
+ goto found;
+ }
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ return -ESRCH;
+found:
+ if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK))
+ np->flow_label &= ~IPV6_FLOWLABEL_MASK;
+ *sflp = sfl->next;
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ fl_release(sfl->fl);
+ kfree_rcu(sfl, rcu);
+ return 0;
+}
- if (copy_from_user(&freq, optval, sizeof(freq)))
- return -EFAULT;
+static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
+ struct ipv6_fl_socklist *sfl;
+ int err;
- switch (freq.flr_action) {
- case IPV6_FL_A_PUT:
- if (freq.flr_flags & IPV6_FL_F_REFLECT) {
- if (sk->sk_protocol != IPPROTO_TCP)
- return -ENOPROTOOPT;
- if (!np->repflow)
- return -ESRCH;
- np->flow_label = 0;
- np->repflow = 0;
- return 0;
- }
- spin_lock_bh(&ip6_sk_fl_lock);
- for (sflp = &np->ipv6_fl_list;
- (sfl = rcu_dereference_protected(*sflp,
- lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
- sflp = &sfl->next) {
- if (sfl->fl->label == freq.flr_label) {
- if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
- np->flow_label &= ~IPV6_FLOWLABEL_MASK;
- *sflp = sfl->next;
- spin_unlock_bh(&ip6_sk_fl_lock);
- fl_release(sfl->fl);
- kfree_rcu(sfl, rcu);
- return 0;
- }
+ rcu_read_lock_bh();
+ for_each_sk_fl_rcu(np, sfl) {
+ if (sfl->fl->label == freq->flr_label) {
+ err = fl6_renew(sfl->fl, freq->flr_linger,
+ freq->flr_expires);
+ rcu_read_unlock_bh();
+ return err;
}
- spin_unlock_bh(&ip6_sk_fl_lock);
- return -ESRCH;
+ }
+ rcu_read_unlock_bh();
- case IPV6_FL_A_RENEW:
- rcu_read_lock_bh();
- for_each_sk_fl_rcu(np, sfl) {
- if (sfl->fl->label == freq.flr_label) {
- err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires);
- rcu_read_unlock_bh();
- return err;
- }
- }
- rcu_read_unlock_bh();
+ if (freq->flr_share == IPV6_FL_S_NONE &&
+ ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label);
- if (freq.flr_share == IPV6_FL_S_NONE &&
- ns_capable(net->user_ns, CAP_NET_ADMIN)) {
- fl = fl_lookup(net, freq.flr_label);
- if (fl) {
- err = fl6_renew(fl, freq.flr_linger, freq.flr_expires);
- fl_release(fl);
- return err;
- }
+ if (fl) {
+ err = fl6_renew(fl, freq->flr_linger,
+ freq->flr_expires);
+ fl_release(fl);
+ return err;
}
- return -ESRCH;
-
- case IPV6_FL_A_GET:
- if (freq.flr_flags & IPV6_FL_F_REFLECT) {
- struct net *net = sock_net(sk);
- if (net->ipv6.sysctl.flowlabel_consistency) {
- net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
- return -EPERM;
- }
+ }
+ return -ESRCH;
+}
- if (sk->sk_protocol != IPPROTO_TCP)
- return -ENOPROTOOPT;
+static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
+ sockptr_t optval, int optlen)
+{
+ struct ipv6_fl_socklist *sfl, *sfl1 = NULL;
+ struct ip6_flowlabel *fl, *fl1 = NULL;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
+ int err;
- np->repflow = 1;
- return 0;
+ if (freq->flr_flags & IPV6_FL_F_REFLECT) {
+ if (net->ipv6.sysctl.flowlabel_consistency) {
+ net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
+ return -EPERM;
}
- if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
- return -EINVAL;
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ np->repflow = 1;
+ return 0;
+ }
- if (net->ipv6.sysctl.flowlabel_state_ranges &&
- (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
- return -ERANGE;
+ if (freq->flr_label & ~IPV6_FLOWLABEL_MASK)
+ return -EINVAL;
+ if (net->ipv6.sysctl.flowlabel_state_ranges &&
+ (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
+ return -ERANGE;
- fl = fl_create(net, sk, &freq, optval, optlen, &err);
- if (!fl)
- return err;
- sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+ fl = fl_create(net, sk, freq, optval, optlen, &err);
+ if (!fl)
+ return err;
- if (freq.flr_label) {
- err = -EEXIST;
- rcu_read_lock_bh();
- for_each_sk_fl_rcu(np, sfl) {
- if (sfl->fl->label == freq.flr_label) {
- if (freq.flr_flags&IPV6_FL_F_EXCL) {
- rcu_read_unlock_bh();
- goto done;
- }
- fl1 = sfl->fl;
- if (!atomic_inc_not_zero(&fl1->users))
- fl1 = NULL;
- break;
+ sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+
+ if (freq->flr_label) {
+ err = -EEXIST;
+ rcu_read_lock_bh();
+ for_each_sk_fl_rcu(np, sfl) {
+ if (sfl->fl->label == freq->flr_label) {
+ if (freq->flr_flags & IPV6_FL_F_EXCL) {
+ rcu_read_unlock_bh();
+ goto done;
}
+ fl1 = sfl->fl;
+ if (!atomic_inc_not_zero(&fl1->users))
+ fl1 = NULL;
+ break;
}
- rcu_read_unlock_bh();
+ }
+ rcu_read_unlock_bh();
- if (!fl1)
- fl1 = fl_lookup(net, freq.flr_label);
- if (fl1) {
+ if (!fl1)
+ fl1 = fl_lookup(net, freq->flr_label);
+ if (fl1) {
recheck:
- err = -EEXIST;
- if (freq.flr_flags&IPV6_FL_F_EXCL)
- goto release;
- err = -EPERM;
- if (fl1->share == IPV6_FL_S_EXCL ||
- fl1->share != fl->share ||
- ((fl1->share == IPV6_FL_S_PROCESS) &&
- (fl1->owner.pid != fl->owner.pid)) ||
- ((fl1->share == IPV6_FL_S_USER) &&
- !uid_eq(fl1->owner.uid, fl->owner.uid)))
- goto release;
-
- err = -ENOMEM;
- if (!sfl1)
- goto release;
- if (fl->linger > fl1->linger)
- fl1->linger = fl->linger;
- if ((long)(fl->expires - fl1->expires) > 0)
- fl1->expires = fl->expires;
- fl_link(np, sfl1, fl1);
- fl_free(fl);
- return 0;
+ err = -EEXIST;
+ if (freq->flr_flags&IPV6_FL_F_EXCL)
+ goto release;
+ err = -EPERM;
+ if (fl1->share == IPV6_FL_S_EXCL ||
+ fl1->share != fl->share ||
+ ((fl1->share == IPV6_FL_S_PROCESS) &&
+ (fl1->owner.pid != fl->owner.pid)) ||
+ ((fl1->share == IPV6_FL_S_USER) &&
+ !uid_eq(fl1->owner.uid, fl->owner.uid)))
+ goto release;
+
+ err = -ENOMEM;
+ if (!sfl1)
+ goto release;
+ if (fl->linger > fl1->linger)
+ fl1->linger = fl->linger;
+ if ((long)(fl->expires - fl1->expires) > 0)
+ fl1->expires = fl->expires;
+ fl_link(np, sfl1, fl1);
+ fl_free(fl);
+ return 0;
release:
- fl_release(fl1);
- goto done;
- }
- }
- err = -ENOENT;
- if (!(freq.flr_flags&IPV6_FL_F_CREATE))
- goto done;
-
- err = -ENOMEM;
- if (!sfl1)
+ fl_release(fl1);
goto done;
+ }
+ }
+ err = -ENOENT;
+ if (!(freq->flr_flags & IPV6_FL_F_CREATE))
+ goto done;
- err = mem_check(sk);
- if (err != 0)
- goto done;
+ err = -ENOMEM;
+ if (!sfl1)
+ goto done;
- fl1 = fl_intern(net, fl, freq.flr_label);
- if (fl1)
- goto recheck;
+ err = mem_check(sk);
+ if (err != 0)
+ goto done;
- if (!freq.flr_label) {
- if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
- &fl->label, sizeof(fl->label))) {
- /* Intentionally ignore fault. */
- }
- }
+ fl1 = fl_intern(net, fl, freq->flr_label);
+ if (fl1)
+ goto recheck;
- fl_link(np, sfl1, fl);
- return 0;
+ if (!freq->flr_label) {
+ size_t offset = offsetof(struct in6_flowlabel_req, flr_label);
- default:
- return -EINVAL;
+ if (copy_to_sockptr_offset(optval, offset, &fl->label,
+ sizeof(fl->label))) {
+ /* Intentionally ignore fault. */
+ }
}
+ fl_link(np, sfl1, fl);
+ return 0;
done:
fl_free(fl);
kfree(sfl1);
return err;
}
+int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen)
+{
+ struct in6_flowlabel_req freq;
+
+ if (optlen < sizeof(freq))
+ return -EINVAL;
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
+ return -EFAULT;
+
+ switch (freq.flr_action) {
+ case IPV6_FL_A_PUT:
+ return ipv6_flowlabel_put(sk, &freq);
+ case IPV6_FL_A_RENEW:
+ return ipv6_flowlabel_renew(sk, &freq);
+ case IPV6_FL_A_GET:
+ return ipv6_flowlabel_get(sk, &freq, optval, optlen);
+ default:
+ return -EINVAL;
+ }
+}
+
#ifdef CONFIG_PROC_FS
struct ip6fl_iter_state {
@@ -779,7 +807,7 @@ static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
- state->pid_ns = proc_pid_ns(file_inode(seq->file));
+ state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb);
rcu_read_lock_bh();
return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 781ca8c07a0d..c035a96fba3a 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -127,6 +127,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
gre_proto == htons(ETH_P_ERSPAN2)) ?
ARPHRD_ETHER : ARPHRD_IP6GRE;
int score, cand_score = 4;
+ struct net_device *ndev;
for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
if (!ipv6_addr_equal(local, &t->parms.laddr) ||
@@ -238,9 +239,9 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
if (t && t->dev->flags & IFF_UP)
return t;
- dev = ign->fb_tunnel_dev;
- if (dev && dev->flags & IFF_UP)
- return netdev_priv(dev);
+ ndev = READ_ONCE(ign->fb_tunnel_dev);
+ if (ndev && ndev->flags & IFF_UP)
+ return netdev_priv(ndev);
return NULL;
}
@@ -359,7 +360,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
return NULL;
- strlcpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name, IFNAMSIZ);
} else {
strcpy(name, "ip6gre%d");
}
@@ -381,12 +382,6 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
goto failed_free;
ip6gre_tnl_link_config(nt, 1);
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & TUNNEL_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- dev_hold(dev);
ip6gre_tunnel_link(ign, nt);
return nt;
@@ -403,7 +398,7 @@ static void ip6erspan_tunnel_uninit(struct net_device *dev)
ip6erspan_tunnel_unlink_md(ign, t);
ip6gre_tunnel_unlink(ign, t);
dst_cache_reset(&t->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
static void ip6gre_tunnel_uninit(struct net_device *dev)
@@ -413,8 +408,10 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
ip6gre_tunnel_unlink_md(ign, t);
ip6gre_tunnel_unlink(ign, t);
+ if (ign->fb_tunnel_dev == dev)
+ WRITE_ONCE(ign->fb_tunnel_dev, NULL);
dst_cache_reset(&t->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
@@ -675,8 +672,8 @@ static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
+ icmpv6_ndo_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
return -1;
}
*encap_limit = tel->encap_limit - 1;
@@ -704,6 +701,44 @@ static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
return 0;
}
+static int prepare_ip6gre_xmit_other(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ *encap_limit = t->parms.encap_limit;
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = 0;
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ return 0;
+}
+
+static struct ip_tunnel_info *skb_tunnel_info_txcheck(struct sk_buff *skb)
+{
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+ return ERR_PTR(-EINVAL);
+
+ return tun_info;
+}
+
static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
struct net_device *dev, __u8 dsfield,
struct flowi6 *fl6, int encap_limit,
@@ -711,6 +746,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
{
struct ip6_tnl *tunnel = netdev_priv(dev);
__be16 protocol;
+ __be16 flags;
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
@@ -720,21 +756,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
else
fl6->daddr = tunnel->parms.raddr;
- if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
- return -ENOMEM;
-
/* Push GRE header. */
protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
if (tunnel->parms.collect_md) {
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
- __be16 flags;
+ int tun_hlen;
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info ||
- !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
return -EINVAL;
key = &tun_info->key;
@@ -743,25 +775,32 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
fl6->daddr = key->u.ipv6.dst;
fl6->flowlabel = key->label;
fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id);
dsfield = key->tos;
flags = key->tun_flags &
(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
- tunnel->tun_hlen = gre_calc_hlen(flags);
+ tun_hlen = gre_calc_hlen(flags);
+
+ if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen))
+ return -ENOMEM;
- gre_build_header(skb, tunnel->tun_hlen,
+ gre_build_header(skb, tun_hlen,
flags, protocol,
tunnel_id_to_key32(tun_info->key.tun_id),
- (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
+ (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno))
: 0);
} else {
- if (tunnel->parms.o_flags & TUNNEL_SEQ)
- tunnel->o_seqno++;
+ if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
+ return -ENOMEM;
- gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ flags = tunnel->parms.o_flags;
+
+ gre_build_header(skb, tunnel->tun_hlen, flags,
protocol, tunnel->parms.o_key,
- htonl(tunnel->o_seqno));
+ (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno))
+ : 0);
}
return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
@@ -792,8 +831,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
return -1;
}
@@ -824,7 +863,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
&mtu, skb->protocol);
if (err != 0) {
if (err == -EMSGSIZE)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
return -1;
}
@@ -856,20 +895,18 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
struct ip6_tnl *t = netdev_priv(dev);
int encap_limit = -1;
struct flowi6 fl6;
+ __u8 dsfield = 0;
__u32 mtu;
int err;
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
-
- if (!t->parms.collect_md)
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ if (!t->parms.collect_md &&
+ prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit))
+ return -1;
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
return err;
-
- err = __gre6_xmit(skb, dev, 0, &fl6, encap_limit, &mtu, skb->protocol);
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol);
return err;
}
@@ -879,6 +916,7 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
{
struct ip6_tnl *t = netdev_priv(dev);
struct net_device_stats *stats = &t->dev->stats;
+ __be16 payload_protocol;
int ret;
if (!pskb_inet_may_pull(skb))
@@ -887,7 +925,8 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
goto tx_err;
- switch (skb->protocol) {
+ payload_protocol = skb_protocol(skb, true);
+ switch (payload_protocol) {
case htons(ETH_P_IP):
ret = ip6gre_xmit_ipv4(skb, dev);
break;
@@ -905,7 +944,8 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
+ if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb)))
+ stats->tx_errors++;
stats->tx_dropped++;
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -914,6 +954,7 @@ tx_err:
static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
+ struct ip_tunnel_info *tun_info = NULL;
struct ip6_tnl *t = netdev_priv(dev);
struct dst_entry *dst = skb_dst(skb);
struct net_device_stats *stats;
@@ -925,7 +966,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
__be16 proto;
__u32 mtu;
int nhoff;
- int thoff;
if (!pskb_inet_may_pull(skb))
goto tx_err;
@@ -946,10 +986,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
- if (skb->protocol == htons(ETH_P_IPV6) &&
- (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
- truncate = true;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
goto tx_err;
@@ -961,15 +1007,13 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
* for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
*/
if (t->parms.collect_md) {
- struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct erspan_metadata *md;
__be32 tun_id;
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info ||
- !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
goto tx_err;
key = &tun_info->key;
@@ -978,6 +1022,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id);
dsfield = key->tos;
if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
@@ -1038,7 +1083,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
/* Push GRE header. */
proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN)
: htons(ETH_P_ERSPAN2);
- gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(t->o_seqno++));
+ gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno)));
/* TooBig packet may have updated dst->dev's mtu */
if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
@@ -1050,10 +1095,10 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE) {
if (skb->protocol == htons(ETH_P_IP))
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_FRAG_NEEDED, htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
else
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
}
goto tx_err;
@@ -1062,7 +1107,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
tx_err:
stats = &t->dev->stats;
- stats->tx_errors++;
+ if (!IS_ERR(tun_info))
+ stats->tx_errors++;
stats->tx_dropped++;
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -1075,7 +1121,7 @@ static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
struct flowi6 *fl6 = &t->fl.u.ip6;
if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
}
@@ -1085,6 +1131,7 @@ static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
fl6->flowi6_oif = p->link;
fl6->flowlabel = 0;
fl6->flowi6_proto = IPPROTO_GRE;
+ fl6->fl6_gre_key = t->parms.o_key;
if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
@@ -1119,18 +1166,25 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
return;
if (rt->dst.dev) {
- dev->needed_headroom = rt->dst.dev->hard_header_len +
- t_hlen;
+ unsigned short dst_len = rt->dst.dev->hard_header_len +
+ t_hlen;
+
+ if (t->dev->header_ops)
+ dev->hard_header_len = dst_len;
+ else
+ dev->needed_headroom = dst_len;
if (set_mtu) {
- dev->mtu = rt->dst.dev->mtu - t_hlen;
+ int mtu = rt->dst.dev->mtu - t_hlen;
+
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+ mtu -= 8;
if (dev->type == ARPHRD_ETHER)
- dev->mtu -= ETH_HLEN;
+ mtu -= ETH_HLEN;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
ip6_rt_put(rt);
@@ -1145,7 +1199,12 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
- tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
+ if (tunnel->dev->header_ops)
+ tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ else
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
return t_hlen;
}
@@ -1221,8 +1280,9 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
memcpy(u->name, p->name, sizeof(u->name));
}
-static int ip6gre_tunnel_ioctl(struct net_device *dev,
- struct ifreq *ifr, int cmd)
+static int ip6gre_tunnel_siocdevprivate(struct net_device *dev,
+ struct ifreq *ifr, void __user *data,
+ int cmd)
{
int err = 0;
struct ip6_tnl_parm2 p;
@@ -1236,7 +1296,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ign->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
@@ -1247,7 +1307,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
}
memset(&p, 0, sizeof(p));
ip6gre_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
break;
@@ -1258,7 +1318,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
goto done;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
goto done;
err = -EINVAL;
@@ -1295,7 +1355,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
memset(&p, 0, sizeof(p));
ip6gre_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
} else
err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
@@ -1308,7 +1368,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
if (dev == ign->fb_tunnel_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
goto done;
err = -ENOENT;
ip6gre_tnl_parm_from_user(&p1, &p);
@@ -1375,9 +1435,9 @@ static const struct net_device_ops ip6gre_netdev_ops = {
.ndo_init = ip6gre_tunnel_init,
.ndo_uninit = ip6gre_tunnel_uninit,
.ndo_start_xmit = ip6gre_tunnel_xmit,
- .ndo_do_ioctl = ip6gre_tunnel_ioctl,
+ .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1414,26 +1474,23 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
static void ip6gre_tnl_init_features(struct net_device *dev)
{
struct ip6_tnl *nt = netdev_priv(dev);
+ __be16 flags;
- dev->features |= GRE6_FEATURES;
+ dev->features |= GRE6_FEATURES | NETIF_F_LLTX;
dev->hw_features |= GRE6_FEATURES;
- if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported, nor
- * can we support 2 levels of outer headers requiring
- * an update.
- */
- if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
- nt->encap.type == TUNNEL_ENCAP_NONE) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- }
+ flags = nt->parms.o_flags;
- /* Can use a lockless transmit, unless we generate
- * output sequences
- */
- dev->features |= NETIF_F_LLTX;
- }
+ /* TCP offload with GRE SEQ is not supported, nor can we support 2
+ * levels of outer headers requiring an update.
+ */
+ if (flags & TUNNEL_SEQ)
+ return;
+ if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE)
+ return;
+
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
}
static int ip6gre_tunnel_init_common(struct net_device *dev)
@@ -1472,6 +1529,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
}
ip6gre_tnl_init_features(dev);
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
return 0;
cleanup_dst_cache_init:
@@ -1496,7 +1554,7 @@ static int ip6gre_tunnel_init(struct net_device *dev)
if (tunnel->parms.collect_md)
return 0;
- memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
if (ipv6_addr_any(&tunnel->parms.raddr))
@@ -1514,14 +1572,12 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
strcpy(tunnel->parms.name, dev->name);
tunnel->hlen = sizeof(struct ipv6hdr) + 4;
-
- dev_hold(dev);
}
static struct inet6_protocol ip6gre_protocol __read_mostly = {
.handler = gre_rcv,
.err_handler = ip6gre_err,
- .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+ .flags = INET6_PROTO_FINAL,
};
static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
@@ -1559,17 +1615,18 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
static int __net_init ip6gre_init_net(struct net *net)
{
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct net_device *ndev;
int err;
if (!net_has_fallback_tunnels(net))
return 0;
- ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
- NET_NAME_UNKNOWN,
- ip6gre_tunnel_setup);
- if (!ign->fb_tunnel_dev) {
+ ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
+ NET_NAME_UNKNOWN, ip6gre_tunnel_setup);
+ if (!ndev) {
err = -ENOMEM;
goto err_alloc_dev;
}
+ ign->fb_tunnel_dev = ndev;
dev_net_set(ign->fb_tunnel_dev, net);
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
@@ -1589,7 +1646,7 @@ static int __net_init ip6gre_init_net(struct net *net)
return 0;
err_reg_dev:
- free_netdev(ign->fb_tunnel_dev);
+ free_netdev(ndev);
err_alloc_dev:
return err;
}
@@ -1813,7 +1870,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1864,6 +1921,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
ip6erspan_tnl_link_config(tunnel, 1);
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
return 0;
cleanup_dst_cache_init:
@@ -1881,7 +1939,7 @@ static const struct net_device_ops ip6erspan_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1963,8 +2021,6 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
if (tb[IFLA_MTU])
ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
- dev_hold(dev);
-
out:
return err;
}
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index e0086758b6ee..9e3574880cb0 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -9,6 +9,8 @@
#if IS_ENABLED(CONFIG_IPV6)
+#if !IS_BUILTIN(CONFIG_IPV6)
+
static ip6_icmp_send_t __rcu *ip6_icmp_send;
int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
@@ -31,25 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
}
EXPORT_SYMBOL(inet6_unregister_icmp_sender);
-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct inet6_skb_parm *parm)
{
ip6_icmp_send_t *send;
rcu_read_lock();
send = rcu_dereference(ip6_icmp_send);
-
- if (!send)
- goto out;
- send(skb, type, code, info, NULL);
-out:
+ if (send)
+ send(skb, type, code, info, NULL, parm);
rcu_read_unlock();
}
-EXPORT_SYMBOL(icmpv6_send);
+EXPORT_SYMBOL(__icmpv6_send);
+#endif
#if IS_ENABLED(CONFIG_NF_NAT)
#include <net/netfilter/nf_conntrack.h>
void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
{
+ struct inet6_skb_parm parm = { 0 };
struct sk_buff *cloned_skb = NULL;
enum ip_conntrack_info ctinfo;
struct in6_addr orig_ip;
@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
ct = nf_ct_get(skb_in, &ctinfo);
if (!ct || !(ct->status & IPS_SRC_NAT)) {
- icmpv6_send(skb_in, type, code, info);
+ __icmpv6_send(skb_in, type, code, info, &parm);
return;
}
@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
orig_ip = ipv6_hdr(skb_in)->saddr;
ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
- icmpv6_send(skb_in, type, code, info);
+ __icmpv6_send(skb_in, type, code, info, &parm);
ipv6_hdr(skb_in)->saddr = orig_ip;
out:
consume_skb(cloned_skb);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 7b089d0ac8cd..e1ebf5e42ebe 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -32,6 +32,7 @@
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/udp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
@@ -44,21 +45,23 @@
#include <net/inet_ecn.h>
#include <net/dst_metadata.h>
-INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *));
static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- void (*edemux)(struct sk_buff *skb);
-
- if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
- const struct inet6_protocol *ipprot;
-
- ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
- if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
- INDIRECT_CALL_2(edemux, tcp_v6_early_demux,
- udp_v6_early_demux, skb);
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+ !skb_dst(skb) && !skb->sk) {
+ switch (ipv6_hdr(skb)->nexthdr) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+ tcp_v6_early_demux(skb);
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+ udp_v6_early_demux(skb);
+ break;
+ }
}
+
if (!skb_valid_dst(skb))
ip6_route_input(skb);
}
@@ -145,12 +148,14 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
struct net *net)
{
+ enum skb_drop_reason reason;
const struct ipv6hdr *hdr;
u32 pkt_len;
struct inet6_dev *idev;
if (skb->pkt_type == PACKET_OTHERHOST) {
- kfree_skb(skb);
+ dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+ kfree_skb_reason(skb, SKB_DROP_REASON_OTHERHOST);
return NULL;
}
@@ -160,9 +165,12 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
__IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len);
+ SKB_DR_SET(reason, NOT_SPECIFIED);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
!idev || unlikely(idev->cnf.disable_ipv6)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ if (idev && unlikely(idev->cnf.disable_ipv6))
+ SKB_DR_SET(reason, IPV6DISABLED);
goto drop;
}
@@ -186,8 +194,10 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
hdr = ipv6_hdr(skb);
- if (hdr->version != 6)
+ if (hdr->version != 6) {
+ SKB_DR_SET(reason, UNHANDLED_PROTO);
goto err;
+ }
__IP6_ADD_STATS(net, idev,
IPSTATS_MIB_NOECTPKTS +
@@ -225,8 +235,10 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
if (!ipv6_addr_is_multicast(&hdr->daddr) &&
(skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) &&
- idev->cnf.drop_unicast_in_l2_multicast)
+ idev->cnf.drop_unicast_in_l2_multicast) {
+ SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST);
goto err;
+ }
/* RFC4291 2.7
* Nodes must not originate a packet to a multicast address whose scope
@@ -245,16 +257,6 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
if (ipv6_addr_is_multicast(&hdr->saddr))
goto err;
- /* While RFC4291 is not explicit about v4mapped addresses
- * in IPv6 headers, it seems clear linux dual-stack
- * model can not deal properly with these.
- * Security models could be fooled by ::ffff:127.0.0.1 for example.
- *
- * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02
- */
- if (ipv6_addr_v4mapped(&hdr->saddr))
- goto err;
-
skb->transport_header = skb->network_header + sizeof(*hdr);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
@@ -265,12 +267,11 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
__IP6_INC_STATS(net,
idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ SKB_DR_SET(reason, PKT_TOO_SMALL);
goto drop;
}
- if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- goto drop;
- }
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+ goto err;
hdr = ipv6_hdr(skb);
}
@@ -285,14 +286,16 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
rcu_read_unlock();
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
err:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ SKB_DR_OR(reason, IP_INHDR);
drop:
rcu_read_unlock();
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NULL;
}
@@ -351,7 +354,6 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
ip6_sublist_rcv(&sublist, curr_dev, curr_net);
}
-INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *));
/*
@@ -363,6 +365,7 @@ void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
+ SKB_DR(reason);
bool raw;
/*
@@ -422,12 +425,16 @@ resubmit_final:
if (ipv6_addr_is_multicast(&hdr->daddr) &&
!ipv6_chk_mcast_addr(dev, &hdr->daddr,
&hdr->saddr) &&
- !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
+ !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) {
+ SKB_DR_SET(reason, IP_INADDRERRORS);
goto discard;
+ }
}
if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
- !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto discard;
+ }
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv,
skb);
@@ -453,8 +460,11 @@ resubmit_final:
IPSTATS_MIB_INUNKNOWNPROTOS);
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_UNK_NEXTHDR, nhoff);
+ SKB_DR_SET(reason, IP_NOPROTO);
+ } else {
+ SKB_DR_SET(reason, XFRM_POLICY);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
} else {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
@@ -464,11 +474,12 @@ resubmit_final:
discard:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ skb_clear_delivery_time(skb);
rcu_read_lock();
ip6_protocol_deliver_rcu(net, skb, 0, false);
rcu_read_unlock();
@@ -518,7 +529,7 @@ int ip6_mc_input(struct sk_buff *skb)
/*
* IPv6 multicast router mode is now supported ;)
*/
- if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+ if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
!(ipv6_addr_type(&hdr->daddr) &
(IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 7fbb44736a34..3ee345672849 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -13,6 +13,9 @@
#include <net/protocol.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/gro.h>
#include "ip6_offload.h"
@@ -74,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct ipv6hdr *ipv6h;
const struct net_offload *ops;
- int proto;
+ int proto, nexthdr;
struct frag_hdr *fptr;
unsigned int payload_len;
u8 *prevhdr;
@@ -84,6 +87,28 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
bool gso_partial;
skb_reset_network_header(skb);
+ nexthdr = ipv6_has_hopopt_jumbo(skb);
+ if (nexthdr) {
+ const int hophdr_len = sizeof(struct hop_jumbo_hdr);
+ int err;
+
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* remove the HBH header.
+ * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
+ */
+ memmove(skb_mac_header(skb) + hophdr_len,
+ skb_mac_header(skb),
+ ETH_HLEN + sizeof(struct ipv6hdr));
+ skb->data += hophdr_len;
+ skb->len -= hophdr_len;
+ skb->network_header += hophdr_len;
+ skb->mac_header += hophdr_len;
+ ipv6h = (struct ipv6hdr *)skb->data;
+ ipv6h->nexthdr = nexthdr;
+ }
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
goto out;
@@ -111,6 +136,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (likely(ops && ops->callbacks.gso_segment)) {
skb_reset_transport_header(skb);
segs = ops->callbacks.gso_segment(skb, features);
+ if (!segs)
+ skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
}
if (IS_ERR_OR_NULL(segs))
@@ -177,10 +204,6 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
return len;
}
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *,
- struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
- struct sk_buff *));
INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
struct sk_buff *skb)
{
@@ -196,12 +219,9 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
off = skb_gro_offset(skb);
hlen = off + sizeof(*iph);
- iph = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- iph = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!iph))
- goto out;
- }
+ iph = skb_gro_header(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
skb_set_network_header(skb, off);
skb_gro_pull(skb, sizeof(*iph));
@@ -209,11 +229,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
flush += ntohs(iph->payload_len) != skb_gro_len(skb);
- rcu_read_lock();
proto = iph->nexthdr;
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) {
- __pskb_pull(skb, skb_gro_offset(skb));
+ pskb_pull(skb, skb_gro_offset(skb));
skb_gro_frag0_invalidate(skb);
proto = ipv6_gso_pull_exthdrs(skb, proto);
skb_gro_pull(skb, -skb_transport_offset(skb));
@@ -222,7 +241,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
iph = ipv6_hdr(skb);
}
@@ -249,9 +268,9 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
* memcmp() alone below is sufficient, right?
*/
if ((first_word & htonl(0xF00FFFFF)) ||
- !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
- !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
- *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) {
+ !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
+ !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
+ iph->nexthdr != iph2->nexthdr) {
not_same_flow:
NAPI_GRO_CB(p)->same_flow = 0;
continue;
@@ -262,7 +281,8 @@ not_same_flow:
goto not_same_flow;
}
/* flush if Traffic Class fields are different */
- NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
+ NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
+ (__force __be32)(iph->hop_limit ^ iph2->hop_limit));
NAPI_GRO_CB(p)->flush |= flush;
/* If the previous IP ID value was based on an atomic
@@ -280,9 +300,6 @@ not_same_flow:
pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
ops->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
-
out:
skb_gro_flush_final(skb, pp, flush);
@@ -319,33 +336,55 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
return inet_gro_receive(head, skb);
}
-INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *, int));
-INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
- struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+ struct ipv6hdr *iph;
int err = -ENOSYS;
+ u32 payload_len;
if (skb->encapsulation) {
skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
skb_set_inner_network_header(skb, nhoff);
}
- iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
-
- rcu_read_lock();
+ payload_len = skb->len - nhoff - sizeof(*iph);
+ if (unlikely(payload_len > IPV6_MAXPLEN)) {
+ struct hop_jumbo_hdr *hop_jumbo;
+ int hoplen = sizeof(*hop_jumbo);
+
+ /* Move network header left */
+ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb),
+ skb->transport_header - skb->mac_header);
+ skb->data -= hoplen;
+ skb->len += hoplen;
+ skb->mac_header -= hoplen;
+ skb->network_header -= hoplen;
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1);
+
+ /* Build hop-by-hop options */
+ hop_jumbo->nexthdr = iph->nexthdr;
+ hop_jumbo->hdrlen = 0;
+ hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+ hop_jumbo->tlv_len = 4;
+ hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen);
+
+ iph->nexthdr = NEXTHDR_HOP;
+ iph->payload_len = 0;
+ } else {
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ iph->payload_len = htons(payload_len);
+ }
nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
udp6_gro_complete, skb, nhoff);
-out_unlock:
- rcu_read_unlock();
-
+out:
return err;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 087304427bbb..e19507614f64 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -54,23 +54,35 @@
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct net_device *dev = dst->dev;
- const struct in6_addr *nexthop;
+ struct inet6_dev *idev = ip6_dst_idev(dst);
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *daddr, *nexthop;
+ struct ipv6hdr *hdr;
struct neighbour *neigh;
int ret;
- if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
- struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ return -ENOMEM;
+ }
+ }
+ hdr = ipv6_hdr(skb);
+ daddr = &hdr->daddr;
+ if (ipv6_addr_is_multicast(daddr)) {
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
((mroute6_is_socket(net, skb) &&
!(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
- ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
- &ipv6_hdr(skb)->saddr))) {
+ ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
/* Do not check for IFF_ALLMULTI; multicast routing
@@ -81,7 +93,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
- if (ipv6_hdr(skb)->hop_limit == 0) {
+ if (hdr->hop_limit == 0) {
IP6_INC_STATS(net, idev,
IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
@@ -90,9 +102,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
}
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
-
- if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
- IPV6_ADDR_SCOPE_NODELOCAL &&
+ if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
!(dev->flags & IFF_LOOPBACK)) {
kfree_skb(skb);
return 0;
@@ -107,34 +117,77 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
}
rcu_read_lock_bh();
- nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
- neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
- if (unlikely(!neigh))
- neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
- if (!IS_ERR(neigh)) {
- sock_confirm_neigh(skb, neigh);
- ret = neigh_output(neigh, skb, false);
- rcu_read_unlock_bh();
- return ret;
+ nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
+ neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
+
+ if (unlikely(IS_ERR_OR_NULL(neigh))) {
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
+ if (IS_ERR(neigh)) {
+ rcu_read_unlock_bh();
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
+ return -EINVAL;
+ }
}
+ sock_confirm_neigh(skb, neigh);
+ ret = neigh_output(neigh, skb, false);
rcu_read_unlock_bh();
+ return ret;
+}
- IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
- return -EINVAL;
+static int
+ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
+{
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features;
+ int ret = 0;
+
+ /* Please see corresponding comment in ip_finish_output_gso
+ * describing the cases where GSO segment length exceeds the
+ * egress MTU.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ int err;
+
+ skb_mark_not_on_list(segs);
+ err = ip6_fragment(net, sk, segs, ip6_finish_output2);
+ if (err && ret == 0)
+ ret = err;
+ }
+
+ return ret;
}
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
- IPCB(skb)->flags |= IPSKB_REROUTED;
+ IP6CB(skb)->flags |= IP6SKB_REROUTED;
return dst_output(net, sk, skb);
}
#endif
- if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+ mtu = ip6_skb_dst_mtu(skb);
+ if (skb_is_gso(skb) &&
+ !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
+ !skb_gso_validate_network_len(skb, mtu))
+ return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
+
+ if ((skb->len > mtu && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
return ip6_fragment(net, sk, skb, ip6_finish_output2);
@@ -149,11 +202,10 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
switch (ret) {
case NET_XMIT_SUCCESS:
- return __ip6_finish_output(net, sk, skb);
case NET_XMIT_CN:
return __ip6_finish_output(net, sk, skb) ? : ret;
default:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
return ret;
}
}
@@ -168,7 +220,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (unlikely(idev->cnf.disable_ipv6)) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
return 0;
}
@@ -177,6 +229,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ip6_finish_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
+EXPORT_SYMBOL(ip6_output);
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
@@ -199,6 +252,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst->dev;
+ struct inet6_dev *idev = ip6_dst_idev(dst);
+ struct hop_jumbo_hdr *hop_jumbo;
+ int hoplen = sizeof(*hop_jumbo);
unsigned int head_room;
struct ipv6hdr *hdr;
u8 proto = fl6->flowi6_proto;
@@ -206,22 +263,16 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
int hlimit = -1;
u32 mtu;
- head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
+ head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
if (opt)
head_room += opt->opt_nflen + opt->opt_flen;
- if (unlikely(skb_headroom(skb) < head_room)) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
- if (!skb2) {
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
+ if (unlikely(head_room > skb_headroom(skb))) {
+ skb = skb_expand_head(skb, head_room);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
return -ENOBUFS;
}
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
}
if (opt) {
@@ -235,6 +286,20 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
&fl6->saddr);
}
+ if (unlikely(seg_len > IPV6_MAXPLEN)) {
+ hop_jumbo = skb_push(skb, hoplen);
+
+ hop_jumbo->nexthdr = proto;
+ hop_jumbo->hdrlen = 0;
+ hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+ hop_jumbo->tlv_len = 4;
+ hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
+
+ proto = IPPROTO_HOPOPTS;
+ seg_len = 0;
+ IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
+ }
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
hdr = ipv6_hdr(skb);
@@ -263,8 +328,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
- IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUT, skb->len);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
/* if egress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -277,17 +341,17 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
* we promote our socket to non const
*/
return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, (struct sock *)sk, skb, NULL, dst->dev,
+ net, (struct sock *)sk, skb, NULL, dev,
dst_output);
}
- skb->dev = dst->dev;
+ skb->dev = dev;
/* ipv6_local_error() does not require socket lock,
* we promote our socket to non const
*/
ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return -EMSGSIZE;
}
@@ -395,7 +459,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
}
#endif
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
return dst_output(net, sk, skb);
}
@@ -419,13 +483,15 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
int ip6_forward(struct sk_buff *skb)
{
- struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
struct net *net = dev_net(dst->dev);
+ struct inet6_dev *idev;
+ SKB_DR(reason);
u32 mtu;
+ idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
if (net->ipv6.devconf_all->forwarding == 0)
goto error;
@@ -438,7 +504,9 @@ int ip6_forward(struct sk_buff *skb)
if (skb_warn_if_lro(skb))
goto drop;
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ if (!net->ipv6.devconf_all->disable_policy &&
+ (!idev || !idev->cnf.disable_policy) &&
+ !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -467,12 +535,10 @@ int ip6_forward(struct sk_buff *skb)
* check and decrement ttl
*/
if (hdr->hop_limit <= 1) {
- /* Force OUTPUT device used as source address */
- skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return -ETIMEDOUT;
}
@@ -480,9 +546,10 @@ int ip6_forward(struct sk_buff *skb)
if (net->ipv6.devconf_all->proxy_ndp &&
pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
int proxied = ip6_forward_proxy_check(skb);
- if (proxied > 0)
+ if (proxied > 0) {
+ hdr->hop_limit--;
return ip6_input(skb);
- else if (proxied < 0) {
+ } else if (proxied < 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -490,6 +557,7 @@ int ip6_forward(struct sk_buff *skb)
if (!xfrm6_route_forward(skb)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
}
dst = skb_dst(skb);
@@ -538,7 +606,7 @@ int ip6_forward(struct sk_buff *skb)
}
}
- mtu = ip6_dst_mtu_forward(dst);
+ mtu = ip6_dst_mtu_maybe_forward(dst, true);
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
@@ -549,7 +617,7 @@ int ip6_forward(struct sk_buff *skb)
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
__IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
return -EMSGSIZE;
}
@@ -571,8 +639,9 @@ int ip6_forward(struct sk_buff *skb)
error:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
+ SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return -EINVAL;
}
@@ -766,6 +835,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
inet6_sk(skb->sk) : NULL;
+ bool mono_delivery_time = skb->mono_delivery_time;
struct ip6_frag_state state;
unsigned int mtu, hlen, nexthdr_offset;
ktime_t tstamp = skb->tstamp;
@@ -856,7 +926,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (iter.frag)
ip6_fraglist_prepare(skb, &iter);
- skb->tstamp = tstamp;
+ skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -915,7 +985,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
- frag->tstamp = tstamp;
+ skb_set_delivery_time(frag, tstamp, mono_delivery_time);
err = output(net, sk, frag);
if (err)
goto fail;
@@ -930,7 +1000,7 @@ slow_path:
fail_toobig:
if (skb->sk && dst_allfrag(skb_dst(skb)))
- sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(skb->sk);
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
err = -EMSGSIZE;
@@ -987,8 +1057,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
#ifdef CONFIG_IPV6_SUBTREES
ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
- (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
- (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
dst_release(dst);
dst = NULL;
}
@@ -1016,13 +1085,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* ip6_route_output will fail given src=any saddr, though, so
* that's why we try it again later.
*/
- if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ if (ipv6_addr_any(&fl6->saddr)) {
struct fib6_info *from;
struct rt6_info *rt;
- bool had_dst = *dst != NULL;
- if (!had_dst)
- *dst = ip6_route_output(net, sk, fl6);
+ *dst = ip6_route_output(net, sk, fl6);
rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
rcu_read_lock();
@@ -1039,7 +1106,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* never existed and let the SA-enabled version take
* over.
*/
- if (!had_dst && (*dst)->error) {
+ if ((*dst)->error) {
dst_release(*dst);
*dst = NULL;
}
@@ -1117,6 +1184,7 @@ out_err_release:
/**
* ip6_dst_lookup - perform route lookup on flow
+ * @net: Network namespace to perform lookup in
* @sk: socket which provides route info
* @dst: pointer to dst_entry * for result
* @fl6: flow to lookup
@@ -1135,6 +1203,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
* ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ * @net: Network namespace to perform lookup in
* @sk: socket which provides route info
* @fl6: flow to lookup
* @final_dst: final destination address for ipsec lookup
@@ -1196,6 +1265,74 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
+/**
+ * ip6_dst_lookup_tunnel - perform route lookup on tunnel
+ * @skb: Packet for which lookup is done
+ * @dev: Tunnel device
+ * @net: Network namespace of tunnel device
+ * @sock: Socket which provides route info
+ * @saddr: Memory to store the src ip address
+ * @info: Tunnel information
+ * @protocol: IP protocol
+ * @use_cache: Flag to enable cache usage
+ * This function performs a route lookup on a tunnel
+ *
+ * It returns a valid dst pointer and stores src address to be used in
+ * tunnel in param saddr on success, else a pointer encoded error code.
+ */
+
+struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net,
+ struct socket *sock,
+ struct in6_addr *saddr,
+ const struct ip_tunnel_info *info,
+ u8 protocol,
+ bool use_cache)
+{
+ struct dst_entry *dst = NULL;
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache *dst_cache;
+#endif
+ struct flowi6 fl6;
+ __u8 prio;
+
+#ifdef CONFIG_DST_CACHE
+ dst_cache = (struct dst_cache *)&info->dst_cache;
+ if (use_cache) {
+ dst = dst_cache_get_ip6(dst_cache, saddr);
+ if (dst)
+ return dst;
+ }
+#endif
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = protocol;
+ fl6.daddr = info->key.u.ipv6.dst;
+ fl6.saddr = info->key.u.ipv6.src;
+ prio = info->key.tos;
+ fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
+ NULL);
+ if (IS_ERR(dst)) {
+ netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (dst->dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
+ dst_release(dst);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (use_cache)
+ dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
+#endif
+ *saddr = fl6.saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
+
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
gfp_t gfp)
{
@@ -1234,11 +1371,16 @@ static void ip6_append_data_mtu(unsigned int *mtu,
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
- struct rt6_info *rt, struct flowi6 *fl6)
+ struct rt6_info *rt)
{
struct ipv6_pinfo *np = inet6_sk(sk);
unsigned int mtu;
- struct ipv6_txoptions *opt = ipc6->opt;
+ struct ipv6_txoptions *nopt, *opt = ipc6->opt;
+
+ /* callers pass dst together with a reference, set it first so
+ * ip6_cork_release() can put it down even in case of an error.
+ */
+ cork->base.dst = &rt->dst;
/*
* setup for corking
@@ -1247,39 +1389,32 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (WARN_ON(v6_cork->opt))
return -EINVAL;
- v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
- if (unlikely(!v6_cork->opt))
+ nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+ if (unlikely(!nopt))
return -ENOBUFS;
- v6_cork->opt->tot_len = sizeof(*opt);
- v6_cork->opt->opt_flen = opt->opt_flen;
- v6_cork->opt->opt_nflen = opt->opt_nflen;
+ nopt->tot_len = sizeof(*opt);
+ nopt->opt_flen = opt->opt_flen;
+ nopt->opt_nflen = opt->opt_nflen;
- v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
- sk->sk_allocation);
- if (opt->dst0opt && !v6_cork->opt->dst0opt)
+ nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
+ if (opt->dst0opt && !nopt->dst0opt)
return -ENOBUFS;
- v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
- sk->sk_allocation);
- if (opt->dst1opt && !v6_cork->opt->dst1opt)
+ nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
+ if (opt->dst1opt && !nopt->dst1opt)
return -ENOBUFS;
- v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
- sk->sk_allocation);
- if (opt->hopopt && !v6_cork->opt->hopopt)
+ nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
+ if (opt->hopopt && !nopt->hopopt)
return -ENOBUFS;
- v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
- sk->sk_allocation);
- if (opt->srcrt && !v6_cork->opt->srcrt)
+ nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
+ if (opt->srcrt && !nopt->srcrt)
return -ENOBUFS;
/* need source address above miyazawa*/
}
- dst_hold(&rt->dst);
- cork->base.dst = &rt->dst;
- cork->fl.u.ip6 = *fl6;
v6_cork->hop_limit = ipc6->hlimit;
v6_cork->tclass = ipc6->tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
@@ -1292,8 +1427,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (np->frag_size)
mtu = np->frag_size;
}
- if (mtu < IPV6_MIN_MTU)
- return -EINVAL;
cork->base.fragsize = mtu;
cork->base.gso_size = ipc6->gso_size;
cork->base.tx_flags = 0;
@@ -1310,17 +1443,18 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
}
static int __ip6_append_data(struct sock *sk,
- struct flowi6 *fl6,
struct sk_buff_head *queue,
- struct inet_cork *cork,
+ struct inet_cork_full *cork_full,
struct inet6_cork *v6_cork,
struct page_frag *pfrag,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
+ void *from, size_t length, int transhdrlen,
unsigned int flags, struct ipcm6_cookie *ipc6)
{
struct sk_buff *skb, *skb_prev = NULL;
+ struct inet_cork *cork = &cork_full->base;
+ struct flowi6 *fl6 = &cork_full->fl.u.ip6;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
@@ -1329,6 +1463,7 @@ static int __ip6_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
+ bool zc = false;
u32 tskey = 0;
struct rt6_info *rt = (struct rt6_info *)cork->dst;
struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1349,14 +1484,12 @@ static int __ip6_append_data(struct sock *sk,
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
- tskey = sk->sk_tskey++;
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
(opt ? opt->opt_nflen : 0);
- maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
- sizeof(struct frag_hdr);
headersize = sizeof(struct ipv6hdr) +
(opt ? opt->opt_flen + opt->opt_nflen : 0) +
@@ -1364,6 +1497,13 @@ static int __ip6_append_data(struct sock *sk,
sizeof(struct frag_hdr) : 0) +
rt->rt6i_nfheader_len;
+ if (mtu <= fragheaderlen ||
+ ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
+ goto emsgsize;
+
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
+ sizeof(struct frag_hdr);
+
/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
* the first fragment
*/
@@ -1372,6 +1512,7 @@ static int __ip6_append_data(struct sock *sk,
if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
(sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_ICMPV6 ||
sk->sk_protocol == IPPROTO_RAW)) {
ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
sizeof(struct ipv6hdr));
@@ -1400,17 +1541,35 @@ emsgsize:
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
- if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
- if (!uarg)
- return -ENOBUFS;
- extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
- if (rt->dst.dev->features & NETIF_F_SG &&
- csummode == CHECKSUM_PARTIAL) {
- paged = true;
- } else {
- uarg->zerocopy = 0;
- skb_zcopy_set(skb, uarg, &extra_uref);
+ if ((flags & MSG_ZEROCOPY) && length) {
+ struct msghdr *msg = from;
+
+ if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
+ if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
+ return -EINVAL;
+
+ /* Leave uarg NULL if can't zerocopy, callers should
+ * be able to handle it.
+ */
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ uarg = msg->msg_ubuf;
+ }
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ } else {
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
}
}
@@ -1420,7 +1579,7 @@ emsgsize:
* Otherwise, we need to reserve fragment header and
* fragment alignment (= 8-15 octects, in total).
*
- * Note that we may need to "move" the data from the tail of
+ * Note that we may need to "move" the data from the tail
* of the buffer to the new fragment when we split
* the message.
*
@@ -1445,7 +1604,7 @@ emsgsize:
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
- unsigned int alloclen;
+ unsigned int alloclen, alloc_extra;
unsigned int pagedlen;
alloc_new_skb:
/* There's no room in the current skb */
@@ -1472,17 +1631,28 @@ alloc_new_skb:
fraglen = datalen + fragheaderlen;
pagedlen = 0;
+ alloc_extra = hh_len;
+ alloc_extra += dst_exthdrlen;
+ alloc_extra += rt->dst.trailer_len;
+
+ /* We just reserve space for fragment header.
+ * Note: this may be overallocation if the message
+ * (without MSG_MORE) fits into the MTU.
+ */
+ alloc_extra += sizeof(struct frag_hdr);
+
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else if (!paged)
+ else if (!paged &&
+ (fraglen + alloc_extra < SKB_MAX_ALLOC ||
+ !(rt->dst.dev->features & NETIF_F_SG)))
alloclen = fraglen;
else {
- alloclen = min_t(int, fraglen, MAX_HEADER);
- pagedlen = fraglen - alloclen;
+ alloclen = fragheaderlen + transhdrlen;
+ pagedlen = datalen - transhdrlen;
}
-
- alloclen += dst_exthdrlen;
+ alloclen += alloc_extra;
if (datalen != length + fraggap) {
/*
@@ -1492,30 +1662,21 @@ alloc_new_skb:
datalen += rt->dst.trailer_len;
}
- alloclen += rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
- /*
- * We just reserve space for fragment header.
- * Note: this may be overallocation if the message
- * (without MSG_MORE) fits into the MTU.
- */
- alloclen += sizeof(struct frag_hdr);
-
copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy < 0) {
err = -EINVAL;
goto error;
}
if (transhdrlen) {
- skb = sock_alloc_send_skb(sk,
- alloclen + hh_len,
+ skb = sock_alloc_send_skb(sk, alloclen,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
2 * sk->sk_sndbuf)
- skb = alloc_skb(alloclen + hh_len,
+ skb = alloc_skb(alloclen,
sk->sk_allocation);
if (unlikely(!skb))
err = -ENOBUFS;
@@ -1543,7 +1704,7 @@ alloc_new_skb:
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
@@ -1599,13 +1760,14 @@ alloc_new_skb:
err = -EFAULT;
goto error;
}
- } else if (!uarg || !uarg->zerocopy) {
+ } else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
+ skb_zcopy_downgrade_managed(skb);
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
err = -EMSGSIZE;
@@ -1645,8 +1807,7 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
- if (uarg)
- sock_zerocopy_put_abort(uarg, extra_uref);
+ net_zcopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
@@ -1656,7 +1817,7 @@ error:
int ip6_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
+ void *from, size_t length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags)
{
@@ -1671,34 +1832,46 @@ int ip6_append_data(struct sock *sk,
/*
* setup for corking
*/
+ dst_hold(&rt->dst);
err = ip6_setup_cork(sk, &inet->cork, &np->cork,
- ipc6, rt, fl6);
+ ipc6, rt);
if (err)
return err;
+ inet->cork.fl.u.ip6 = *fl6;
exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
length += exthdrlen;
transhdrlen += exthdrlen;
} else {
- fl6 = &inet->cork.fl.u.ip6;
transhdrlen = 0;
}
- return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
+ return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
&np->cork, sk_page_frag(sk), getfrag,
from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
+static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
+{
+ struct dst_entry *dst = cork->base.dst;
+
+ cork->base.dst = NULL;
+ cork->base.flags &= ~IPCORK_ALLFRAG;
+ skb_dst_set(skb, dst);
+}
+
static void ip6_cork_release(struct inet_cork_full *cork,
struct inet6_cork *v6_cork)
{
if (v6_cork->opt) {
- kfree(v6_cork->opt->dst0opt);
- kfree(v6_cork->opt->dst1opt);
- kfree(v6_cork->opt->hopopt);
- kfree(v6_cork->opt->srcrt);
- kfree(v6_cork->opt);
+ struct ipv6_txoptions *opt = v6_cork->opt;
+
+ kfree(opt->dst0opt);
+ kfree(opt->dst1opt);
+ kfree(opt->hopopt);
+ kfree(opt->srcrt);
+ kfree(opt);
v6_cork->opt = NULL;
}
@@ -1707,7 +1880,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
cork->base.dst = NULL;
cork->base.flags &= ~IPCORK_ALLFRAG;
}
- memset(&cork->fl, 0, sizeof(cork->fl));
}
struct sk_buff *__ip6_make_skb(struct sock *sk,
@@ -1717,7 +1889,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
- struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+ struct in6_addr *final_dst;
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
@@ -1747,9 +1919,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
/* Allow local fragmentation. */
skb->ignore_df = ip6_sk_ignore_df(sk);
-
- *final_dst = fl6->daddr;
__skb_pull(skb, skb_network_header_len(skb));
+
+ final_dst = &fl6->daddr;
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
@@ -1769,10 +1941,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
skb->priority = sk->sk_priority;
skb->mark = cork->base.mark;
-
skb->tstamp = cork->base.transmit_time;
- skb_dst_set(skb, dst_clone(&rt->dst));
+ ip6_cork_steal_dst(skb, cork);
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if (proto == IPPROTO_ICMPV6) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
@@ -1843,27 +2014,27 @@ EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags,
- struct inet_cork_full *cork)
+ void *from, size_t length, int transhdrlen,
+ struct ipcm6_cookie *ipc6, struct rt6_info *rt,
+ unsigned int flags, struct inet_cork_full *cork)
{
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
int err;
- if (flags & MSG_PROBE)
+ if (flags & MSG_PROBE) {
+ dst_release(&rt->dst);
return NULL;
+ }
__skb_queue_head_init(&queue);
cork->base.flags = 0;
cork->base.addr = 0;
cork->base.opt = NULL;
- cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
if (err) {
ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
@@ -1871,7 +2042,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
+ err = __ip6_append_data(sk, &queue, cork, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 4703b09808d0..2fb4c6ad7243 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -89,38 +89,17 @@ struct ip6_tnl_net {
struct ip6_tnl __rcu *collect_md_tun;
};
-static struct net_device_stats *ip6_get_stats(struct net_device *dev)
+static inline int ip6_tnl_mpls_supported(void)
{
- struct pcpu_sw_netstats tmp, sum = { 0 };
- int i;
-
- for_each_possible_cpu(i) {
- unsigned int start;
- const struct pcpu_sw_netstats *tstats =
- per_cpu_ptr(dev->tstats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&tstats->syncp);
- tmp.rx_packets = tstats->rx_packets;
- tmp.rx_bytes = tstats->rx_bytes;
- tmp.tx_packets = tstats->tx_packets;
- tmp.tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
-
- sum.rx_packets += tmp.rx_packets;
- sum.rx_bytes += tmp.rx_bytes;
- sum.tx_packets += tmp.tx_packets;
- sum.tx_bytes += tmp.tx_bytes;
- }
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
+ return IS_ENABLED(CONFIG_MPLS);
}
+#define for_each_ip6_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
/**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @net: network namespace
* @link: ifindex of underlying interface
* @remote: the address of the tunnel exit-point
* @local: the address of the tunnel entry-point
@@ -131,9 +110,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
* else %NULL
**/
-#define for_each_ip6_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
static struct ip6_tnl *
ip6_tnl_lookup(struct net *net, int link,
const struct in6_addr *remote, const struct in6_addr *local)
@@ -198,6 +174,7 @@ ip6_tnl_lookup(struct net *net, int link,
/**
* ip6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
* @p: parameters containing tunnel end-points
*
* Description:
@@ -224,6 +201,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p)
/**
* ip6_tnl_link - add tunnel to hash table
+ * @ip6n: the private data for ip6_vti in the netns
* @t: tunnel to be added
**/
@@ -240,6 +218,7 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
/**
* ip6_tnl_unlink - remove tunnel from hash table
+ * @ip6n: the private data for ip6_vti in the netns
* @t: tunnel to be removed
**/
@@ -278,8 +257,6 @@ static int ip6_tnl_create2(struct net_device *dev)
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
int err;
- t = netdev_priv(dev);
-
dev->rtnl_link_ops = &ip6_link_ops;
err = register_netdevice(dev);
if (err < 0)
@@ -287,7 +264,6 @@ static int ip6_tnl_create2(struct net_device *dev)
strcpy(t->parms.name, dev->name);
- dev_hold(dev);
ip6_tnl_link(ip6n, t);
return 0;
@@ -297,8 +273,8 @@ out:
/**
* ip6_tnl_create - create a new tunnel
+ * @net: network namespace
* @p: tunnel parameters
- * @pt: pointer to new tunnel
*
* Description:
* Create tunnel matching given parameters.
@@ -317,7 +293,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
if (p->name[0]) {
if (!dev_valid_name(p->name))
goto failed;
- strlcpy(name, p->name, IFNAMSIZ);
+ strscpy(name, p->name, IFNAMSIZ);
} else {
sprintf(name, "ip6tnl%%d");
}
@@ -346,6 +322,7 @@ failed:
/**
* ip6_tnl_locate - find or create tunnel matching given parameters
+ * @net: network namespace
* @p: tunnel parameters
* @create: != 0 if allowed to create new tunnel if no match found
*
@@ -404,12 +381,13 @@ ip6_tnl_dev_uninit(struct net_device *dev)
else
ip6_tnl_unlink(ip6n, t);
dst_cache_reset(&t->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
/**
- * parse_tvl_tnl_enc_lim - handle encapsulation limit option
+ * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option
* @skb: received socket buffer
+ * @raw: the ICMPv6 error message data
*
* Return:
* 0 if none was found,
@@ -478,14 +456,9 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
}
EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim);
-/**
- * ip6_tnl_err - tunnel error handler
- *
- * Description:
- * ip6_tnl_err() should handle errors in the tunnel according
- * to the specifications in RFC 2473.
- **/
-
+/* ip6_tnl_err() should handle errors in the tunnel according to the
+ * specifications in RFC 2473.
+ */
static int
ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
u8 *type, u8 *code, int *msg, __u32 *info, int offset)
@@ -718,6 +691,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
+static int
+mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __u32 rel_info = ntohl(info);
+ int err, rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+
+ err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ return err;
+}
+
static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
const struct ipv6hdr *ipv6h,
struct sk_buff *skb)
@@ -740,6 +727,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
return IP6_ECN_decapsulate(ipv6h, skb);
}
+static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ /* ECN is not supported in AF_MPLS */
+ return 0;
+}
+
__u32 ip6_tnl_get_cap(struct ip6_tnl *t,
const struct in6_addr *laddr,
const struct in6_addr *raddr)
@@ -801,7 +796,6 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
struct sk_buff *skb),
bool log_ecn_err)
{
- struct pcpu_sw_netstats *tstats;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
int err;
@@ -840,6 +834,7 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
} else {
skb->dev = tunnel->dev;
+ skb_reset_mac_header(skb);
}
skb_reset_network_header(skb);
@@ -860,11 +855,7 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
}
}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
@@ -886,7 +877,15 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
struct metadata_dst *tun_dst,
bool log_ecn_err)
{
- return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb);
+
+ dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate;
+ if (tpi->proto == htons(ETH_P_IP))
+ dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate;
+
+ return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_err);
}
EXPORT_SYMBOL(ip6_tnl_rcv);
@@ -901,6 +900,11 @@ static const struct tnl_ptk_info tpi_v4 = {
.proto = htons(ETH_P_IP),
};
+static const struct tnl_ptk_info tpi_mpls = {
+ /* no tunnel info required for mplsip6. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+
static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
const struct tnl_ptk_info *tpi,
int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
@@ -958,6 +962,12 @@ static int ip6ip6_rcv(struct sk_buff *skb)
ip6ip6_dscp_ecn_decapsulate);
}
+static int mplsip6_rcv(struct sk_buff *skb)
+{
+ return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls,
+ mplsip6_dscp_ecn_decapsulate);
+}
+
struct ipv6_tel_txoption {
struct ipv6_txoptions ops;
__u8 dst_opt[8];
@@ -1019,14 +1029,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
0, IFA_F_TENTATIVE)))
- pr_warn("%s xmit: Local address not yet configured!\n",
- p->name);
+ pr_warn_ratelimited("%s xmit: Local address not yet configured!\n",
+ p->name);
else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
!ipv6_addr_is_multicast(raddr) &&
unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
true, 0, IFA_F_TENTATIVE)))
- pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
- p->name);
+ pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n",
+ p->name);
else
ret = 1;
rcu_read_unlock();
@@ -1070,10 +1080,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen;
+ __be16 payload_protocol;
bool use_cache = false;
u8 hop_limit;
int err = -1;
+ payload_protocol = skb_protocol(skb, true);
+
if (t->parms.collect_md) {
hop_limit = skb_tunnel_info(skb)->key.ttl;
goto route_lookup;
@@ -1083,7 +1096,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
- if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (payload_protocol == htons(ETH_P_IPV6)) {
struct in6_addr *addr6;
struct neighbour *neigh;
int addr_type;
@@ -1104,6 +1117,14 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
+ } else if (payload_protocol == htons(ETH_P_IP)) {
+ const struct rtable *rt = skb_rtable(skb);
+
+ if (!rt)
+ goto tx_err_link_failure;
+
+ if (rt->rt_gw_family == AF_INET6)
+ memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr));
}
} else if (t->parms.proto != 0 && !(t->parms.flags &
(IP6_TNL_F_USE_ORIG_TCLASS |
@@ -1207,9 +1228,9 @@ route_lookup:
skb_dst_set(skb, dst);
if (hop_limit == 0) {
- if (skb->protocol == htons(ETH_P_IP))
+ if (payload_protocol == htons(ETH_P_IP))
hop_limit = ip_hdr(skb)->ttl;
- else if (skb->protocol == htons(ETH_P_IPV6))
+ else if (payload_protocol == htons(ETH_P_IPV6))
hop_limit = ipv6_hdr(skb)->hop_limit;
else
hop_limit = ip6_dst_hoplimit(dst);
@@ -1253,22 +1274,22 @@ tx_err_dst_release:
EXPORT_SYMBOL(ip6_tnl_xmit);
static inline int
-ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 protocol)
{
struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h;
const struct iphdr *iph;
int encap_limit = -1;
+ __u16 offset;
struct flowi6 fl6;
- __u8 dsfield;
+ __u8 dsfield, orig_dsfield;
__u32 mtu;
u8 tproto;
int err;
- iph = ip_hdr(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
tproto = READ_ONCE(t->parms.proto);
- if (tproto != IPPROTO_IPIP && tproto != 0)
+ if (tproto != protocol && tproto != 0)
return -1;
if (t->parms.collect_md) {
@@ -1281,129 +1302,102 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.flowi6_proto = protocol;
fl6.saddr = key->u.ipv6.src;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
dsfield = key->tos;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield;
+ break;
+ }
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
+ if (protocol == IPPROTO_IPV6) {
+ offset = ip6_tnl_parse_tlv_enc_lim(skb,
+ skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have
+ * reallocated skb->head
+ */
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
-
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv4_get_dsfield(iph);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
- else
- fl6.flowi6_mark = t->parms.fwmark;
- }
-
- fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
- dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph));
-
- if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
- return -1;
-
- skb_set_inner_ipproto(skb, IPPROTO_IPIP);
-
- err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
- IPPROTO_IPIP);
- if (err != 0) {
- /* XXX: send ICMP error even if DF is not set. */
- if (err == -EMSGSIZE)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- return -1;
- }
-
- return 0;
-}
-
-static inline int
-ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct ip6_tnl *t = netdev_priv(dev);
- struct ipv6hdr *ipv6h;
- int encap_limit = -1;
- __u16 offset;
- struct flowi6 fl6;
- __u8 dsfield;
- __u32 mtu;
- u8 tproto;
- int err;
-
- ipv6h = ipv6_hdr(skb);
- tproto = READ_ONCE(t->parms.proto);
- if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
- ip6_tnl_addr_conflict(t, ipv6h))
- return -1;
-
- if (t->parms.collect_md) {
- struct ip_tunnel_info *tun_info;
- const struct ip_tunnel_key *key;
-
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
- return -1;
- key = &tun_info->key;
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
- fl6.saddr = key->u.ipv6.src;
- fl6.daddr = key->u.ipv6.dst;
- fl6.flowlabel = key->label;
- dsfield = key->tos;
- } else {
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
- /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
- ipv6h = ipv6_hdr(skb);
- if (offset > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
-
- tel = (void *)&skb_network_header(skb)[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
- return -1;
+ tel = (void *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_ndo_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
}
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
- encap_limit = t->parms.encap_limit;
}
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.flowi6_proto = protocol;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv6_get_dsfield(ipv6h);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
- fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
else
fl6.flowi6_mark = t->parms.fwmark;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ }
}
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
- dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h));
+ dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield);
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
- skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+ skb_set_inner_ipproto(skb, protocol);
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
- IPPROTO_IPV6);
+ protocol);
if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ break;
+ case IPPROTO_IPV6:
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ break;
+ default:
+ break;
+ }
return -1;
}
@@ -1415,6 +1409,7 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
struct net_device_stats *stats = &t->dev->stats;
+ u8 ipproto;
int ret;
if (!pskb_inet_may_pull(skb))
@@ -1422,15 +1417,21 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
switch (skb->protocol) {
case htons(ETH_P_IP):
- ret = ip4ip6_tnl_xmit(skb, dev);
+ ipproto = IPPROTO_IPIP;
break;
case htons(ETH_P_IPV6):
- ret = ip6ip6_tnl_xmit(skb, dev);
+ if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb)))
+ goto tx_err;
+ ipproto = IPPROTO_IPV6;
+ break;
+ case htons(ETH_P_MPLS_UC):
+ ipproto = IPPROTO_MPLS;
break;
default:
goto tx_err;
}
+ ret = ipxip6_tnl_xmit(skb, dev, ipproto);
if (ret < 0)
goto tx_err;
@@ -1449,10 +1450,10 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct net_device *tdev = NULL;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
- unsigned int mtu;
int t_hlen;
+ int mtu;
- memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
/* Set up flowi template */
@@ -1497,12 +1498,13 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
dev->hard_header_len = tdev->hard_header_len + t_hlen;
mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
- dev->mtu = mtu - t_hlen;
+ mtu = mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+ mtu -= 8;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
}
@@ -1516,7 +1518,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
* ip6_tnl_change() updates the tunnel parameters
**/
-static int
+static void
ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
{
t->parms.laddr = p->laddr;
@@ -1530,29 +1532,25 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
t->parms.fwmark = p->fwmark;
dst_cache_reset(&t->dst_cache);
ip6_tnl_link_config(t);
- return 0;
}
-static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
{
struct net *net = t->net;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
- int err;
ip6_tnl_unlink(ip6n, t);
synchronize_net();
- err = ip6_tnl_change(t, p);
+ ip6_tnl_change(t, p);
ip6_tnl_link(ip6n, t);
netdev_state_change(t->dev);
- return err;
}
-static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
{
/* for default tnl0 device allow to change only the proto */
t->parms.proto = p->proto;
netdev_state_change(t->dev);
- return 0;
}
static void
@@ -1584,9 +1582,10 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
}
/**
- * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
+ * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace
* @dev: virtual device associated with tunnel
- * @ifr: parameters passed from userspace
+ * @ifr: unused
+ * @data: parameters passed from userspace
* @cmd: command to be performed
*
* Description:
@@ -1612,7 +1611,8 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
**/
static int
-ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
{
int err = 0;
struct ip6_tnl_parm p;
@@ -1626,7 +1626,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ip6n->fb_tnl_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
@@ -1638,9 +1638,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
memset(&p, 0, sizeof(p));
}
ip6_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) {
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
- }
break;
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
@@ -1648,7 +1647,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -EINVAL;
if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
@@ -1665,14 +1664,14 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
} else
t = netdev_priv(dev);
if (dev == ip6n->fb_tnl_dev)
- err = ip6_tnl0_update(t, &p1);
+ ip6_tnl0_update(t, &p1);
else
- err = ip6_tnl_update(t, &p1);
+ ip6_tnl_update(t, &p1);
}
if (!IS_ERR(t)) {
err = 0;
ip6_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
} else {
@@ -1686,7 +1685,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (dev == ip6n->fb_tnl_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -ENOENT;
ip6_tnl_parm_from_user(&p1, &p);
@@ -1805,9 +1804,9 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_init = ip6_tnl_dev_init,
.ndo_uninit = ip6_tnl_dev_uninit,
.ndo_start_xmit = ip6_tnl_start_xmit,
- .ndo_do_ioctl = ip6_tnl_ioctl,
+ .ndo_siocdevprivate = ip6_tnl_siocdevprivate,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats = ip6_get_stats,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1828,6 +1827,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6_tnl_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ip6_dev_free;
@@ -1884,6 +1884,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
dev->min_mtu = ETH_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len;
+ netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
return 0;
destroy_dst:
@@ -1927,7 +1928,6 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
t->parms.proto = IPPROTO_IPV6;
- dev_hold(dev);
rcu_assign_pointer(ip6n->tnls_wc[0], t);
return 0;
@@ -1989,39 +1989,6 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
- struct ip_tunnel_encap *ipencap)
-{
- bool ret = false;
-
- memset(ipencap, 0, sizeof(*ipencap));
-
- if (!data)
- return ret;
-
- if (data[IFLA_IPTUN_ENCAP_TYPE]) {
- ret = true;
- ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
- ret = true;
- ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_SPORT]) {
- ret = true;
- ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_DPORT]) {
- ret = true;
- ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
- }
-
- return ret;
-}
-
static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
@@ -2034,7 +2001,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
nt = netdev_priv(dev);
- if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
err = ip6_tnl_encap_setup(nt, &ipencap);
if (err < 0)
return err;
@@ -2071,7 +2038,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
if (dev == ip6n->fb_tnl_dev)
return -EINVAL;
- if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
int err = ip6_tnl_encap_setup(t, &ipencap);
if (err < 0)
@@ -2088,7 +2055,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
} else
t = netdev_priv(dev);
- return ip6_tnl_update(t, &p);
+ ip6_tnl_update(t, &p);
+ return 0;
}
static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head)
@@ -2218,6 +2186,12 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
.priority = 1,
};
+static struct xfrm6_tunnel mplsip6_handler __read_mostly = {
+ .handler = mplsip6_rcv,
+ .err_handler = mplsip6_err,
+ .priority = 1,
+};
+
static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list)
{
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
@@ -2240,6 +2214,16 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head
t = rtnl_dereference(t->next);
}
}
+
+ t = rtnl_dereference(ip6n->tnls_wc[0]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, list);
+ t = rtnl_dereference(t->next);
+ }
}
static int __net_init ip6_tnl_init_net(struct net *net)
@@ -2332,6 +2316,15 @@ static int __init ip6_tunnel_init(void)
pr_err("%s: can't register ip6ip6\n", __func__);
goto out_ip6ip6;
}
+
+ if (ip6_tnl_mpls_supported()) {
+ err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS);
+ if (err < 0) {
+ pr_err("%s: can't register mplsip6\n", __func__);
+ goto out_mplsip6;
+ }
+ }
+
err = rtnl_link_register(&ip6_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -2339,6 +2332,9 @@ static int __init ip6_tunnel_init(void)
return 0;
rtnl_link_failed:
+ if (ip6_tnl_mpls_supported())
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS);
+out_mplsip6:
xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
out_ip6ip6:
xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
@@ -2361,6 +2357,9 @@ static void __exit ip6_tunnel_cleanup(void)
if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
pr_info("%s: can't deregister ip6ip6\n", __func__);
+ if (ip6_tnl_mpls_supported() &&
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS))
+ pr_info("%s: can't deregister mplsip6\n", __func__);
unregister_pernet_device(&ip6_tnl_net_ops);
}
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 58956a6b66a2..cdc4d4ee2420 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -25,17 +25,12 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
goto error;
if (cfg->ipv6_v6only) {
- int val = 1;
-
- err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
- (char *) &val, sizeof(val));
+ err = ip6_sock_set_v6only(sock->sk);
if (err < 0)
goto error;
}
if (cfg->bind_ifindex) {
- err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX,
- (void *)&cfg->bind_ifindex,
- sizeof(cfg->bind_ifindex));
+ err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true);
if (err < 0)
goto error;
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 524006aa0d78..151337d7f67b 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -125,6 +125,7 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote,
/**
* vti6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
* @p: parameters containing tunnel end-points
*
* Description:
@@ -153,7 +154,7 @@ vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t)
{
struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms);
- rcu_assign_pointer(t->next , rtnl_dereference(*tp));
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
}
@@ -192,7 +193,6 @@ static int vti6_tnl_create2(struct net_device *dev)
strcpy(t->parms.name, dev->name);
- dev_hold(dev);
vti6_tnl_link(ip6n, t);
return 0;
@@ -211,7 +211,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p
if (p->name[0]) {
if (!dev_valid_name(p->name))
goto failed;
- strlcpy(name, p->name, IFNAMSIZ);
+ strscpy(name, p->name, IFNAMSIZ);
} else {
sprintf(name, "ip6_vti%%d");
}
@@ -293,10 +293,11 @@ static void vti6_dev_uninit(struct net_device *dev)
RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
else
vti6_tnl_unlink(ip6n, t);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
-static int vti6_rcv(struct sk_buff *skb)
+static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
{
struct ip6_tnl *t;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
@@ -311,7 +312,7 @@ static int vti6_rcv(struct sk_buff *skb)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
- return 0;
+ goto discard;
}
ipv6h = ipv6_hdr(skb);
@@ -323,7 +324,10 @@ static int vti6_rcv(struct sk_buff *skb)
rcu_read_unlock();
- return xfrm6_rcv_tnl(skb, t);
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ return xfrm_input(skb, nexthdr, spi, encap_type);
}
rcu_read_unlock();
return -EINVAL;
@@ -332,11 +336,17 @@ discard:
return 0;
}
+static int vti6_rcv(struct sk_buff *skb)
+{
+ int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff];
+
+ return vti6_input_proto(skb, nexthdr, 0, 0);
+}
+
static int vti6_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
struct net_device *dev;
- struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
const struct xfrm_mode *inner_mode;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
@@ -379,12 +389,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
skb->dev = dev;
-
- tstats = this_cpu_ptr(dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(dev, skb->len);
return 0;
}
@@ -450,25 +455,46 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
int mtu;
if (!dst) {
- fl->u.ip6.flowi6_oif = dev->ifindex;
- fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
- dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
- if (dst->error) {
- dst_release(dst);
- dst = NULL;
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt))
+ goto tx_err_link_failure;
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+ skb_dst_set(skb, dst);
+ break;
+ default:
goto tx_err_link_failure;
}
- skb_dst_set(skb, dst);
}
dst_hold(dst);
- dst = xfrm_lookup(t->net, dst, fl, NULL, 0);
+ dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
goto tx_err_link_failure;
}
+ if (dst->flags & DST_XFRM_QUEUE)
+ goto xmit;
+
x = dst->xfrm;
if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr))
goto tx_err_link_failure;
@@ -494,16 +520,19 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
} else {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
+ goto xmit;
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
}
err = -EMSGSIZE;
goto tx_err_dst_release;
}
+xmit:
skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
@@ -631,7 +660,7 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu)
struct net_device *tdev = NULL;
int mtu;
- memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV |
@@ -742,13 +771,14 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
}
/**
- * vti6_ioctl - configure vti6 tunnels from userspace
+ * vti6_siocdevprivate - configure vti6 tunnels from userspace
* @dev: virtual device associated with tunnel
- * @ifr: parameters passed from userspace
+ * @ifr: unused
+ * @data: parameters passed from userspace
* @cmd: command to be performed
*
* Description:
- * vti6_ioctl() is used for managing vti6 tunnels
+ * vti6_siocdevprivate() is used for managing vti6 tunnels
* from userspace.
*
* The possible commands are the following:
@@ -769,7 +799,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
* %-ENODEV if attempting to change or delete a nonexisting device
**/
static int
-vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd)
{
int err = 0;
struct ip6_tnl_parm2 p;
@@ -778,10 +808,12 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
struct net *net = dev_net(dev);
struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ memset(&p1, 0, sizeof(p1));
+
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ip6n->fb_tnl_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
@@ -793,7 +825,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (!t)
t = netdev_priv(dev);
vti6_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
break;
case SIOCADDTUNNEL:
@@ -802,7 +834,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -EINVAL;
if (p.proto != IPPROTO_IPV6 && p.proto != 0)
@@ -823,7 +855,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (t) {
err = 0;
vti6_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
} else
@@ -836,7 +868,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (dev == ip6n->fb_tnl_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -ENOENT;
vti6_parm_from_user(&p1, &p);
@@ -861,8 +893,8 @@ static const struct net_device_ops vti6_netdev_ops = {
.ndo_init = vti6_dev_init,
.ndo_uninit = vti6_dev_uninit,
.ndo_start_xmit = vti6_tnl_xmit,
- .ndo_do_ioctl = vti6_ioctl,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_siocdevprivate = vti6_siocdevprivate,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -876,6 +908,7 @@ static const struct net_device_ops vti6_netdev_ops = {
static void vti6_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &vti6_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = vti6_dev_free;
@@ -903,6 +936,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev)
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
+ netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
return 0;
}
@@ -934,7 +968,6 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev)
struct vti6_net *ip6n = net_generic(net, vti6_net_id);
t->parms.proto = IPPROTO_IPV6;
- dev_hold(dev);
rcu_assign_pointer(ip6n->tnls_wc[0], t);
return 0;
@@ -1167,6 +1200,7 @@ static struct pernet_operations vti6_net_ops = {
static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
.handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
.cb_handler = vti6_rcv_cb,
.err_handler = vti6_err,
.priority = 100,
@@ -1174,6 +1208,7 @@ static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
.handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
.cb_handler = vti6_rcv_cb,
.err_handler = vti6_err,
.priority = 100,
@@ -1181,11 +1216,39 @@ static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
.handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
.cb_handler = vti6_rcv_cb,
.err_handler = vti6_err,
.priority = 100,
};
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+static int vti6_rcv_tunnel(struct sk_buff *skb)
+{
+ const xfrm_address_t *saddr;
+ __be32 spi;
+
+ saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr;
+ spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr);
+
+ return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0);
+}
+
+static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = {
+ .handler = vti6_rcv_tunnel,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 0,
+};
+
+static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = {
+ .handler = vti6_rcv_tunnel,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 0,
+};
+#endif
+
/**
* vti6_tunnel_init - register protocol and reserve needed resources
*
@@ -1211,6 +1274,15 @@ static int __init vti6_tunnel_init(void)
err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP);
if (err < 0)
goto xfrm_proto_comp_failed;
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ msg = "ipv6 tunnel";
+ err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6);
+ if (err < 0)
+ goto vti_tunnel_ipv6_failed;
+ err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET);
+ if (err < 0)
+ goto vti_tunnel_ip6ip_failed;
+#endif
msg = "netlink interface";
err = rtnl_link_register(&vti6_link_ops);
@@ -1220,6 +1292,12 @@ static int __init vti6_tunnel_init(void)
return 0;
rtnl_link_failed:
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
+vti_tunnel_ip6ip_failed:
+ err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
+vti_tunnel_ipv6_failed:
+#endif
xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
@@ -1238,6 +1316,10 @@ pernet_dev_failed:
static void __exit vti6_tunnel_cleanup(void)
{
rtnl_link_unregister(&vti6_link_ops);
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
+ xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
+#endif
xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index bfa49ff70531..facdc78a43e5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -62,7 +62,12 @@ struct ip6mr_result {
Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
/* Multicast router control variables */
@@ -85,11 +90,13 @@ static void ip6mr_free_table(struct mr_table *mrt);
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc6_cache *cache);
-static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
int cmd);
-static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
+static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
struct netlink_callback *cb);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
@@ -97,7 +104,9 @@ static void ipmr_expire_process(struct timer_list *t);
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
#define ip6mr_for_each_table(mrt, net) \
- list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
+ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
+ lockdep_rtnl_is_held() || \
+ list_empty(&net->ipv6.mr6_tables))
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
struct mr_table *mrt)
@@ -180,10 +189,6 @@ static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
return 1;
}
-static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
- FRA_GENERIC_POLICY,
-};
-
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh, struct nlattr **tb,
struct netlink_ext_ack *extack)
@@ -216,7 +221,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
.compare = ip6mr_rule_compare,
.fill = ip6mr_rule_fill,
.nlgroup = RTNLGRP_IPV6_RULE,
- .policy = ip6mr_rule_policy,
.owner = THIS_MODULE,
};
@@ -246,7 +250,9 @@ static int __net_init ip6mr_rules_init(struct net *net)
return 0;
err2:
+ rtnl_lock();
ip6mr_free_table(mrt);
+ rtnl_unlock();
err1:
fib_rules_unregister(ops);
return err;
@@ -256,13 +262,12 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
list_del(&mrt->list);
ip6mr_free_table(mrt);
}
fib_rules_unregister(net->ipv6.mr6_rules_ops);
- rtnl_unlock();
}
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -319,10 +324,9 @@ static int __net_init ip6mr_rules_init(struct net *net)
static void __net_exit ip6mr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ip6mr_free_table(net->ipv6.mrt6);
net->ipv6.mrt6 = NULL;
- rtnl_unlock();
}
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -401,7 +405,7 @@ static void ip6mr_free_table(struct mr_table *mrt)
*/
static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(mrt_lock)
+ __acquires(RCU)
{
struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -413,14 +417,14 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
iter->mrt = mrt;
- read_lock(&mrt_lock);
+ rcu_read_lock();
return mr_vif_seq_start(seq, pos);
}
static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
- __releases(mrt_lock)
+ __releases(RCU)
{
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
@@ -433,7 +437,11 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
"Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
} else {
const struct vif_device *vif = v;
- const char *name = vif->dev ? vif->dev->name : "none";
+ const struct net_device *vif_dev;
+ const char *name;
+
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
seq_printf(seq,
"%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
@@ -552,14 +560,11 @@ static int pim6_rcv(struct sk_buff *skb)
if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto drop;
- reg_vif_num = mrt->mroute_reg_vif_num;
- read_lock(&mrt_lock);
+ /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */
+ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
if (reg_vif_num >= 0)
- reg_dev = mrt->vif_table[reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
- read_unlock(&mrt_lock);
+ reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);
if (!reg_dev)
goto drop;
@@ -574,7 +579,6 @@ static int pim6_rcv(struct sk_buff *skb)
netif_rx(skb);
- dev_put(reg_dev);
return 0;
drop:
kfree_skb(skb);
@@ -604,11 +608,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto tx_err;
- read_lock(&mrt_lock);
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
- ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
- read_unlock(&mrt_lock);
+ rcu_read_lock();
+ ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ MRT6MSG_WHOLEPKT);
+ rcu_read_unlock();
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -674,10 +679,11 @@ failure:
static int call_ip6mr_vif_entry_notifiers(struct net *net,
enum fib_event_type event_type,
struct vif_device *vif,
+ struct net_device *vif_dev,
mifi_t vif_index, u32 tb_id)
{
return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
- vif, vif_index, tb_id,
+ vif, vif_dev, vif_index, tb_id,
&net->ipv6.ipmr_seq);
}
@@ -702,23 +708,21 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
- if (VIF_EXISTS(mrt, vifi))
- call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
- FIB_EVENT_VIF_DEL, v, vifi,
- mrt->id);
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
-
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
+
+ call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
+ FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ spin_lock(&mrt_lock);
+ RCU_INIT_POINTER(v->dev, NULL);
#ifdef CONFIG_IPV6_PIMSM_V2
- if (vifi == mrt->mroute_reg_vif_num)
- mrt->mroute_reg_vif_num = -1;
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
#endif
if (vifi + 1 == mrt->maxvif) {
@@ -727,16 +731,16 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
if (VIF_EXISTS(mrt, tmp))
break;
}
- mrt->maxvif = tmp + 1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
in6_dev = __in6_dev_get(dev);
if (in6_dev) {
- in6_dev->cnf.mc_forwarding--;
+ atomic_dec(&in6_dev->cnf.mc_forwarding);
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in6_dev->cnf);
@@ -745,7 +749,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
if ((v->flags & MIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
- dev_put(dev);
+ netdev_put(dev, &v->dev_tracker);
return 0;
}
@@ -830,7 +834,7 @@ static void ipmr_expire_process(struct timer_list *t)
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
+/* Fill oifs list. It is called under locked mrt_lock. */
static void ip6mr_update_thresholds(struct mr_table *mrt,
struct mr_mfc *cache,
@@ -904,7 +908,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
in6_dev = __in6_dev_get(dev);
if (in6_dev) {
- in6_dev->cnf.mc_forwarding++;
+ atomic_inc(&in6_dev->cnf.mc_forwarding);
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in6_dev->cnf);
@@ -916,17 +920,18 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
MIFF_REGISTER);
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- v->dev = dev;
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
+ netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
#ifdef CONFIG_IPV6_PIMSM_V2
if (v->flags & MIFF_REGISTER)
- mrt->mroute_reg_vif_num = vifi;
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
#endif
if (vifi + 1 > mrt->maxvif)
- mrt->maxvif = vifi + 1;
- write_unlock_bh(&mrt_lock);
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id);
+ v, dev, vifi, mrt->id);
return 0;
}
@@ -1023,18 +1028,21 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
- } else
+ } else {
+ rcu_read_lock();
ip6_mr_forward(net, mrt, skb->dev, skb, c);
+ rcu_read_unlock();
+ }
}
}
/*
* Bounce a cache query up to pim6sd and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock()
*/
-static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert)
{
struct sock *mroute6_sk;
@@ -1043,7 +1051,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
int ret;
#ifdef CONFIG_IPV6_PIMSM_V2
- if (assert == MRT6MSG_WHOLEPKT)
+ if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE)
skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
+sizeof(*msg));
else
@@ -1059,7 +1067,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
#ifdef CONFIG_IPV6_PIMSM_V2
- if (assert == MRT6MSG_WHOLEPKT) {
+ if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) {
/* Ugly, but we have no choice with this interface.
Duplicate old header, fix length etc.
And all this only to mangle msg->im6_msgtype and
@@ -1071,8 +1079,11 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
skb_reset_transport_header(skb);
msg = (struct mrt6msg *)skb_transport_header(skb);
msg->im6_mbz = 0;
- msg->im6_msgtype = MRT6MSG_WHOLEPKT;
- msg->im6_mif = mrt->mroute_reg_vif_num;
+ msg->im6_msgtype = assert;
+ if (assert == MRT6MSG_WRMIFWHOLE)
+ msg->im6_mif = mifi;
+ else
+ msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num);
msg->im6_pad = 0;
msg->im6_src = ipv6_hdr(pkt)->saddr;
msg->im6_dst = ipv6_hdr(pkt)->daddr;
@@ -1107,10 +1118,8 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- rcu_read_lock();
mroute6_sk = rcu_dereference(mrt->mroute_sk);
if (!mroute6_sk) {
- rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
@@ -1119,7 +1128,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
/* Deliver to user space multicast routing algorithms */
ret = sock_queue_rcv_skb(mroute6_sk, skb);
- rcu_read_unlock();
+
if (ret < 0) {
net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
kfree_skb(skb);
@@ -1243,7 +1252,7 @@ static int ip6mr_device_event(struct notifier_block *this,
ip6mr_for_each_table(mrt, net) {
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
- if (v->dev == dev)
+ if (rcu_access_pointer(v->dev) == dev)
mif6_delete(mrt, ct, 1, NULL);
}
}
@@ -1262,7 +1271,7 @@ static int ip6mr_dump(struct net *net, struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
- ip6mr_mr_table_iter, &mrt_lock, extack);
+ ip6mr_mr_table_iter, extack);
}
static struct notifier_block ip6_mr_notifier = {
@@ -1326,7 +1335,9 @@ static int __net_init ip6mr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip6_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ip6mr_rules_exit(net);
+ rtnl_unlock();
#endif
ip6mr_rules_fail:
ip6mr_notifier_exit(net);
@@ -1339,13 +1350,23 @@ static void __net_exit ip6mr_net_exit(struct net *net)
remove_proc_entry("ip6_mr_cache", net->proc_net);
remove_proc_entry("ip6_mr_vif", net->proc_net);
#endif
- ip6mr_rules_exit(net);
ip6mr_notifier_exit(net);
}
+static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip6mr_rules_exit(net);
+ rtnl_unlock();
+}
+
static struct pernet_operations ip6mr_net_ops = {
.init = ip6mr_net_init,
.exit = ip6mr_net_exit,
+ .exit_batch = ip6mr_net_exit_batch,
};
int __init ip6_mr_init(void)
@@ -1374,7 +1395,7 @@ int __init ip6_mr_init(void)
}
#endif
err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE,
- NULL, ip6mr_rtm_dumproute, 0);
+ ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0);
if (err == 0)
return 0;
@@ -1425,12 +1446,12 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
&mfc->mf6cc_mcastgrp.sin6_addr, parent);
rcu_read_unlock();
if (c) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
c->_c.mfc_parent = mfc->mf6cc_parent;
ip6mr_update_thresholds(mrt, &c->_c, ttls);
if (!mrtsock)
c->_c.mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
c, mrt->id);
mr6_netlink_event(mrt, c, RTM_NEWROUTE);
@@ -1548,15 +1569,15 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
struct net *net = sock_net(sk);
rtnl_lock();
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
if (rtnl_dereference(mrt->mroute_sk)) {
err = -EADDRINUSE;
} else {
rcu_assign_pointer(mrt->mroute_sk, sk);
sock_set_flag(sk, SOCK_RCU_FREE);
- net->ipv6.devconf_all->mc_forwarding++;
+ atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
if (!err)
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1570,25 +1591,30 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
int ip6mr_sk_done(struct sock *sk)
{
- int err = -EACCES;
struct net *net = sock_net(sk);
+ struct ipv6_devconf *devconf;
struct mr_table *mrt;
+ int err = -EACCES;
if (sk->sk_type != SOCK_RAW ||
inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return err;
+ devconf = net->ipv6.devconf_all;
+ if (!devconf || !atomic_read(&devconf->mc_forwarding))
+ return err;
+
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
/* Note that mroute_sk had SOCK_RCU_FREE set,
* so the RCU grace period before sk freeing
* is guaranteed by sk_destruct()
*/
- net->ipv6.devconf_all->mc_forwarding--;
- write_unlock_bh(&mrt_lock);
+ atomic_dec(&devconf->mc_forwarding);
+ spin_unlock(&mrt_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
NETCONFA_IFINDEX_ALL,
@@ -1627,7 +1653,8 @@ EXPORT_SYMBOL(mroute6_is_socket);
* MOSPF/PIM router set up we can clean this up.
*/
-int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int ret, parent = 0;
struct mif6ctl vif;
@@ -1663,7 +1690,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_ADD_MIF:
if (optlen < sizeof(vif))
return -EINVAL;
- if (copy_from_user(&vif, optval, sizeof(vif)))
+ if (copy_from_sockptr(&vif, optval, sizeof(vif)))
return -EFAULT;
if (vif.mif6c_mifi >= MAXMIFS)
return -ENFILE;
@@ -1676,7 +1703,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_DEL_MIF:
if (optlen < sizeof(mifi_t))
return -EINVAL;
- if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
+ if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t)))
return -EFAULT;
rtnl_lock();
ret = mif6_delete(mrt, mifi, 0, NULL);
@@ -1690,12 +1717,12 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
parent = -1;
- /* fall through */
+ fallthrough;
case MRT6_ADD_MFC_PROXY:
case MRT6_DEL_MFC_PROXY:
if (optlen < sizeof(mfc))
return -EINVAL;
- if (copy_from_user(&mfc, optval, sizeof(mfc)))
+ if (copy_from_sockptr(&mfc, optval, sizeof(mfc)))
return -EFAULT;
if (parent == 0)
parent = mfc.mf6cc_parent;
@@ -1716,7 +1743,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
if (optlen != sizeof(flags))
return -EINVAL;
- if (get_user(flags, (int __user *)optval))
+ if (copy_from_sockptr(&flags, optval, sizeof(flags)))
return -EFAULT;
rtnl_lock();
mroute_clean_tables(mrt, flags);
@@ -1733,7 +1760,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
if (optlen != sizeof(v))
return -EINVAL;
- if (get_user(v, (int __user *)optval))
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
return -EFAULT;
mrt->mroute_do_assert = v;
return 0;
@@ -1742,18 +1769,22 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
#ifdef CONFIG_IPV6_PIMSM_V2
case MRT6_PIM:
{
+ bool do_wrmifwhole;
int v;
if (optlen != sizeof(v))
return -EINVAL;
- if (get_user(v, (int __user *)optval))
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
return -EFAULT;
+
+ do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE);
v = !!v;
rtnl_lock();
ret = 0;
if (v != mrt->mroute_do_pim) {
mrt->mroute_do_pim = v;
mrt->mroute_do_assert = v;
+ mrt->mroute_do_wrvifwhole = do_wrmifwhole;
}
rtnl_unlock();
return ret;
@@ -1767,7 +1798,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
if (optlen != sizeof(u32))
return -EINVAL;
- if (get_user(v, (u32 __user *)optval))
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
return -EFAULT;
/* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
if (v != RT_TABLE_DEFAULT && v >= 100000000)
@@ -1799,8 +1830,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
* Getsock opt support for the multicast routing system.
*/
-int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
- int __user *optlen)
+int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
{
int olr;
int val;
@@ -1831,16 +1862,16 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
return -ENOPROTOOPT;
}
- if (get_user(olr, optlen))
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
return -EFAULT;
olr = min_t(int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
- if (put_user(olr, optlen))
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, olr))
+ if (copy_to_sockptr(optval, &val, olr))
return -EFAULT;
return 0;
}
@@ -1869,20 +1900,20 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1944,20 +1975,20 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1999,21 +2030,22 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct
static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, int vifi)
{
- struct ipv6hdr *ipv6h;
struct vif_device *vif = &mrt->vif_table[vifi];
- struct net_device *dev;
+ struct net_device *vif_dev;
+ struct ipv6hdr *ipv6h;
struct dst_entry *dst;
struct flowi6 fl6;
- if (!vif->dev)
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
goto out_free;
#ifdef CONFIG_IPV6_PIMSM_V2
if (vif->flags & MIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out += skb->len;
- vif->dev->stats.tx_bytes += skb->len;
- vif->dev->stats.tx_packets++;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ vif_dev->stats.tx_bytes += skb->len;
+ vif_dev->stats.tx_packets++;
ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
goto out_free;
}
@@ -2046,14 +2078,13 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- dev = vif->dev;
- skb->dev = dev;
- vif->pkt_out++;
- vif->bytes_out += skb->len;
+ skb->dev = vif_dev;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
/* We are about to write */
/* XXX: extension headers? */
- if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
+ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev)))
goto out_free;
ipv6h = ipv6_hdr(skb);
@@ -2062,7 +2093,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
IP6CB(skb)->flags |= IP6SKB_FORWARDED;
return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, dev,
+ net, NULL, skb, skb->dev, vif_dev,
ip6mr_forward2_finish);
out_free:
@@ -2070,17 +2101,20 @@ out_free:
return 0;
}
+/* Called with rcu_read_lock() */
static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
{
int ct;
- for (ct = mrt->maxvif - 1; ct >= 0; ct--) {
- if (mrt->vif_table[ct].dev == dev)
+ /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
+/* Called under rcu_read_lock() */
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc6_cache *c)
@@ -2100,20 +2134,16 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
/* For an (*,G) entry, we only check that the incoming
* interface is part of the static tree.
*/
- rcu_read_lock();
cache_proxy = mr_mfc_find_any_parent(mrt, vif);
if (cache_proxy &&
- cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) {
- rcu_read_unlock();
+ cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
goto forward;
- }
- rcu_read_unlock();
}
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
- if (mrt->vif_table[vif].dev != dev) {
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
c->_c.mfc_un.res.wrong_if++;
if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2129,13 +2159,18 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
MFC_ASSERT_THRESH)) {
c->_c.mfc_un.res.last_assert = jiffies;
ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ip6mr_cache_report(mrt, skb, true_vifi,
+ MRT6MSG_WRMIFWHOLE);
}
goto dont_forward;
}
forward:
- mrt->vif_table[vif].pkt_in++;
- mrt->vif_table[vif].bytes_in += skb->len;
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
/*
* Forward the frame
@@ -2213,7 +2248,6 @@ int ip6_mr_input(struct sk_buff *skb)
return err;
}
- read_lock(&mrt_lock);
cache = ip6mr_cache_find(mrt,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
if (!cache) {
@@ -2234,19 +2268,15 @@ int ip6_mr_input(struct sk_buff *skb)
vif = ip6mr_find_vif(mrt, dev);
if (vif >= 0) {
int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);
- read_unlock(&mrt_lock);
return err;
}
- read_unlock(&mrt_lock);
kfree_skb(skb);
return -ENODEV;
}
ip6_mr_forward(net, mrt, dev, skb, cache);
- read_unlock(&mrt_lock);
-
return 0;
}
@@ -2262,7 +2292,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
if (!mrt)
return -ENOENT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
if (!cache && skb->dev) {
int vif = ip6mr_find_vif(mrt, skb->dev);
@@ -2280,14 +2310,14 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
dev = skb->dev;
if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENODEV;
}
/* really correct? */
skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
if (!skb2) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -2310,13 +2340,13 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
iph->daddr = rt->rt6i_dst.addr;
err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
@@ -2435,7 +2465,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
return len;
}
-static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
struct net *net = read_pnet(&mrt->net);
struct nlmsghdr *nlh;
@@ -2483,6 +2513,95 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS);
}
+static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = {
+ [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [RTA_TABLE] = { .type = NLA_U32 },
+};
+
+static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy,
+ extack);
+ if (err)
+ return err;
+
+ rtm = nlmsg_data(nlh);
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
+ rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid values in header for multicast route get request");
+ return -EINVAL;
+ }
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct in6_addr src = {}, grp = {};
+ struct nlattr *tb[RTA_MAX + 1];
+ struct mfc6_cache *cache;
+ struct mr_table *mrt;
+ struct sk_buff *skb;
+ u32 tableid;
+ int err;
+
+ err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[RTA_SRC])
+ src = nla_get_in6_addr(tb[RTA_SRC]);
+ if (tb[RTA_DST])
+ grp = nla_get_in6_addr(tb[RTA_DST]);
+ tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;
+
+ mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
+ if (!mrt) {
+ NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
+ return -ENOENT;
+ }
+
+ /* entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ cache = ip6mr_cache_find(mrt, &src, &grp);
+ rcu_read_unlock();
+ if (!cache) {
+ NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
+ return -ENOENT;
+ }
+
+ skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+}
+
static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
const struct nlmsghdr *nlh = cb->nlh;
@@ -2501,7 +2620,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
if (!mrt) {
- if (filter.dump_all_families)
+ if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR)
return skb->len;
NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 3752bd3e92ce..72d4858dec18 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -91,6 +91,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
t->props.mode = x->props.mode;
memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
memcpy(&t->mark, &x->mark, sizeof(t->mark));
+ t->if_id = x->if_id;
if (xfrm_init_state(t))
goto error;
@@ -135,7 +136,8 @@ out:
return err;
}
-static int ipcomp6_init_state(struct xfrm_state *x)
+static int ipcomp6_init_state(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
@@ -147,17 +149,20 @@ static int ipcomp6_init_state(struct xfrm_state *x)
x->props.header_len += sizeof(struct ipv6hdr);
break;
default:
+ NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
goto out;
}
- err = ipcomp_init_state(x);
+ err = ipcomp_init_state(x, extack);
if (err)
goto out;
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = ipcomp6_tunnel_attach(x);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
goto out;
+ }
}
err = 0;
@@ -171,18 +176,17 @@ static int ipcomp6_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type ipcomp6_type = {
- .description = "IPCOMP6",
.owner = THIS_MODULE,
.proto = IPPROTO_COMP,
.init_state = ipcomp6_init_state,
.destructor = ipcomp_destroy,
.input = ipcomp_input,
.output = ipcomp_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
static struct xfrm6_protocol ipcomp6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
.cb_handler = ipcomp6_rcv_cb,
.err_handler = ipcomp6_err,
.priority = 0,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index debdaeba5d8c..532f4478c884 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,6 +55,8 @@
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);
+DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
+
int ip6_ra_control(struct sock *sk, int sel)
{
struct ip6_ra_chain *ra, *new_ra, **rap;
@@ -136,8 +138,261 @@ static bool setsockopt_needs_rtnl(int optname)
return false;
}
-static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+ sockptr_t optval, int optlen)
+{
+ if (in_compat_syscall()) {
+ struct compat_group_source_req gr32;
+
+ if (optlen < sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+ greqs->gsr_interface = gr32.gsr_interface;
+ greqs->gsr_group = gr32.gsr_group;
+ greqs->gsr_source = gr32.gsr_source;
+ } else {
+ if (optlen < sizeof(*greqs))
+ return -EINVAL;
+ if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int do_ipv6_mcast_group_source(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct group_source_req greqs;
+ int omode, add;
+ int ret;
+
+ ret = copy_group_source_from_sockptr(&greqs, optval, optlen);
+ if (ret)
+ return ret;
+
+ if (greqs.gsr_group.ss_family != AF_INET6 ||
+ greqs.gsr_source.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct sockaddr_in6 *psin6;
+ int retv;
+
+ psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
+ retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
+ &psin6->sin6_addr,
+ MCAST_INCLUDE);
+ /* prior join w/ different source is ok */
+ if (retv && retv != -EADDRINUSE)
+ return retv;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ return ip6_mc_source(add, omode, sk, &greqs);
+}
+
+static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ struct group_filter *gsf;
+ int ret;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+ if (optlen > READ_ONCE(sysctl_optmem_max))
+ return -ENOBUFS;
+
+ gsf = memdup_sockptr(optval, optlen);
+ if (IS_ERR(gsf))
+ return PTR_ERR(gsf);
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ ret = -ENOBUFS;
+ if (gsf->gf_numsrc >= 0x1ffffffU ||
+ gsf->gf_numsrc > sysctl_mld_max_msf)
+ goto out_free_gsf;
+
+ ret = -EINVAL;
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
+ goto out_free_gsf;
+
+ ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex);
+out_free_gsf:
+ kfree(gsf);
+ return ret;
+}
+
+static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter *gf32;
+ void *p;
+ int ret;
+ int n;
+
+ if (optlen < size0)
+ return -EINVAL;
+ if (optlen > READ_ONCE(sysctl_optmem_max) - 4)
+ return -ENOBUFS;
+
+ p = kmalloc(optlen + 4, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
+ ret = -EFAULT;
+ if (copy_from_sockptr(gf32, optval, optlen))
+ goto out_free_p;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ ret = -ENOBUFS;
+ n = gf32->gf_numsrc;
+ if (n >= 0x1ffffffU || n > sysctl_mld_max_msf)
+ goto out_free_p;
+
+ ret = -EINVAL;
+ if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
+ goto out_free_p;
+
+ ret = ip6_mc_msfilter(sk, &(struct group_filter){
+ .gf_interface = gf32->gf_interface,
+ .gf_group = gf32->gf_group,
+ .gf_fmode = gf32->gf_fmode,
+ .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex);
+
+out_free_p:
+ kfree(p);
+ return ret;
+}
+
+static int ipv6_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct sockaddr_in6 *psin6;
+ struct group_req greq;
+
+ if (optlen < sizeof(greq))
+ return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
+
+ if (greq.gr_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ psin6 = (struct sockaddr_in6 *)&greq.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ return ipv6_sock_mc_join(sk, greq.gr_interface,
+ &psin6->sin6_addr);
+ return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr);
+}
+
+static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct compat_group_req gr32;
+ struct sockaddr_in6 *psin6;
+
+ if (optlen < sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+
+ if (gr32.gr_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ psin6 = (struct sockaddr_in6 *)&gr32.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ return ipv6_sock_mc_join(sk, gr32.gr_interface,
+ &psin6->sin6_addr);
+ return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr);
+}
+
+static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval,
+ int optlen)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_opt_hdr *new = NULL;
+ struct net *net = sock_net(sk);
+ struct ipv6_txoptions *opt;
+ int err;
+
+ /* hop-by-hop / destination options are privileged option */
+ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+
+ /* remove any sticky options header with a zero option
+ * length, per RFC3542.
+ */
+ if (optlen > 0) {
+ if (sockptr_is_null(optval))
+ return -EINVAL;
+ if (optlen < sizeof(struct ipv6_opt_hdr) ||
+ optlen & 0x7 ||
+ optlen > 8 * 255)
+ return -EINVAL;
+
+ new = memdup_sockptr(optval, optlen);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ if (unlikely(ipv6_optlen(new) > optlen)) {
+ kfree(new);
+ return -EINVAL;
+ }
+ }
+
+ opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
+ opt = ipv6_renew_options(sk, opt, optname, new);
+ kfree(new);
+ if (IS_ERR(opt))
+ return PTR_ERR(opt);
+
+ /* routing header option needs extra check */
+ err = -EINVAL;
+ if (optname == IPV6_RTHDR && opt && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = opt->srcrt;
+ switch (rthdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (rthdr->hdrlen != 2 || rthdr->segments_left != 1)
+ goto sticky_done;
+ break;
+#endif
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh =
+ (struct ipv6_sr_hdr *)opt->srcrt;
+
+ if (!seg6_validate_srh(srh, optlen, false))
+ goto sticky_done;
+ break;
+ }
+ default:
+ goto sticky_done;
+ }
+ }
+
+ err = 0;
+ opt = ipv6_update_options(sk, opt);
+sticky_done:
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
+ return err;
+}
+
+int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
@@ -145,11 +400,11 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
int retv = -ENOPROTOOPT;
bool needs_rtnl = setsockopt_needs_rtnl(optname);
- if (!optval)
+ if (sockptr_is_null(optval))
val = 0;
else {
if (optlen >= sizeof(int)) {
- if (get_user(val, (int __user *) optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
} else
val = 0;
@@ -162,7 +417,13 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (needs_rtnl)
rtnl_lock();
- lock_sock(sk);
+ sockopt_lock_sock(sk);
+
+ /* Another thread has converted the socket into IPv4 with
+ * IPV6_ADDRFORM concurrently.
+ */
+ if (unlikely(sk->sk_family != AF_INET6))
+ goto unlock;
switch (optname) {
@@ -170,9 +431,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (optlen < sizeof(int))
goto e_inval;
if (val == PF_INET) {
- struct ipv6_txoptions *opt;
- struct sk_buff *pktopt;
-
if (sk->sk_type == SOCK_RAW)
break;
@@ -188,10 +446,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = -EBUSY;
break;
}
- break;
} else {
break;
}
+
if (sk->sk_state != TCP_ESTABLISHED) {
retv = -ENOTCONN;
break;
@@ -203,8 +461,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
}
- fl6_free_socklist(sk);
__ipv6_sock_mc_close(sk);
+ __ipv6_sock_ac_close(sk);
/*
* Sock is moving from IPv6 to IPv4 (sk_prot), so
@@ -215,12 +473,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (sk->sk_protocol == IPPROTO_TCP) {
struct inet_connection_sock *icsk = inet_csk(sk);
- local_bh_disable();
+
sock_prot_inuse_add(net, sk->sk_prot, -1);
sock_prot_inuse_add(net, &tcp_prot, 1);
- local_bh_enable();
- sk->sk_prot = &tcp_prot;
- icsk->icsk_af_ops = &ipv4_specific;
+
+ /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */
+ WRITE_ONCE(sk->sk_prot, &tcp_prot);
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific);
sk->sk_socket->ops = &inet_stream_ops;
sk->sk_family = PF_INET;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -229,22 +489,23 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (sk->sk_protocol == IPPROTO_UDPLITE)
prot = &udplite_prot;
- local_bh_disable();
+
sock_prot_inuse_add(net, sk->sk_prot, -1);
sock_prot_inuse_add(net, prot, 1);
- local_bh_enable();
- sk->sk_prot = prot;
+
+ /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */
+ WRITE_ONCE(sk->sk_prot, prot);
sk->sk_socket->ops = &inet_dgram_ops;
sk->sk_family = PF_INET;
}
- opt = xchg((__force struct ipv6_txoptions **)&np->opt,
- NULL);
- if (opt) {
- atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
- txopt_put(opt);
- }
- pktopt = xchg(&np->pktoptions, NULL);
- kfree_skb(pktopt);
+
+ /* Disable all options not to allocate memory anymore,
+ * but there is still a race. See the lockless path
+ * in udpv6_sendmsg() and ipv6_local_rxpmtu().
+ */
+ np->rxopt.all = 0;
+
+ inet6_cleanup_sock(sk);
/*
* ... and add it to the refcnt debug socks count
@@ -343,7 +604,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
/* RFC 3542, 6.5: default traffic class of 0x0 */
if (val == -1)
val = 0;
- np->tclass = val;
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= np->tclass & INET_ECN_MASK;
+ }
+ if (np->tclass != val) {
+ np->tclass = val;
+ sk_dst_reset(sk);
+ }
retv = 0;
break;
@@ -369,8 +637,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_TRANSPARENT:
- if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) {
retv = -EPERM;
break;
}
@@ -400,82 +668,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
- {
- struct ipv6_txoptions *opt;
- struct ipv6_opt_hdr *new = NULL;
-
- /* hop-by-hop / destination options are privileged option */
- retv = -EPERM;
- if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
- break;
-
- /* remove any sticky options header with a zero option
- * length, per RFC3542.
- */
- if (optlen == 0)
- optval = NULL;
- else if (!optval)
- goto e_inval;
- else if (optlen < sizeof(struct ipv6_opt_hdr) ||
- optlen & 0x7 || optlen > 8 * 255)
- goto e_inval;
- else {
- new = memdup_user(optval, optlen);
- if (IS_ERR(new)) {
- retv = PTR_ERR(new);
- break;
- }
- if (unlikely(ipv6_optlen(new) > optlen)) {
- kfree(new);
- goto e_inval;
- }
- }
-
- opt = rcu_dereference_protected(np->opt,
- lockdep_sock_is_held(sk));
- opt = ipv6_renew_options(sk, opt, optname, new);
- kfree(new);
- if (IS_ERR(opt)) {
- retv = PTR_ERR(opt);
- break;
- }
-
- /* routing header option needs extra check */
- retv = -EINVAL;
- if (optname == IPV6_RTHDR && opt && opt->srcrt) {
- struct ipv6_rt_hdr *rthdr = opt->srcrt;
- switch (rthdr->type) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case IPV6_SRCRT_TYPE_2:
- if (rthdr->hdrlen != 2 ||
- rthdr->segments_left != 1)
- goto sticky_done;
-
- break;
-#endif
- case IPV6_SRCRT_TYPE_4:
- {
- struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)
- opt->srcrt;
-
- if (!seg6_validate_srh(srh, optlen))
- goto sticky_done;
- break;
- }
- default:
- goto sticky_done;
- }
- }
-
- retv = 0;
- opt = ipv6_update_options(sk, opt);
-sticky_done:
- if (opt) {
- atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
- txopt_put(opt);
- }
+ retv = ipv6_set_opt_hdr(sk, optname, optval, optlen);
break;
- }
case IPV6_PKTINFO:
{
@@ -483,12 +677,13 @@ sticky_done:
if (optlen == 0)
goto e_inval;
- else if (optlen < sizeof(struct in6_pktinfo) || !optval)
+ else if (optlen < sizeof(struct in6_pktinfo) ||
+ sockptr_is_null(optval))
goto e_inval;
- if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) {
- retv = -EFAULT;
- break;
+ if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) {
+ retv = -EFAULT;
+ break;
}
if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
@@ -529,7 +724,7 @@ sticky_done:
refcount_set(&opt->refcnt, 1);
opt->tot_len = sizeof(*opt) + optlen;
retv = -EFAULT;
- if (copy_from_user(opt+1, optval, optlen))
+ if (copy_from_sockptr(opt + 1, optval, optlen))
goto done;
msg.msg_controllen = optlen;
@@ -651,7 +846,7 @@ done:
break;
retv = -EFAULT;
- if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq)))
break;
if (optname == IPV6_ADD_MEMBERSHIP)
@@ -669,7 +864,7 @@ done:
goto e_inval;
retv = -EFAULT;
- if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq)))
break;
if (optname == IPV6_JOIN_ANYCAST)
@@ -687,105 +882,26 @@ done:
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
- {
- struct group_req greq;
- struct sockaddr_in6 *psin6;
-
- if (optlen < sizeof(struct group_req))
- goto e_inval;
-
- retv = -EFAULT;
- if (copy_from_user(&greq, optval, sizeof(struct group_req)))
- break;
- if (greq.gr_group.ss_family != AF_INET6) {
- retv = -EADDRNOTAVAIL;
- break;
- }
- psin6 = (struct sockaddr_in6 *)&greq.gr_group;
- if (optname == MCAST_JOIN_GROUP)
- retv = ipv6_sock_mc_join(sk, greq.gr_interface,
- &psin6->sin6_addr);
+ if (in_compat_syscall())
+ retv = compat_ipv6_mcast_join_leave(sk, optname, optval,
+ optlen);
else
- retv = ipv6_sock_mc_drop(sk, greq.gr_interface,
- &psin6->sin6_addr);
+ retv = ipv6_mcast_join_leave(sk, optname, optval,
+ optlen);
break;
- }
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
- {
- struct group_source_req greqs;
- int omode, add;
-
- if (optlen < sizeof(struct group_source_req))
- goto e_inval;
- if (copy_from_user(&greqs, optval, sizeof(greqs))) {
- retv = -EFAULT;
- break;
- }
- if (greqs.gsr_group.ss_family != AF_INET6 ||
- greqs.gsr_source.ss_family != AF_INET6) {
- retv = -EADDRNOTAVAIL;
- break;
- }
- if (optname == MCAST_BLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 1;
- } else if (optname == MCAST_UNBLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 0;
- } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
- struct sockaddr_in6 *psin6;
-
- psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
- retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
- &psin6->sin6_addr,
- MCAST_INCLUDE);
- /* prior join w/ different source is ok */
- if (retv && retv != -EADDRINUSE)
- break;
- omode = MCAST_INCLUDE;
- add = 1;
- } else /* MCAST_LEAVE_SOURCE_GROUP */ {
- omode = MCAST_INCLUDE;
- add = 0;
- }
- retv = ip6_mc_source(add, omode, sk, &greqs);
+ retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen);
break;
- }
case MCAST_MSFILTER:
- {
- struct group_filter *gsf;
-
- if (optlen < GROUP_FILTER_SIZE(0))
- goto e_inval;
- if (optlen > sysctl_optmem_max) {
- retv = -ENOBUFS;
- break;
- }
- gsf = memdup_user(optval, optlen);
- if (IS_ERR(gsf)) {
- retv = PTR_ERR(gsf);
- break;
- }
- /* numsrc >= (4G-140)/128 overflow in 32 bits */
- if (gsf->gf_numsrc >= 0x1ffffffU ||
- gsf->gf_numsrc > sysctl_mld_max_msf) {
- kfree(gsf);
- retv = -ENOBUFS;
- break;
- }
- if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
- kfree(gsf);
- retv = -EINVAL;
- break;
- }
- retv = ip6_mc_msfilter(sk, gsf);
- kfree(gsf);
-
+ if (in_compat_syscall())
+ retv = compat_ipv6_set_mcast_msfilter(sk, optval,
+ optlen);
+ else
+ retv = ipv6_set_mcast_msfilter(sk, optval, optlen);
break;
- }
case IPV6_ROUTER_ALERT:
if (optlen < sizeof(int))
goto e_inval;
@@ -833,79 +949,29 @@ done:
case IPV6_IPSEC_POLICY:
case IPV6_XFRM_POLICY:
retv = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
case IPV6_ADDR_PREFERENCES:
- {
- unsigned int pref = 0;
- unsigned int prefmask = ~0;
-
if (optlen < sizeof(int))
goto e_inval;
-
- retv = -EINVAL;
-
- /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
- switch (val & (IPV6_PREFER_SRC_PUBLIC|
- IPV6_PREFER_SRC_TMP|
- IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
- case IPV6_PREFER_SRC_PUBLIC:
- pref |= IPV6_PREFER_SRC_PUBLIC;
- break;
- case IPV6_PREFER_SRC_TMP:
- pref |= IPV6_PREFER_SRC_TMP;
- break;
- case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
- break;
- case 0:
- goto pref_skip_pubtmp;
- default:
- goto e_inval;
- }
-
- prefmask &= ~(IPV6_PREFER_SRC_PUBLIC|
- IPV6_PREFER_SRC_TMP);
-pref_skip_pubtmp:
-
- /* check HOME/COA conflicts */
- switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) {
- case IPV6_PREFER_SRC_HOME:
- break;
- case IPV6_PREFER_SRC_COA:
- pref |= IPV6_PREFER_SRC_COA;
- case 0:
- goto pref_skip_coa;
- default:
- goto e_inval;
- }
-
- prefmask &= ~IPV6_PREFER_SRC_COA;
-pref_skip_coa:
-
- /* check CGA/NONCGA conflicts */
- switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
- case IPV6_PREFER_SRC_CGA:
- case IPV6_PREFER_SRC_NONCGA:
- case 0:
- break;
- default:
- goto e_inval;
- }
-
- np->srcprefs = (np->srcprefs & prefmask) | pref;
- retv = 0;
-
+ retv = __ip6_sock_set_addr_preferences(sk, val);
break;
- }
case IPV6_MINHOPCOUNT:
if (optlen < sizeof(int))
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
- np->min_hopcount = val;
+
+ if (val)
+ static_branch_enable(&ip6_min_hopcount);
+
+ /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
+ * while we are changing it.
+ */
+ WRITE_ONCE(np->min_hopcount, val);
retv = 0;
break;
case IPV6_DONTFRAG:
@@ -921,23 +987,32 @@ pref_skip_coa:
np->rxopt.bits.recvfragsize = valbool;
retv = 0;
break;
+ case IPV6_RECVERR_RFC4884:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val < 0 || val > 1)
+ goto e_inval;
+ np->recverr_rfc4884 = valbool;
+ retv = 0;
+ break;
}
- release_sock(sk);
+unlock:
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return retv;
e_inval:
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return -EINVAL;
}
-int ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int err;
@@ -958,41 +1033,8 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
}
EXPORT_SYMBOL(ipv6_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- int err;
-
- if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
- if (udp_prot.compat_setsockopt != NULL)
- return udp_prot.compat_setsockopt(sk, level, optname,
- optval, optlen);
- return udp_prot.setsockopt(sk, level, optname, optval, optlen);
- }
-
- if (level != SOL_IPV6)
- return -ENOPROTOOPT;
-
- if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
- return compat_mc_setsockopt(sk, level, optname, optval, optlen,
- ipv6_setsockopt);
-
- err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
- optname != IPV6_XFRM_POLICY)
- err = compat_nf_setsockopt(sk, PF_INET6, optname, optval,
- optlen);
-#endif
- return err;
-}
-EXPORT_SYMBOL(compat_ipv6_setsockopt);
-#endif
-
static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
- int optname, char __user *optval, int len)
+ int optname, sockptr_t optval, int len)
{
struct ipv6_opt_hdr *hdr;
@@ -1020,13 +1062,81 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
return 0;
len = min_t(unsigned int, len, ipv6_optlen(hdr));
- if (copy_to_user(optval, hdr, len))
+ if (copy_to_sockptr(optval, hdr, len))
return -EFAULT;
return len;
}
-static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen, unsigned int flags)
+static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct group_filter, gf_slist_flex);
+ struct group_filter gsf;
+ int num;
+ int err;
+
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gsf, optval, size0))
+ return -EFAULT;
+ if (gsf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ num = gsf.gf_numsrc;
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gsf, optval, size0);
+ if (!err) {
+ if (num > gsf.gf_numsrc)
+ num = gsf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
+ err = -EFAULT;
+ }
+ sockopt_release_sock(sk);
+ return err;
+}
+
+static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter gf32;
+ struct group_filter gf;
+ int err;
+ int num;
+
+ if (len < size0)
+ return -EINVAL;
+
+ if (copy_from_sockptr(&gf32, optval, size0))
+ return -EFAULT;
+ gf.gf_interface = gf32.gf_interface;
+ gf.gf_fmode = gf32.gf_fmode;
+ num = gf.gf_numsrc = gf32.gf_numsrc;
+ gf.gf_group = gf32.gf_group;
+
+ if (gf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gf, optval, size0);
+ sockopt_release_sock(sk);
+ if (err)
+ return err;
+ if (num > gf.gf_numsrc)
+ num = gf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32));
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf32.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf32.gf_numsrc)))
+ return -EFAULT;
+ return 0;
+}
+
+int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
int len;
@@ -1035,7 +1145,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (ip6_mroute_opt(optname))
return ip6_mroute_getsockopt(sk, optname, optval, optlen);
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
switch (optname) {
case IPV6_ADDRFORM:
@@ -1048,23 +1158,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = sk->sk_family;
break;
case MCAST_MSFILTER:
- {
- struct group_filter gsf;
- int err;
-
- if (len < GROUP_FILTER_SIZE(0))
- return -EINVAL;
- if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))
- return -EFAULT;
- if (gsf.gf_group.ss_family != AF_INET6)
- return -EADDRNOTAVAIL;
- lock_sock(sk);
- err = ip6_mc_msfget(sk, &gsf,
- (struct group_filter __user *)optval, optlen);
- release_sock(sk);
- return err;
- }
-
+ if (in_compat_syscall())
+ return compat_ipv6_get_msfilter(sk, optval, optlen, len);
+ return ipv6_get_msfilter(sk, optval, optlen, len);
case IPV6_2292PKTOPTIONS:
{
struct msghdr msg;
@@ -1073,15 +1169,21 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control = optval;
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
+ }
msg.msg_controllen = len;
- msg.msg_flags = flags;
+ msg.msg_flags = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
skb = np->pktoptions;
if (skb)
ip6_datagram_recv_ctl(sk, &msg, skb);
- release_sock(sk);
+ sockopt_release_sock(sk);
if (!skb) {
if (np->rxopt.bits.rxinfo) {
struct in6_pktinfo src_info;
@@ -1118,7 +1220,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
}
}
len -= msg.msg_controllen;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_MTU:
{
@@ -1170,15 +1272,15 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
{
struct ipv6_txoptions *opt;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
opt = rcu_dereference_protected(np->opt,
lockdep_sock_is_held(sk));
len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
- release_sock(sk);
+ sockopt_release_sock(sk);
/* check if ipv6_getsockopt_sticky() returns err code */
if (len < 0)
return len;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_RECVHOPOPTS:
@@ -1232,9 +1334,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (!mtuinfo.ip6m_mtu)
return -ENOTCONN;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &mtuinfo, len))
+ if (copy_to_sockptr(optval, &mtuinfo, len))
return -EFAULT;
return 0;
@@ -1311,7 +1413,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (len < sizeof(freq))
return -EINVAL;
- if (copy_from_user(&freq, optval, sizeof(freq)))
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
return -EFAULT;
if (freq.flr_action != IPV6_FL_A_GET)
@@ -1326,9 +1428,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (val < 0)
return val;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &freq, len))
+ if (copy_to_sockptr(optval, &freq, len))
return -EFAULT;
return 0;
@@ -1372,13 +1474,17 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = np->rtalert_isolate;
break;
+ case IPV6_RECVERR_RFC4884:
+ val = np->recverr_rfc4884;
+ break;
+
default:
return -ENOPROTOOPT;
}
len = min_t(unsigned int, sizeof(int), len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -1394,7 +1500,8 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
if (level != SOL_IPV6)
return -ENOPROTOOPT;
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0);
+ err = do_ipv6_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
@@ -1411,43 +1518,3 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
return err;
}
EXPORT_SYMBOL(ipv6_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- int err;
-
- if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
- if (udp_prot.compat_getsockopt != NULL)
- return udp_prot.compat_getsockopt(sk, level, optname,
- optval, optlen);
- return udp_prot.getsockopt(sk, level, optname, optval, optlen);
- }
-
- if (level != SOL_IPV6)
- return -ENOPROTOOPT;
-
- if (optname == MCAST_MSFILTER)
- return compat_mc_getsockopt(sk, level, optname, optval, optlen,
- ipv6_getsockopt);
-
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen,
- MSG_CMSG_COMPAT);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
- int len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- err = compat_nf_getsockopt(sk, PF_INET6, optname, optval, &len);
- if (err >= 0)
- err = put_user(len, optlen);
- }
-#endif
- return err;
-}
-EXPORT_SYMBOL(compat_ipv6_getsockopt);
-#endif
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index eaa4c2cc2fbb..7860383295d8 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -29,7 +29,6 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/jiffies.h>
-#include <linux/times.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/in6.h>
@@ -42,6 +41,7 @@
#include <linux/slab.h>
#include <linux/pkt_sched.h>
#include <net/mld.h>
+#include <linux/workqueue.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
@@ -67,18 +67,14 @@ static int __mld2_query_bugs[] __attribute__((__unused__)) = {
BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4)
};
+static struct workqueue_struct *mld_wq;
static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
static void igmp6_join_group(struct ifmcaddr6 *ma);
static void igmp6_leave_group(struct ifmcaddr6 *ma);
-static void igmp6_timer_handler(struct timer_list *t);
+static void mld_mca_work(struct work_struct *work);
-static void mld_gq_timer_expire(struct timer_list *t);
-static void mld_ifc_timer_expire(struct timer_list *t);
static void mld_ifc_event(struct inet6_dev *idev);
-static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_clear_delrec(struct inet6_dev *idev);
static bool mld_in_v1_mode(const struct inet6_dev *idev);
static int sf_setstate(struct ifmcaddr6 *pmc);
static void sf_markstate(struct ifmcaddr6 *pmc);
@@ -112,12 +108,52 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT;
/*
* socket join on multicast group
*/
+#define mc_dereference(e, idev) \
+ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock))
+
+#define sock_dereference(e, sk) \
+ rcu_dereference_protected(e, lockdep_sock_is_held(sk))
+
+#define for_each_pmc_socklock(np, sk, pmc) \
+ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \
+ pmc; \
+ pmc = sock_dereference(pmc->next, sk))
#define for_each_pmc_rcu(np, pmc) \
- for (pmc = rcu_dereference(np->ipv6_mc_list); \
- pmc != NULL; \
+ for (pmc = rcu_dereference((np)->ipv6_mc_list); \
+ pmc; \
pmc = rcu_dereference(pmc->next))
+#define for_each_psf_mclock(mc, psf) \
+ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \
+ psf; \
+ psf = mc_dereference(psf->sf_next, mc->idev))
+
+#define for_each_psf_rcu(mc, psf) \
+ for (psf = rcu_dereference((mc)->mca_sources); \
+ psf; \
+ psf = rcu_dereference(psf->sf_next))
+
+#define for_each_psf_tomb(mc, psf) \
+ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \
+ psf; \
+ psf = mc_dereference(psf->sf_next, mc->idev))
+
+#define for_each_mc_mclock(idev, mc) \
+ for (mc = mc_dereference((idev)->mc_list, idev); \
+ mc; \
+ mc = mc_dereference(mc->next, idev))
+
+#define for_each_mc_rcu(idev, mc) \
+ for (mc = rcu_dereference((idev)->mc_list); \
+ mc; \
+ mc = rcu_dereference(mc->next))
+
+#define for_each_mc_tomb(idev, mc) \
+ for (mc = mc_dereference((idev)->mc_tomb, idev); \
+ mc; \
+ mc = mc_dereference(mc->next, idev))
+
static int unsolicited_report_interval(struct inet6_dev *idev)
{
int iv;
@@ -144,15 +180,11 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
if (!ipv6_addr_is_multicast(addr))
return -EINVAL;
- rcu_read_lock();
- for_each_pmc_rcu(np, mc_lst) {
+ for_each_pmc_socklock(np, sk, mc_lst) {
if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
- ipv6_addr_equal(&mc_lst->addr, addr)) {
- rcu_read_unlock();
+ ipv6_addr_equal(&mc_lst->addr, addr))
return -EADDRINUSE;
- }
}
- rcu_read_unlock();
mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
@@ -179,8 +211,7 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
mc_lst->ifindex = dev->ifindex;
mc_lst->sfmode = mode;
- rwlock_init(&mc_lst->sflock);
- mc_lst->sflist = NULL;
+ RCU_INIT_POINTER(mc_lst->sflist, NULL);
/*
* now add/increase the group membership on the device
@@ -227,7 +258,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
return -EINVAL;
for (lnk = &np->ipv6_mc_list;
- (mc_lst = rtnl_dereference(*lnk)) != NULL;
+ (mc_lst = sock_dereference(*lnk, sk)) != NULL;
lnk = &mc_lst->next) {
if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
ipv6_addr_equal(&mc_lst->addr, addr)) {
@@ -239,11 +270,12 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
if (dev) {
struct inet6_dev *idev = __in6_dev_get(dev);
- (void) ip6_mc_leave_src(sk, mc_lst, idev);
+ ip6_mc_leave_src(sk, mc_lst, idev);
if (idev)
__ipv6_dev_mc_dec(idev, &mc_lst->addr);
- } else
- (void) ip6_mc_leave_src(sk, mc_lst, NULL);
+ } else {
+ ip6_mc_leave_src(sk, mc_lst, NULL);
+ }
atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
kfree_rcu(mc_lst, rcu);
@@ -255,10 +287,9 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
}
EXPORT_SYMBOL(ipv6_sock_mc_drop);
-/* called with rcu_read_lock() */
-static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
- const struct in6_addr *group,
- int ifindex)
+static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
{
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
@@ -270,19 +301,17 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
dev = rt->dst.dev;
ip6_rt_put(rt);
}
- } else
- dev = dev_get_by_index_rcu(net, ifindex);
+ } else {
+ dev = __dev_get_by_index(net, ifindex);
+ }
if (!dev)
return NULL;
idev = __in6_dev_get(dev);
if (!idev)
return NULL;
- read_lock_bh(&idev->lock);
- if (idev->dead) {
- read_unlock_bh(&idev->lock);
+ if (idev->dead)
return NULL;
- }
return idev;
}
@@ -294,7 +323,7 @@ void __ipv6_sock_mc_close(struct sock *sk)
ASSERT_RTNL();
- while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
+ while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) {
struct net_device *dev;
np->ipv6_mc_list = mc_lst->next;
@@ -303,11 +332,12 @@ void __ipv6_sock_mc_close(struct sock *sk)
if (dev) {
struct inet6_dev *idev = __in6_dev_get(dev);
- (void) ip6_mc_leave_src(sk, mc_lst, idev);
+ ip6_mc_leave_src(sk, mc_lst, idev);
if (idev)
__ipv6_dev_mc_dec(idev, &mc_lst->addr);
- } else
- (void) ip6_mc_leave_src(sk, mc_lst, NULL);
+ } else {
+ ip6_mc_leave_src(sk, mc_lst, NULL);
+ }
atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
kfree_rcu(mc_lst, rcu);
@@ -320,8 +350,11 @@ void ipv6_sock_mc_close(struct sock *sk)
if (!rcu_access_pointer(np->ipv6_mc_list))
return;
+
rtnl_lock();
+ lock_sock(sk);
__ipv6_sock_mc_close(sk);
+ release_sock(sk);
rtnl_unlock();
}
@@ -336,7 +369,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
struct net *net = sock_net(sk);
int i, j, rv;
int leavegroup = 0;
- int pmclocked = 0;
int err;
source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr;
@@ -345,16 +377,14 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- rcu_read_lock();
- idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface);
- if (!idev) {
- rcu_read_unlock();
+ idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface);
+ if (!idev)
return -ENODEV;
- }
err = -EADDRNOTAVAIL;
- for_each_pmc_rcu(inet6, pmc) {
+ mutex_lock(&idev->mc_lock);
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
@@ -365,7 +395,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
goto done;
}
/* if a source filter was set, must be the same mode as before */
- if (pmc->sflist) {
+ if (rcu_access_pointer(pmc->sflist)) {
if (pmc->sfmode != omode) {
err = -EINVAL;
goto done;
@@ -377,10 +407,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
pmc->sfmode = omode;
}
- write_lock(&pmc->sflock);
- pmclocked = 1;
-
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
@@ -420,7 +447,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (psl)
count += psl->sl_max;
- newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
@@ -430,9 +458,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (psl) {
for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
- sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
}
- pmc->sflist = psl = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ kfree_rcu(psl, rcu);
+ psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
for (i = 0; i < psl->sl_count; i++) {
@@ -448,16 +479,14 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
/* update the interface list */
ip6_mc_add_src(idev, group, omode, 1, source, 1);
done:
- if (pmclocked)
- write_unlock(&pmc->sflock);
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
+ mutex_unlock(&idev->mc_lock);
if (leavegroup)
err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
return err;
}
-int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
+int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
+ struct sockaddr_storage *list)
{
const struct in6_addr *group;
struct ipv6_mc_socklist *pmc;
@@ -476,13 +505,9 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
gsf->gf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- rcu_read_lock();
- idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
-
- if (!idev) {
- rcu_read_unlock();
+ idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface);
+ if (!idev)
return -ENODEV;
- }
err = 0;
@@ -491,7 +516,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
goto done;
}
- for_each_pmc_rcu(inet6, pmc) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pmc->ifindex != gsf->gf_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
@@ -502,103 +527,91 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
goto done;
}
if (gsf->gf_numsrc) {
- newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc),
- GFP_ATOMIC);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+ gsf->gf_numsrc),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc;
- for (i = 0; i < newpsl->sl_count; ++i) {
+ for (i = 0; i < newpsl->sl_count; ++i, ++list) {
struct sockaddr_in6 *psin6;
- psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i];
+ psin6 = (struct sockaddr_in6 *)list;
newpsl->sl_addr[i] = psin6->sin6_addr;
}
+ mutex_lock(&idev->mc_lock);
err = ip6_mc_add_src(idev, group, gsf->gf_fmode,
- newpsl->sl_count, newpsl->sl_addr, 0);
+ newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
- sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
+ mutex_unlock(&idev->mc_lock);
+ sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr,
+ newpsl->sl_max));
goto done;
}
+ mutex_unlock(&idev->mc_lock);
} else {
newpsl = NULL;
- (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+ mutex_lock(&idev->mc_lock);
+ ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+ mutex_unlock(&idev->mc_lock);
}
- write_lock(&pmc->sflock);
- psl = pmc->sflist;
+ mutex_lock(&idev->mc_lock);
+ psl = sock_dereference(pmc->sflist, sk);
if (psl) {
- (void) ip6_mc_del_src(idev, group, pmc->sfmode,
- psl->sl_count, psl->sl_addr, 0);
- sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
- } else
- (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
- pmc->sflist = newpsl;
+ ip6_mc_del_src(idev, group, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ } else {
+ ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+ }
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ mutex_unlock(&idev->mc_lock);
+ kfree_rcu(psl, rcu);
pmc->sfmode = gsf->gf_fmode;
- write_unlock(&pmc->sflock);
err = 0;
done:
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
if (leavegroup)
err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
return err;
}
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen)
+ sockptr_t optval, size_t ss_offset)
{
- int err, i, count, copycount;
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
const struct in6_addr *group;
struct ipv6_mc_socklist *pmc;
- struct inet6_dev *idev;
- struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
- struct net *net = sock_net(sk);
+ int i, count, copycount;
group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- rcu_read_lock();
- idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
-
- if (!idev) {
- rcu_read_unlock();
- return -ENODEV;
- }
-
- err = -EADDRNOTAVAIL;
/* changes to the ipv6_mc_list require the socket lock and
- * rtnl lock. We have the socket lock and rcu read lock,
- * so reading the list is safe.
+ * rtnl lock. We have the socket lock, so reading the list is safe.
*/
- for_each_pmc_rcu(inet6, pmc) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pmc->ifindex != gsf->gf_interface)
continue;
if (ipv6_addr_equal(group, &pmc->addr))
break;
}
if (!pmc) /* must have a prior join */
- goto done;
+ return -EADDRNOTAVAIL;
+
gsf->gf_fmode = pmc->sfmode;
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
count = psl ? psl->sl_count : 0;
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
- if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
- return -EFAULT;
- }
- /* changes to psl require the socket lock, and a write lock
- * on pmc->sflock. We have the socket lock so reading here is safe.
- */
for (i = 0; i < copycount; i++) {
struct sockaddr_in6 *psin6;
struct sockaddr_storage ss;
@@ -607,14 +620,11 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
memset(&ss, 0, sizeof(ss));
psin6->sin6_family = AF_INET6;
psin6->sin6_addr = psl->sl_addr[i];
- if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
-done:
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
- return err;
}
bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
@@ -634,8 +644,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
rcu_read_unlock();
return np->mc_all;
}
- read_lock(&mc->sflock);
- psl = mc->sflist;
+ psl = rcu_dereference(mc->sflist);
if (!psl) {
rv = mc->sfmode == MCAST_EXCLUDE;
} else {
@@ -650,12 +659,12 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
rv = false;
}
- read_unlock(&mc->sflock);
rcu_read_unlock();
return rv;
}
+/* called with mc_lock */
static void igmp6_group_added(struct ifmcaddr6 *mc)
{
struct net_device *dev = mc->idev->dev;
@@ -665,13 +674,11 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- spin_lock_bh(&mc->mca_lock);
if (!(mc->mca_flags&MAF_LOADED)) {
mc->mca_flags |= MAF_LOADED;
if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
dev_mc_add(dev, buf);
}
- spin_unlock_bh(&mc->mca_lock);
if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
return;
@@ -692,6 +699,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
mld_ifc_event(mc->idev);
}
+/* called with mc_lock */
static void igmp6_group_dropped(struct ifmcaddr6 *mc)
{
struct net_device *dev = mc->idev->dev;
@@ -701,28 +709,25 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- spin_lock_bh(&mc->mca_lock);
if (mc->mca_flags&MAF_LOADED) {
mc->mca_flags &= ~MAF_LOADED;
if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
dev_mc_del(dev, buf);
}
- spin_unlock_bh(&mc->mca_lock);
if (mc->mca_flags & MAF_NOREPORT)
return;
if (!mc->idev->dead)
igmp6_leave_group(mc);
- spin_lock_bh(&mc->mca_lock);
- if (del_timer(&mc->mca_timer))
+ if (cancel_delayed_work(&mc->mca_work))
refcount_dec(&mc->mca_refcnt);
- spin_unlock_bh(&mc->mca_lock);
}
/*
* deleted ifmcaddr6 manipulation
+ * called with mc_lock
*/
static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
@@ -734,12 +739,10 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
- pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC);
+ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
if (!pmc)
return;
- spin_lock_bh(&im->mca_lock);
- spin_lock_init(&pmc->mca_lock);
pmc->idev = im->idev;
in6_dev_hold(idev);
pmc->mca_addr = im->mca_addr;
@@ -748,90 +751,110 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
if (pmc->mca_sfmode == MCAST_INCLUDE) {
struct ip6_sf_list *psf;
- pmc->mca_tomb = im->mca_tomb;
- pmc->mca_sources = im->mca_sources;
- im->mca_tomb = im->mca_sources = NULL;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ rcu_assign_pointer(pmc->mca_tomb,
+ mc_dereference(im->mca_tomb, idev));
+ rcu_assign_pointer(pmc->mca_sources,
+ mc_dereference(im->mca_sources, idev));
+ RCU_INIT_POINTER(im->mca_tomb, NULL);
+ RCU_INIT_POINTER(im->mca_sources, NULL);
+
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = pmc->mca_crcount;
}
- spin_unlock_bh(&im->mca_lock);
- spin_lock_bh(&idev->mc_lock);
- pmc->next = idev->mc_tomb;
- idev->mc_tomb = pmc;
- spin_unlock_bh(&idev->mc_lock);
+ rcu_assign_pointer(pmc->next, idev->mc_tomb);
+ rcu_assign_pointer(idev->mc_tomb, pmc);
}
+/* called with mc_lock */
static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
- struct ifmcaddr6 *pmc, *pmc_prev;
- struct ip6_sf_list *psf;
+ struct ip6_sf_list *psf, *sources, *tomb;
struct in6_addr *pmca = &im->mca_addr;
+ struct ifmcaddr6 *pmc, *pmc_prev;
- spin_lock_bh(&idev->mc_lock);
pmc_prev = NULL;
- for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) {
+ for_each_mc_tomb(idev, pmc) {
if (ipv6_addr_equal(&pmc->mca_addr, pmca))
break;
pmc_prev = pmc;
}
if (pmc) {
if (pmc_prev)
- pmc_prev->next = pmc->next;
+ rcu_assign_pointer(pmc_prev->next, pmc->next);
else
- idev->mc_tomb = pmc->next;
+ rcu_assign_pointer(idev->mc_tomb, pmc->next);
}
- spin_unlock_bh(&idev->mc_lock);
- spin_lock_bh(&im->mca_lock);
if (pmc) {
im->idev = pmc->idev;
if (im->mca_sfmode == MCAST_INCLUDE) {
- swap(im->mca_tomb, pmc->mca_tomb);
- swap(im->mca_sources, pmc->mca_sources);
- for (psf = im->mca_sources; psf; psf = psf->sf_next)
+ tomb = rcu_replace_pointer(im->mca_tomb,
+ mc_dereference(pmc->mca_tomb, pmc->idev),
+ lockdep_is_held(&im->idev->mc_lock));
+ rcu_assign_pointer(pmc->mca_tomb, tomb);
+
+ sources = rcu_replace_pointer(im->mca_sources,
+ mc_dereference(pmc->mca_sources, pmc->idev),
+ lockdep_is_held(&im->idev->mc_lock));
+ rcu_assign_pointer(pmc->mca_sources, sources);
+ for_each_psf_mclock(im, psf)
psf->sf_crcount = idev->mc_qrv;
} else {
im->mca_crcount = idev->mc_qrv;
}
in6_dev_put(pmc->idev);
ip6_mc_clear_src(pmc);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
}
- spin_unlock_bh(&im->mca_lock);
}
+/* called with mc_lock */
static void mld_clear_delrec(struct inet6_dev *idev)
{
struct ifmcaddr6 *pmc, *nextpmc;
- spin_lock_bh(&idev->mc_lock);
- pmc = idev->mc_tomb;
- idev->mc_tomb = NULL;
- spin_unlock_bh(&idev->mc_lock);
+ pmc = mc_dereference(idev->mc_tomb, idev);
+ RCU_INIT_POINTER(idev->mc_tomb, NULL);
for (; pmc; pmc = nextpmc) {
- nextpmc = pmc->next;
+ nextpmc = mc_dereference(pmc->next, idev);
ip6_mc_clear_src(pmc);
in6_dev_put(pmc->idev);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
}
/* clear dead sources, too */
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ for_each_mc_mclock(idev, pmc) {
struct ip6_sf_list *psf, *psf_next;
- spin_lock_bh(&pmc->mca_lock);
- psf = pmc->mca_tomb;
- pmc->mca_tomb = NULL;
- spin_unlock_bh(&pmc->mca_lock);
+ psf = mc_dereference(pmc->mca_tomb, idev);
+ RCU_INIT_POINTER(pmc->mca_tomb, NULL);
for (; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ psf_next = mc_dereference(psf->sf_next, idev);
+ kfree_rcu(psf, rcu);
}
}
- read_unlock_bh(&idev->lock);
+}
+
+static void mld_clear_query(struct inet6_dev *idev)
+{
+ struct sk_buff *skb;
+
+ spin_lock_bh(&idev->mc_query_lock);
+ while ((skb = __skb_dequeue(&idev->mc_query_queue)))
+ kfree_skb(skb);
+ spin_unlock_bh(&idev->mc_query_lock);
+}
+
+static void mld_clear_report(struct inet6_dev *idev)
+{
+ struct sk_buff *skb;
+
+ spin_lock_bh(&idev->mc_report_lock);
+ while ((skb = __skb_dequeue(&idev->mc_report_queue)))
+ kfree_skb(skb);
+ spin_unlock_bh(&idev->mc_report_lock);
}
static void mca_get(struct ifmcaddr6 *mc)
@@ -843,21 +866,22 @@ static void ma_put(struct ifmcaddr6 *mc)
{
if (refcount_dec_and_test(&mc->mca_refcnt)) {
in6_dev_put(mc->idev);
- kfree(mc);
+ kfree_rcu(mc, rcu);
}
}
+/* called with mc_lock */
static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
const struct in6_addr *addr,
unsigned int mode)
{
struct ifmcaddr6 *mc;
- mc = kzalloc(sizeof(*mc), GFP_ATOMIC);
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
if (!mc)
return NULL;
- timer_setup(&mc->mca_timer, igmp6_timer_handler, 0);
+ INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work);
mc->mca_addr = *addr;
mc->idev = idev; /* reference taken by caller */
@@ -865,7 +889,6 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
/* mca_stamp should be updated upon changes */
mc->mca_cstamp = mc->mca_tstamp = jiffies;
refcount_set(&mc->mca_refcnt, 1);
- spin_lock_init(&mc->mca_lock);
mc->mca_sfmode = mode;
mc->mca_sfcount[mode] = 1;
@@ -894,18 +917,17 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
if (!idev)
return -EINVAL;
- write_lock_bh(&idev->lock);
if (idev->dead) {
- write_unlock_bh(&idev->lock);
in6_dev_put(idev);
return -ENODEV;
}
- for (mc = idev->mc_list; mc; mc = mc->next) {
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, mc) {
if (ipv6_addr_equal(&mc->mca_addr, addr)) {
mc->mca_users++;
- write_unlock_bh(&idev->lock);
ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return 0;
}
@@ -913,22 +935,19 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
mc = mca_alloc(idev, addr, mode);
if (!mc) {
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return -ENOMEM;
}
- mc->next = idev->mc_list;
- idev->mc_list = mc;
+ rcu_assign_pointer(mc->next, idev->mc_list);
+ rcu_assign_pointer(idev->mc_list, mc);
- /* Hold this for the code below before we unlock,
- * it is already exposed via idev->mc_list.
- */
mca_get(mc);
- write_unlock_bh(&idev->lock);
mld_del_delrec(idev, mc);
igmp6_group_added(mc);
+ mutex_unlock(&idev->mc_lock);
ma_put(mc);
return 0;
}
@@ -940,33 +959,35 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
EXPORT_SYMBOL(ipv6_dev_mc_inc);
/*
- * device multicast group del
+ * device multicast group del
*/
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
- struct ifmcaddr6 *ma, **map;
+ struct ifmcaddr6 *ma, __rcu **map;
ASSERT_RTNL();
- write_lock_bh(&idev->lock);
- for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) {
+ mutex_lock(&idev->mc_lock);
+ for (map = &idev->mc_list;
+ (ma = mc_dereference(*map, idev));
+ map = &ma->next) {
if (ipv6_addr_equal(&ma->mca_addr, addr)) {
if (--ma->mca_users == 0) {
*map = ma->next;
- write_unlock_bh(&idev->lock);
igmp6_group_dropped(ma);
ip6_mc_clear_src(ma);
+ mutex_unlock(&idev->mc_lock);
ma_put(ma);
return 0;
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
return 0;
}
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
return -ENOENT;
}
@@ -1000,8 +1021,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
rcu_read_lock();
idev = __in6_dev_get(dev);
if (idev) {
- read_lock_bh(&idev->lock);
- for (mc = idev->mc_list; mc; mc = mc->next) {
+ for_each_mc_rcu(idev, mc) {
if (ipv6_addr_equal(&mc->mca_addr, group))
break;
}
@@ -1009,8 +1029,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
if (src_addr && !ipv6_addr_any(src_addr)) {
struct ip6_sf_list *psf;
- spin_lock_bh(&mc->mca_lock);
- for (psf = mc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_rcu(mc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, src_addr))
break;
}
@@ -1020,89 +1039,107 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
mc->mca_sfcount[MCAST_EXCLUDE];
else
rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0;
- spin_unlock_bh(&mc->mca_lock);
} else
rv = true; /* don't filter unspecified source */
}
- read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
return rv;
}
-static void mld_gq_start_timer(struct inet6_dev *idev)
+/* called with mc_lock */
+static void mld_gq_start_work(struct inet6_dev *idev)
{
- unsigned long tv = prandom_u32() % idev->mc_maxdelay;
+ unsigned long tv = prandom_u32_max(idev->mc_maxdelay);
idev->mc_gq_running = 1;
- if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2))
+ if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_gq_stop_timer(struct inet6_dev *idev)
+/* called with mc_lock */
+static void mld_gq_stop_work(struct inet6_dev *idev)
{
idev->mc_gq_running = 0;
- if (del_timer(&idev->mc_gq_timer))
+ if (cancel_delayed_work(&idev->mc_gq_work))
__in6_dev_put(idev);
}
-static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay)
+/* called with mc_lock */
+static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay)
{
- unsigned long tv = prandom_u32() % delay;
+ unsigned long tv = prandom_u32_max(delay);
- if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2))
+ if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_ifc_stop_timer(struct inet6_dev *idev)
+/* called with mc_lock */
+static void mld_ifc_stop_work(struct inet6_dev *idev)
{
idev->mc_ifc_count = 0;
- if (del_timer(&idev->mc_ifc_timer))
+ if (cancel_delayed_work(&idev->mc_ifc_work))
__in6_dev_put(idev);
}
-static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay)
+/* called with mc_lock */
+static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay)
{
- unsigned long tv = prandom_u32() % delay;
+ unsigned long tv = prandom_u32_max(delay);
- if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2))
+ if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_dad_stop_timer(struct inet6_dev *idev)
+static void mld_dad_stop_work(struct inet6_dev *idev)
+{
+ if (cancel_delayed_work(&idev->mc_dad_work))
+ __in6_dev_put(idev);
+}
+
+static void mld_query_stop_work(struct inet6_dev *idev)
{
- if (del_timer(&idev->mc_dad_timer))
+ spin_lock_bh(&idev->mc_query_lock);
+ if (cancel_delayed_work(&idev->mc_query_work))
+ __in6_dev_put(idev);
+ spin_unlock_bh(&idev->mc_query_lock);
+}
+
+static void mld_report_stop_work(struct inet6_dev *idev)
+{
+ if (cancel_delayed_work_sync(&idev->mc_report_work))
__in6_dev_put(idev);
}
/*
- * IGMP handling (alias multicast ICMPv6 messages)
+ * IGMP handling (alias multicast ICMPv6 messages)
+ * called with mc_lock
*/
-
static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
{
unsigned long delay = resptime;
- /* Do not start timer for these addresses */
+ /* Do not start work for these addresses */
if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) ||
IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- if (del_timer(&ma->mca_timer)) {
+ if (cancel_delayed_work(&ma->mca_work)) {
refcount_dec(&ma->mca_refcnt);
- delay = ma->mca_timer.expires - jiffies;
+ delay = ma->mca_work.timer.expires - jiffies;
}
if (delay >= resptime)
- delay = prandom_u32() % resptime;
+ delay = prandom_u32_max(resptime);
- ma->mca_timer.expires = jiffies + delay;
- if (!mod_timer(&ma->mca_timer, jiffies + delay))
+ if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
refcount_inc(&ma->mca_refcnt);
ma->mca_flags |= MAF_TIMER_RUNNING;
}
-/* mark EXCLUDE-mode sources */
+/* mark EXCLUDE-mode sources
+ * called with mc_lock
+ */
static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
const struct in6_addr *srcs)
{
@@ -1110,7 +1147,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
int i, scount;
scount = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (scount == nsrcs)
break;
for (i = 0; i < nsrcs; i++) {
@@ -1131,6 +1168,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
return true;
}
+/* called with mc_lock */
static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
const struct in6_addr *srcs)
{
@@ -1143,7 +1181,7 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (scount == nsrcs)
break;
for (i = 0; i < nsrcs; i++) {
@@ -1308,18 +1346,18 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
if (v1_query)
mld_set_v1_mode(idev);
- /* cancel MLDv2 report timer */
- mld_gq_stop_timer(idev);
- /* cancel the interface change timer */
- mld_ifc_stop_timer(idev);
+ /* cancel MLDv2 report work */
+ mld_gq_stop_work(idev);
+ /* cancel the interface change work */
+ mld_ifc_stop_work(idev);
/* clear deleted report items */
mld_clear_delrec(idev);
return 0;
}
-static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
- unsigned long *max_delay)
+static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
+ unsigned long *max_delay)
{
*max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL);
@@ -1329,24 +1367,43 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
idev->mc_maxdelay = *max_delay;
- return 0;
+ return;
}
/* called with rcu_read_lock() */
-int igmp6_event_query(struct sk_buff *skb)
+void igmp6_event_query(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev || idev->dead)
+ goto out;
+
+ spin_lock_bh(&idev->mc_query_lock);
+ if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) {
+ __skb_queue_tail(&idev->mc_query_queue, skb);
+ if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ in6_dev_hold(idev);
+ skb = NULL;
+ }
+ spin_unlock_bh(&idev->mc_query_lock);
+out:
+ kfree_skb(skb);
+}
+
+static void __mld_query_work(struct sk_buff *skb)
{
struct mld2_query *mlh2 = NULL;
- struct ifmcaddr6 *ma;
const struct in6_addr *group;
unsigned long max_delay;
struct inet6_dev *idev;
+ struct ifmcaddr6 *ma;
struct mld_msg *mld;
int group_type;
int mark = 0;
int len, err;
if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
- return -EINVAL;
+ goto kfree_skb;
/* compute payload length excluding extension headers */
len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
@@ -1363,11 +1420,11 @@ int igmp6_event_query(struct sk_buff *skb)
ipv6_hdr(skb)->hop_limit != 1 ||
!(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) ||
IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD))
- return -EINVAL;
+ goto kfree_skb;
- idev = __in6_dev_get(skb->dev);
+ idev = in6_dev_get(skb->dev);
if (!idev)
- return 0;
+ goto kfree_skb;
mld = (struct mld_msg *)icmp6_hdr(skb);
group = &mld->mld_mca;
@@ -1375,60 +1432,54 @@ int igmp6_event_query(struct sk_buff *skb)
if (group_type != IPV6_ADDR_ANY &&
!(group_type&IPV6_ADDR_MULTICAST))
- return -EINVAL;
+ goto out;
if (len < MLD_V1_QUERY_LEN) {
- return -EINVAL;
+ goto out;
} else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) {
err = mld_process_v1(idev, mld, &max_delay,
len == MLD_V1_QUERY_LEN);
if (err < 0)
- return err;
+ goto out;
} else if (len >= MLD_V2_QUERY_LEN_MIN) {
int srcs_offset = sizeof(struct mld2_query) -
sizeof(struct icmp6hdr);
if (!pskb_may_pull(skb, srcs_offset))
- return -EINVAL;
+ goto out;
mlh2 = (struct mld2_query *)skb_transport_header(skb);
- err = mld_process_v2(idev, mlh2, &max_delay);
- if (err < 0)
- return err;
+ mld_process_v2(idev, mlh2, &max_delay);
if (group_type == IPV6_ADDR_ANY) { /* general query */
if (mlh2->mld2q_nsrcs)
- return -EINVAL; /* no sources allowed */
+ goto out; /* no sources allowed */
- mld_gq_start_timer(idev);
- return 0;
+ mld_gq_start_work(idev);
+ goto out;
}
/* mark sources to include, if group & source-specific */
if (mlh2->mld2q_nsrcs != 0) {
if (!pskb_may_pull(skb, srcs_offset +
ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
- return -EINVAL;
+ goto out;
mlh2 = (struct mld2_query *)skb_transport_header(skb);
mark = 1;
}
} else {
- return -EINVAL;
+ goto out;
}
- read_lock_bh(&idev->lock);
if (group_type == IPV6_ADDR_ANY) {
- for (ma = idev->mc_list; ma; ma = ma->next) {
- spin_lock_bh(&ma->mca_lock);
+ for_each_mc_mclock(idev, ma) {
igmp6_group_queried(ma, max_delay);
- spin_unlock_bh(&ma->mca_lock);
}
} else {
- for (ma = idev->mc_list; ma; ma = ma->next) {
+ for_each_mc_mclock(idev, ma) {
if (!ipv6_addr_equal(group, &ma->mca_addr))
continue;
- spin_lock_bh(&ma->mca_lock);
if (ma->mca_flags & MAF_TIMER_RUNNING) {
/* gsquery <- gsquery && mark */
if (!mark)
@@ -1443,34 +1494,88 @@ int igmp6_event_query(struct sk_buff *skb)
if (!(ma->mca_flags & MAF_GSQUERY) ||
mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs))
igmp6_group_queried(ma, max_delay);
- spin_unlock_bh(&ma->mca_lock);
break;
}
}
- read_unlock_bh(&idev->lock);
- return 0;
+out:
+ in6_dev_put(idev);
+kfree_skb:
+ consume_skb(skb);
+}
+
+static void mld_query_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_query_work);
+ struct sk_buff_head q;
+ struct sk_buff *skb;
+ bool rework = false;
+ int cnt = 0;
+
+ skb_queue_head_init(&q);
+
+ spin_lock_bh(&idev->mc_query_lock);
+ while ((skb = __skb_dequeue(&idev->mc_query_queue))) {
+ __skb_queue_tail(&q, skb);
+
+ if (++cnt >= MLD_MAX_QUEUE) {
+ rework = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&idev->mc_query_lock);
+
+ mutex_lock(&idev->mc_lock);
+ while ((skb = __skb_dequeue(&q)))
+ __mld_query_work(skb);
+ mutex_unlock(&idev->mc_lock);
+
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
/* called with rcu_read_lock() */
-int igmp6_event_report(struct sk_buff *skb)
+void igmp6_event_report(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev || idev->dead)
+ goto out;
+
+ spin_lock_bh(&idev->mc_report_lock);
+ if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) {
+ __skb_queue_tail(&idev->mc_report_queue, skb);
+ if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ in6_dev_hold(idev);
+ skb = NULL;
+ }
+ spin_unlock_bh(&idev->mc_report_lock);
+out:
+ kfree_skb(skb);
+}
+
+static void __mld_report_work(struct sk_buff *skb)
{
- struct ifmcaddr6 *ma;
struct inet6_dev *idev;
+ struct ifmcaddr6 *ma;
struct mld_msg *mld;
int addr_type;
/* Our own report looped back. Ignore it. */
if (skb->pkt_type == PACKET_LOOPBACK)
- return 0;
+ goto kfree_skb;
/* send our report if the MC router may not have heard this report */
if (skb->pkt_type != PACKET_MULTICAST &&
skb->pkt_type != PACKET_BROADCAST)
- return 0;
+ goto kfree_skb;
if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr)))
- return -EINVAL;
+ goto kfree_skb;
mld = (struct mld_msg *)icmp6_hdr(skb);
@@ -1478,29 +1583,62 @@ int igmp6_event_report(struct sk_buff *skb)
addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
if (addr_type != IPV6_ADDR_ANY &&
!(addr_type&IPV6_ADDR_LINKLOCAL))
- return -EINVAL;
+ goto kfree_skb;
- idev = __in6_dev_get(skb->dev);
+ idev = in6_dev_get(skb->dev);
if (!idev)
- return -ENODEV;
+ goto kfree_skb;
/*
- * Cancel the timer for this group
+ * Cancel the work for this group
*/
- read_lock_bh(&idev->lock);
- for (ma = idev->mc_list; ma; ma = ma->next) {
+ for_each_mc_mclock(idev, ma) {
if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) {
- spin_lock(&ma->mca_lock);
- if (del_timer(&ma->mca_timer))
+ if (cancel_delayed_work(&ma->mca_work))
refcount_dec(&ma->mca_refcnt);
- ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING);
- spin_unlock(&ma->mca_lock);
+ ma->mca_flags &= ~(MAF_LAST_REPORTER |
+ MAF_TIMER_RUNNING);
break;
}
}
- read_unlock_bh(&idev->lock);
- return 0;
+
+ in6_dev_put(idev);
+kfree_skb:
+ consume_skb(skb);
+}
+
+static void mld_report_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_report_work);
+ struct sk_buff_head q;
+ struct sk_buff *skb;
+ bool rework = false;
+ int cnt = 0;
+
+ skb_queue_head_init(&q);
+ spin_lock_bh(&idev->mc_report_lock);
+ while ((skb = __skb_dequeue(&idev->mc_report_queue))) {
+ __skb_queue_tail(&q, skb);
+
+ if (++cnt >= MLD_MAX_QUEUE) {
+ rework = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&idev->mc_report_lock);
+
+ mutex_lock(&idev->mc_lock);
+ while ((skb = __skb_dequeue(&q)))
+ __mld_report_work(skb);
+ mutex_unlock(&idev->mc_lock);
+
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
@@ -1553,7 +1691,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
struct ip6_sf_list *psf;
int scount = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -1588,26 +1726,26 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
{
+ u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT,
+ 2, 0, 0, IPV6_TLV_PADN, 0 };
struct net_device *dev = idev->dev;
- struct net *net = dev_net(dev);
- struct sock *sk = net->ipv6.igmp_sk;
- struct sk_buff *skb;
- struct mld2_report *pmr;
- struct in6_addr addr_buf;
- const struct in6_addr *saddr;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- unsigned int size = mtu + hlen + tlen;
+ struct net *net = dev_net(dev);
+ const struct in6_addr *saddr;
+ struct in6_addr addr_buf;
+ struct mld2_report *pmr;
+ struct sk_buff *skb;
+ unsigned int size;
+ struct sock *sk;
int err;
- u8 ra[8] = { IPPROTO_ICMPV6, 0,
- IPV6_TLV_ROUTERALERT, 2, 0, 0,
- IPV6_TLV_PADN, 0 };
- /* we assume size > sizeof(ra) here */
- /* limit our allocations to order-0 page */
- size = min_t(int, size, SKB_MAX_ORDER(0, 0));
+ sk = net->ipv6.igmp_sk;
+ /* we assume size > sizeof(ra) here
+ * Also try to not allocate high-order pages for big MTU
+ */
+ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen;
skb = sock_alloc_send_skb(sk, size, 1, &err);
-
if (!skb)
return NULL;
@@ -1615,7 +1753,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
skb_reserve(skb, hlen);
skb_tailroom_reserve(skb, mtu, tlen);
- if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
+ if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
/* <draft-ietf-magma-mld-source-05.txt>:
* use unspecified address as the source address
* when a valid link-local address is not available.
@@ -1727,15 +1865,18 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
+/* called with mc_lock */
static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
- int type, int gdeleted, int sdeleted, int crsend)
+ int type, int gdeleted, int sdeleted,
+ int crsend)
{
+ struct ip6_sf_list *psf, *psf_prev, *psf_next;
+ int scount, stotal, first, isquery, truncate;
+ struct ip6_sf_list __rcu **psf_list;
struct inet6_dev *idev = pmc->idev;
struct net_device *dev = idev->dev;
- struct mld2_report *pmr;
struct mld2_grec *pgr = NULL;
- struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
- int scount, stotal, first, isquery, truncate;
+ struct mld2_report *pmr;
unsigned int mtu;
if (pmc->mca_flags & MAF_NOREPORT)
@@ -1754,7 +1895,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;
- if (!*psf_list)
+ if (!rcu_access_pointer(*psf_list))
goto empty_source;
pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL;
@@ -1770,10 +1911,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
}
first = 1;
psf_prev = NULL;
- for (psf = *psf_list; psf; psf = psf_next) {
+ for (psf = mc_dereference(*psf_list, idev);
+ psf;
+ psf = psf_next) {
struct in6_addr *psrc;
- psf_next = psf->sf_next;
+ psf_next = mc_dereference(psf->sf_next, idev);
if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
psf_prev = psf;
@@ -1820,10 +1963,12 @@ decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- *psf_list = psf->sf_next;
- kfree(psf);
+ rcu_assign_pointer(*psf_list,
+ mc_dereference(psf->sf_next, idev));
+ kfree_rcu(psf, rcu);
continue;
}
}
@@ -1852,72 +1997,73 @@ empty_source:
return skb;
}
+/* called with mc_lock */
static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc)
{
struct sk_buff *skb = NULL;
int type;
- read_lock_bh(&idev->lock);
if (!pmc) {
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_flags & MAF_NOREPORT)
continue;
- spin_lock_bh(&pmc->mca_lock);
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_MODE_IS_EXCLUDE;
else
type = MLD2_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0, 0);
- spin_unlock_bh(&pmc->mca_lock);
}
} else {
- spin_lock_bh(&pmc->mca_lock);
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_MODE_IS_EXCLUDE;
else
type = MLD2_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0, 0);
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (skb)
mld_sendpack(skb);
}
/*
* remove zero-count source records from a source filter list
+ * called with mc_lock
*/
-static void mld_clear_zeros(struct ip6_sf_list **ppsf)
+static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev)
{
struct ip6_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf = *ppsf; psf; psf = psf_next) {
- psf_next = psf->sf_next;
+ for (psf = mc_dereference(*ppsf, idev);
+ psf;
+ psf = psf_next) {
+ psf_next = mc_dereference(psf->sf_next, idev);
if (psf->sf_crcount == 0) {
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- *ppsf = psf->sf_next;
- kfree(psf);
- } else
+ rcu_assign_pointer(*ppsf,
+ mc_dereference(psf->sf_next, idev));
+ kfree_rcu(psf, rcu);
+ } else {
psf_prev = psf;
+ }
}
}
+/* called with mc_lock */
static void mld_send_cr(struct inet6_dev *idev)
{
struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next;
struct sk_buff *skb = NULL;
int type, dtype;
- read_lock_bh(&idev->lock);
- spin_lock(&idev->mc_lock);
-
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) {
- pmc_next = pmc->next;
+ for (pmc = mc_dereference(idev->mc_tomb, idev);
+ pmc;
+ pmc = pmc_next) {
+ pmc_next = mc_dereference(pmc->next, idev);
if (pmc->mca_sfmode == MCAST_INCLUDE) {
type = MLD2_BLOCK_OLD_SOURCES;
dtype = MLD2_BLOCK_OLD_SOURCES;
@@ -1931,26 +2077,25 @@ static void mld_send_cr(struct inet6_dev *idev)
}
pmc->mca_crcount--;
if (pmc->mca_crcount == 0) {
- mld_clear_zeros(&pmc->mca_tomb);
- mld_clear_zeros(&pmc->mca_sources);
+ mld_clear_zeros(&pmc->mca_tomb, idev);
+ mld_clear_zeros(&pmc->mca_sources, idev);
}
}
- if (pmc->mca_crcount == 0 && !pmc->mca_tomb &&
- !pmc->mca_sources) {
+ if (pmc->mca_crcount == 0 &&
+ !rcu_access_pointer(pmc->mca_tomb) &&
+ !rcu_access_pointer(pmc->mca_sources)) {
if (pmc_prev)
- pmc_prev->next = pmc_next;
+ rcu_assign_pointer(pmc_prev->next, pmc_next);
else
- idev->mc_tomb = pmc_next;
+ rcu_assign_pointer(idev->mc_tomb, pmc_next);
in6_dev_put(pmc->idev);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
} else
pmc_prev = pmc;
}
- spin_unlock(&idev->mc_lock);
/* change recs */
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
- spin_lock_bh(&pmc->mca_lock);
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
type = MLD2_BLOCK_OLD_SOURCES;
dtype = MLD2_ALLOW_NEW_SOURCES;
@@ -1970,9 +2115,7 @@ static void mld_send_cr(struct inet6_dev *idev)
skb = add_grec(skb, pmc, type, 0, 0, 0);
pmc->mca_crcount--;
}
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (!skb)
return;
(void) mld_sendpack(skb);
@@ -2074,6 +2217,7 @@ err_out:
goto out;
}
+/* called with mc_lock */
static void mld_send_initial_cr(struct inet6_dev *idev)
{
struct sk_buff *skb;
@@ -2084,47 +2228,49 @@ static void mld_send_initial_cr(struct inet6_dev *idev)
return;
skb = NULL;
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
- spin_lock_bh(&pmc->mca_lock);
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_CHANGE_TO_EXCLUDE;
else
type = MLD2_ALLOW_NEW_SOURCES;
skb = add_grec(skb, pmc, type, 0, 0, 1);
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (skb)
mld_sendpack(skb);
}
void ipv6_mc_dad_complete(struct inet6_dev *idev)
{
+ mutex_lock(&idev->mc_lock);
idev->mc_dad_count = idev->mc_qrv;
if (idev->mc_dad_count) {
mld_send_initial_cr(idev);
idev->mc_dad_count--;
if (idev->mc_dad_count)
- mld_dad_start_timer(idev,
- unsolicited_report_interval(idev));
+ mld_dad_start_work(idev,
+ unsolicited_report_interval(idev));
}
+ mutex_unlock(&idev->mc_lock);
}
-static void mld_dad_timer_expire(struct timer_list *t)
+static void mld_dad_work(struct work_struct *work)
{
- struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer);
-
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_dad_work);
+ mutex_lock(&idev->mc_lock);
mld_send_initial_cr(idev);
if (idev->mc_dad_count) {
idev->mc_dad_count--;
if (idev->mc_dad_count)
- mld_dad_start_timer(idev,
- unsolicited_report_interval(idev));
+ mld_dad_start_work(idev,
+ unsolicited_report_interval(idev));
}
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
}
+/* called with mc_lock */
static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
const struct in6_addr *psfsrc)
{
@@ -2132,7 +2278,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
int rv = 0;
psf_prev = NULL;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
break;
psf_prev = psf;
@@ -2147,21 +2293,27 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
/* no more filters for this source */
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- pmc->mca_sources = psf->sf_next;
+ rcu_assign_pointer(pmc->mca_sources,
+ mc_dereference(psf->sf_next, idev));
+
if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) &&
!mld_in_v1_mode(idev)) {
psf->sf_crcount = idev->mc_qrv;
- psf->sf_next = pmc->mca_tomb;
- pmc->mca_tomb = psf;
+ rcu_assign_pointer(psf->sf_next,
+ mc_dereference(pmc->mca_tomb, idev));
+ rcu_assign_pointer(pmc->mca_tomb, psf);
rv = 1;
- } else
- kfree(psf);
+ } else {
+ kfree_rcu(psf, rcu);
+ }
}
return rv;
}
+/* called with mc_lock */
static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
int sfmode, int sfcount, const struct in6_addr *psfsrc,
int delta)
@@ -2172,24 +2324,19 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
if (!idev)
return -ENODEV;
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+
+ for_each_mc_mclock(idev, pmc) {
if (ipv6_addr_equal(pmca, &pmc->mca_addr))
break;
}
- if (!pmc) {
- /* MCA not found?? bug */
- read_unlock_bh(&idev->lock);
+ if (!pmc)
return -ESRCH;
- }
- spin_lock_bh(&pmc->mca_lock);
+
sf_markstate(pmc);
if (!delta) {
- if (!pmc->mca_sfcount[sfmode]) {
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ if (!pmc->mca_sfcount[sfmode])
return -EINVAL;
- }
+
pmc->mca_sfcount[sfmode]--;
}
err = 0;
@@ -2209,18 +2356,19 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
pmc->mca_sfmode = MCAST_INCLUDE;
pmc->mca_crcount = idev->mc_qrv;
idev->mc_ifc_count = pmc->mca_crcount;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = 0;
mld_ifc_event(pmc->idev);
- } else if (sf_setstate(pmc) || changerec)
+ } else if (sf_setstate(pmc) || changerec) {
mld_ifc_event(pmc->idev);
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ }
+
return err;
}
/*
* Add multicast single-source filter to the interface list
+ * called with mc_lock
*/
static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
const struct in6_addr *psfsrc)
@@ -2228,40 +2376,45 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
struct ip6_sf_list *psf, *psf_prev;
psf_prev = NULL;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
break;
psf_prev = psf;
}
if (!psf) {
- psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+ psf = kzalloc(sizeof(*psf), GFP_KERNEL);
if (!psf)
return -ENOBUFS;
psf->sf_addr = *psfsrc;
if (psf_prev) {
- psf_prev->sf_next = psf;
- } else
- pmc->mca_sources = psf;
+ rcu_assign_pointer(psf_prev->sf_next, psf);
+ } else {
+ rcu_assign_pointer(pmc->mca_sources, psf);
+ }
}
psf->sf_count[sfmode]++;
return 0;
}
+/* called with mc_lock */
static void sf_markstate(struct ifmcaddr6 *pmc)
{
struct ip6_sf_list *psf;
int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
- } else
+ } else {
psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+ }
+ }
}
+/* called with mc_lock */
static int sf_setstate(struct ifmcaddr6 *pmc)
{
struct ip6_sf_list *psf, *dpsf;
@@ -2270,7 +2423,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
int new_in, rv;
rv = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -2280,8 +2433,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
if (!psf->sf_oldin) {
struct ip6_sf_list *prev = NULL;
- for (dpsf = pmc->mca_tomb; dpsf;
- dpsf = dpsf->sf_next) {
+ for_each_psf_tomb(pmc, dpsf) {
if (ipv6_addr_equal(&dpsf->sf_addr,
&psf->sf_addr))
break;
@@ -2289,10 +2441,14 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
}
if (dpsf) {
if (prev)
- prev->sf_next = dpsf->sf_next;
+ rcu_assign_pointer(prev->sf_next,
+ mc_dereference(dpsf->sf_next,
+ pmc->idev));
else
- pmc->mca_tomb = dpsf->sf_next;
- kfree(dpsf);
+ rcu_assign_pointer(pmc->mca_tomb,
+ mc_dereference(dpsf->sf_next,
+ pmc->idev));
+ kfree_rcu(dpsf, rcu);
}
psf->sf_crcount = qrv;
rv++;
@@ -2303,18 +2459,19 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next)
+
+ for_each_psf_tomb(pmc, dpsf)
if (ipv6_addr_equal(&dpsf->sf_addr,
&psf->sf_addr))
break;
if (!dpsf) {
- dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL);
if (!dpsf)
continue;
*dpsf = *psf;
- /* pmc->mca_lock held by callers */
- dpsf->sf_next = pmc->mca_tomb;
- pmc->mca_tomb = dpsf;
+ rcu_assign_pointer(dpsf->sf_next,
+ mc_dereference(pmc->mca_tomb, pmc->idev));
+ rcu_assign_pointer(pmc->mca_tomb, dpsf);
}
dpsf->sf_crcount = qrv;
rv++;
@@ -2325,6 +2482,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
/*
* Add multicast source filter list to the interface list
+ * called with mc_lock
*/
static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
int sfmode, int sfcount, const struct in6_addr *psfsrc,
@@ -2336,17 +2494,13 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
if (!idev)
return -ENODEV;
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+
+ for_each_mc_mclock(idev, pmc) {
if (ipv6_addr_equal(pmca, &pmc->mca_addr))
break;
}
- if (!pmc) {
- /* MCA not found?? bug */
- read_unlock_bh(&idev->lock);
+ if (!pmc)
return -ESRCH;
- }
- spin_lock_bh(&pmc->mca_lock);
sf_markstate(pmc);
isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
@@ -2377,36 +2531,40 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
pmc->mca_crcount = idev->mc_qrv;
idev->mc_ifc_count = pmc->mca_crcount;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = 0;
mld_ifc_event(idev);
- } else if (sf_setstate(pmc))
+ } else if (sf_setstate(pmc)) {
mld_ifc_event(idev);
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ }
return err;
}
+/* called with mc_lock */
static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
{
struct ip6_sf_list *psf, *nextpsf;
- for (psf = pmc->mca_tomb; psf; psf = nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
+ for (psf = mc_dereference(pmc->mca_tomb, pmc->idev);
+ psf;
+ psf = nextpsf) {
+ nextpsf = mc_dereference(psf->sf_next, pmc->idev);
+ kfree_rcu(psf, rcu);
}
- pmc->mca_tomb = NULL;
- for (psf = pmc->mca_sources; psf; psf = nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
+ RCU_INIT_POINTER(pmc->mca_tomb, NULL);
+ for (psf = mc_dereference(pmc->mca_sources, pmc->idev);
+ psf;
+ psf = nextpsf) {
+ nextpsf = mc_dereference(psf->sf_next, pmc->idev);
+ kfree_rcu(psf, rcu);
}
- pmc->mca_sources = NULL;
+ RCU_INIT_POINTER(pmc->mca_sources, NULL);
pmc->mca_sfmode = MCAST_EXCLUDE;
pmc->mca_sfcount[MCAST_INCLUDE] = 0;
pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
}
-
+/* called with mc_lock */
static void igmp6_join_group(struct ifmcaddr6 *ma)
{
unsigned long delay;
@@ -2416,95 +2574,118 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
- delay = prandom_u32() % unsolicited_report_interval(ma->idev);
+ delay = prandom_u32_max(unsolicited_report_interval(ma->idev));
- spin_lock_bh(&ma->mca_lock);
- if (del_timer(&ma->mca_timer)) {
+ if (cancel_delayed_work(&ma->mca_work)) {
refcount_dec(&ma->mca_refcnt);
- delay = ma->mca_timer.expires - jiffies;
+ delay = ma->mca_work.timer.expires - jiffies;
}
- if (!mod_timer(&ma->mca_timer, jiffies + delay))
+ if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
refcount_inc(&ma->mca_refcnt);
ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
- spin_unlock_bh(&ma->mca_lock);
}
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
struct inet6_dev *idev)
{
+ struct ip6_sf_socklist *psl;
int err;
- write_lock_bh(&iml->sflock);
- if (!iml->sflist) {
+ psl = sock_dereference(iml->sflist, sk);
+
+ if (idev)
+ mutex_lock(&idev->mc_lock);
+
+ if (!psl) {
/* any-source empty exclude case */
err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
} else {
err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
- iml->sflist->sl_count, iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+ psl->sl_count, psl->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
}
- write_unlock_bh(&iml->sflock);
+
+ if (idev)
+ mutex_unlock(&idev->mc_lock);
+
return err;
}
+/* called with mc_lock */
static void igmp6_leave_group(struct ifmcaddr6 *ma)
{
if (mld_in_v1_mode(ma->idev)) {
- if (ma->mca_flags & MAF_LAST_REPORTER)
+ if (ma->mca_flags & MAF_LAST_REPORTER) {
igmp6_send(&ma->mca_addr, ma->idev->dev,
ICMPV6_MGM_REDUCTION);
+ }
} else {
mld_add_delrec(ma->idev, ma);
mld_ifc_event(ma->idev);
}
}
-static void mld_gq_timer_expire(struct timer_list *t)
+static void mld_gq_work(struct work_struct *work)
{
- struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer);
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_gq_work);
- idev->mc_gq_running = 0;
+ mutex_lock(&idev->mc_lock);
mld_send_report(idev, NULL);
+ idev->mc_gq_running = 0;
+ mutex_unlock(&idev->mc_lock);
+
in6_dev_put(idev);
}
-static void mld_ifc_timer_expire(struct timer_list *t)
+static void mld_ifc_work(struct work_struct *work)
{
- struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer);
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_ifc_work);
+ mutex_lock(&idev->mc_lock);
mld_send_cr(idev);
+
if (idev->mc_ifc_count) {
idev->mc_ifc_count--;
if (idev->mc_ifc_count)
- mld_ifc_start_timer(idev,
- unsolicited_report_interval(idev));
+ mld_ifc_start_work(idev,
+ unsolicited_report_interval(idev));
}
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
}
+/* called with mc_lock */
static void mld_ifc_event(struct inet6_dev *idev)
{
if (mld_in_v1_mode(idev))
return;
+
idev->mc_ifc_count = idev->mc_qrv;
- mld_ifc_start_timer(idev, 1);
+ mld_ifc_start_work(idev, 1);
}
-static void igmp6_timer_handler(struct timer_list *t)
+static void mld_mca_work(struct work_struct *work)
{
- struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer);
+ struct ifmcaddr6 *ma = container_of(to_delayed_work(work),
+ struct ifmcaddr6, mca_work);
+ mutex_lock(&ma->idev->mc_lock);
if (mld_in_v1_mode(ma->idev))
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
else
mld_send_report(ma->idev, ma);
-
- spin_lock(&ma->mca_lock);
ma->mca_flags |= MAF_LAST_REPORTER;
ma->mca_flags &= ~MAF_TIMER_RUNNING;
- spin_unlock(&ma->mca_lock);
+ mutex_unlock(&ma->idev->mc_lock);
+
ma_put(ma);
}
@@ -2516,10 +2697,10 @@ void ipv6_mc_unmap(struct inet6_dev *idev)
/* Install multicast list, except for all-nodes (already installed) */
- read_lock_bh(&idev->lock);
- for (i = idev->mc_list; i; i = i->next)
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, i)
igmp6_group_dropped(i);
- read_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
void ipv6_mc_remap(struct inet6_dev *idev)
@@ -2528,25 +2709,25 @@ void ipv6_mc_remap(struct inet6_dev *idev)
}
/* Device going down */
-
void ipv6_mc_down(struct inet6_dev *idev)
{
struct ifmcaddr6 *i;
+ mutex_lock(&idev->mc_lock);
/* Withdraw multicast list */
-
- read_lock_bh(&idev->lock);
-
- for (i = idev->mc_list; i; i = i->next)
+ for_each_mc_mclock(idev, i)
igmp6_group_dropped(i);
+ mutex_unlock(&idev->mc_lock);
- /* Should stop timer after group drop. or we will
- * start timer again in mld_ifc_event()
+ /* Should stop work after group drop. or we will
+ * start work again in mld_ifc_event()
*/
- mld_ifc_stop_timer(idev);
- mld_gq_stop_timer(idev);
- mld_dad_stop_timer(idev);
- read_unlock_bh(&idev->lock);
+ synchronize_net();
+ mld_query_stop_work(idev);
+ mld_report_stop_work(idev);
+ mld_ifc_stop_work(idev);
+ mld_gq_stop_work(idev);
+ mld_dad_stop_work(idev);
}
static void ipv6_mc_reset(struct inet6_dev *idev)
@@ -2566,29 +2747,33 @@ void ipv6_mc_up(struct inet6_dev *idev)
/* Install multicast list, except for all-nodes (already installed) */
- read_lock_bh(&idev->lock);
ipv6_mc_reset(idev);
- for (i = idev->mc_list; i; i = i->next) {
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, i) {
mld_del_delrec(idev, i);
igmp6_group_added(i);
}
- read_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
/* IPv6 device initialization. */
void ipv6_mc_init_dev(struct inet6_dev *idev)
{
- write_lock_bh(&idev->lock);
- spin_lock_init(&idev->mc_lock);
idev->mc_gq_running = 0;
- timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0);
- idev->mc_tomb = NULL;
+ INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work);
+ RCU_INIT_POINTER(idev->mc_tomb, NULL);
idev->mc_ifc_count = 0;
- timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0);
- timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0);
+ INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work);
+ INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work);
+ INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work);
+ INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work);
+ skb_queue_head_init(&idev->mc_query_queue);
+ skb_queue_head_init(&idev->mc_report_queue);
+ spin_lock_init(&idev->mc_query_lock);
+ spin_lock_init(&idev->mc_report_lock);
+ mutex_init(&idev->mc_lock);
ipv6_mc_reset(idev);
- write_unlock_bh(&idev->lock);
}
/*
@@ -2599,9 +2784,13 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
{
struct ifmcaddr6 *i;
- /* Deactivate timers */
+ /* Deactivate works */
ipv6_mc_down(idev);
+ mutex_lock(&idev->mc_lock);
mld_clear_delrec(idev);
+ mutex_unlock(&idev->mc_lock);
+ mld_clear_query(idev);
+ mld_clear_report(idev);
/* Delete all-nodes address. */
/* We cannot call ipv6_dev_mc_dec() directly, our caller in
@@ -2613,15 +2802,14 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
if (idev->cnf.forwarding)
__ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);
- write_lock_bh(&idev->lock);
- while ((i = idev->mc_list) != NULL) {
- idev->mc_list = i->next;
+ mutex_lock(&idev->mc_lock);
+ while ((i = mc_dereference(idev->mc_list, idev))) {
+ rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev));
- write_unlock_bh(&idev->lock);
+ ip6_mc_clear_src(i);
ma_put(i);
- write_lock_bh(&idev->lock);
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
static void ipv6_mc_rejoin_groups(struct inet6_dev *idev)
@@ -2630,13 +2818,14 @@ static void ipv6_mc_rejoin_groups(struct inet6_dev *idev)
ASSERT_RTNL();
+ mutex_lock(&idev->mc_lock);
if (mld_in_v1_mode(idev)) {
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next)
+ for_each_mc_mclock(idev, pmc)
igmp6_join_group(pmc);
- read_unlock_bh(&idev->lock);
- } else
+ } else {
mld_send_report(idev, NULL);
+ }
+ mutex_unlock(&idev->mc_lock);
}
static int ipv6_mc_netdev_event(struct notifier_block *this,
@@ -2683,13 +2872,12 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
idev = __in6_dev_get(state->dev);
if (!idev)
continue;
- read_lock_bh(&idev->lock);
- im = idev->mc_list;
+
+ im = rcu_dereference(idev->mc_list);
if (im) {
state->idev = idev;
break;
}
- read_unlock_bh(&idev->lock);
}
return im;
}
@@ -2698,11 +2886,8 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr
{
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
- im = im->next;
+ im = rcu_dereference(im->next);
while (!im) {
- if (likely(state->idev))
- read_unlock_bh(&state->idev->lock);
-
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
@@ -2711,8 +2896,7 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr
state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
- read_lock_bh(&state->idev->lock);
- im = state->idev->mc_list;
+ im = rcu_dereference(state->idev->mc_list);
}
return im;
}
@@ -2746,10 +2930,8 @@ static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
{
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
- if (likely(state->idev)) {
- read_unlock_bh(&state->idev->lock);
+ if (likely(state->idev))
state->idev = NULL;
- }
state->dev = NULL;
rcu_read_unlock();
}
@@ -2764,8 +2946,8 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
state->dev->ifindex, state->dev->name,
&im->mca_addr,
im->mca_users, im->mca_flags,
- (im->mca_flags&MAF_TIMER_RUNNING) ?
- jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0);
+ (im->mca_flags & MAF_TIMER_RUNNING) ?
+ jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0);
return 0;
}
@@ -2799,19 +2981,16 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
idev = __in6_dev_get(state->dev);
if (unlikely(idev == NULL))
continue;
- read_lock_bh(&idev->lock);
- im = idev->mc_list;
+
+ im = rcu_dereference(idev->mc_list);
if (likely(im)) {
- spin_lock_bh(&im->mca_lock);
- psf = im->mca_sources;
+ psf = rcu_dereference(im->mca_sources);
if (likely(psf)) {
state->im = im;
state->idev = idev;
break;
}
- spin_unlock_bh(&im->mca_lock);
}
- read_unlock_bh(&idev->lock);
}
return psf;
}
@@ -2820,14 +2999,10 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
{
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
- psf = psf->sf_next;
+ psf = rcu_dereference(psf->sf_next);
while (!psf) {
- spin_unlock_bh(&state->im->mca_lock);
- state->im = state->im->next;
+ state->im = rcu_dereference(state->im->next);
while (!state->im) {
- if (likely(state->idev))
- read_unlock_bh(&state->idev->lock);
-
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
@@ -2836,13 +3011,11 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
- read_lock_bh(&state->idev->lock);
- state->im = state->idev->mc_list;
+ state->im = rcu_dereference(state->idev->mc_list);
}
if (!state->im)
break;
- spin_lock_bh(&state->im->mca_lock);
- psf = state->im->mca_sources;
+ psf = rcu_dereference(state->im->mca_sources);
}
out:
return psf;
@@ -2879,14 +3052,12 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
- if (likely(state->im)) {
- spin_unlock_bh(&state->im->mca_lock);
+
+ if (likely(state->im))
state->im = NULL;
- }
- if (likely(state->idev)) {
- read_unlock_bh(&state->idev->lock);
+ if (likely(state->idev))
state->idev = NULL;
- }
+
state->dev = NULL;
rcu_read_unlock();
}
@@ -2967,6 +3138,7 @@ static int __net_init igmp6_net_init(struct net *net)
}
inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
+ net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL;
err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6,
SOCK_RAW, IPPROTO_ICMPV6, net);
@@ -3004,7 +3176,19 @@ static struct pernet_operations igmp6_net_ops = {
int __init igmp6_init(void)
{
- return register_pernet_subsys(&igmp6_net_ops);
+ int err;
+
+ err = register_pernet_subsys(&igmp6_net_ops);
+ if (err)
+ return err;
+
+ mld_wq = create_workqueue("mld");
+ if (!mld_wq) {
+ unregister_pernet_subsys(&igmp6_net_ops);
+ return -ENOMEM;
+ }
+
+ return err;
}
int __init igmp6_late_init(void)
@@ -3015,6 +3199,7 @@ int __init igmp6_late_init(void)
void igmp6_cleanup(void)
{
unregister_pernet_subsys(&igmp6_net_ops);
+ destroy_workqueue(mld_wq);
}
void igmp6_late_cleanup(void)
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
index d3d6b6a66e5f..04d5fcdfa6e0 100644
--- a/net/ipv6/mcast_snoop.c
+++ b/net/ipv6/mcast_snoop.c
@@ -109,7 +109,7 @@ static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
struct mld_msg *mld;
if (!ipv6_mc_may_pull(skb, len))
- return -EINVAL;
+ return -ENODATA;
mld = (struct mld_msg *)skb_transport_header(skb);
@@ -122,7 +122,7 @@ static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
case ICMPV6_MGM_QUERY:
return ipv6_mc_check_mld_query(skb);
default:
- return -ENOMSG;
+ return -ENODATA;
}
}
@@ -131,7 +131,7 @@ static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
}
-int ipv6_mc_check_icmpv6(struct sk_buff *skb)
+static int ipv6_mc_check_icmpv6(struct sk_buff *skb)
{
unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr);
unsigned int transport_len = ipv6_transport_len(skb);
@@ -150,7 +150,6 @@ int ipv6_mc_check_icmpv6(struct sk_buff *skb)
return 0;
}
-EXPORT_SYMBOL(ipv6_mc_check_icmpv6);
/**
* ipv6_mc_check_mld - checks whether this is a sane MLD packet
@@ -161,7 +160,10 @@ EXPORT_SYMBOL(ipv6_mc_check_icmpv6);
*
* -EINVAL: A broken packet was detected, i.e. it violates some internet
* standard
- * -ENOMSG: IP header validation succeeded but it is not an MLD packet.
+ * -ENOMSG: IP header validation succeeded but it is not an ICMPv6 packet
+ * with a hop-by-hop option.
+ * -ENODATA: IP+ICMPv6 header with hop-by-hop option validation succeeded
+ * but it is not an MLD packet.
* -ENOMEM: A memory allocation failure happened.
*
* Caller needs to set the skb network header and free any returned skb if it
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 878fcec14949..83d2a8be263f 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -247,63 +247,14 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
return err;
}
-static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
- u8 **nexthdr)
-{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr =
- (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
- const unsigned char *nh = skb_network_header(skb);
- unsigned int packet_len = skb_tail_pointer(skb) -
- skb_network_header(skb);
- int found_rhdr = 0;
-
- *nexthdr = &ipv6_hdr(skb)->nexthdr;
-
- while (offset + 1 <= packet_len) {
-
- switch (**nexthdr) {
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
- /*
- * HAO MUST NOT appear more than once.
- * XXX: It is better to try to find by the end of
- * XXX: packet if HAO exists.
- */
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
- net_dbg_ratelimited("mip6: hao exists already, override\n");
- return offset;
- }
-
- if (found_rhdr)
- return offset;
-
- break;
- default:
- return offset;
- }
-
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr *)(nh + offset);
- }
-
- return offset;
-}
-
-static int mip6_destopt_init_state(struct xfrm_state *x)
+static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
if (x->id.spi) {
- pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+ NL_SET_ERR_MSG(extack, "SPI must be 0");
return -EINVAL;
}
if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
- pr_info("%s: state's mode is not %u: %u\n",
- __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
return -EINVAL;
}
@@ -324,7 +275,6 @@ static void mip6_destopt_destroy(struct xfrm_state *x)
}
static const struct xfrm_type mip6_destopt_type = {
- .description = "MIP6DESTOPT",
.owner = THIS_MODULE,
.proto = IPPROTO_DSTOPTS,
.flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
@@ -333,7 +283,6 @@ static const struct xfrm_type mip6_destopt_type = {
.input = mip6_destopt_input,
.output = mip6_destopt_output,
.reject = mip6_destopt_reject,
- .hdr_offset = mip6_destopt_offset,
};
static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
@@ -383,62 +332,14 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
return 0;
}
-static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
- u8 **nexthdr)
-{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr =
- (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
- const unsigned char *nh = skb_network_header(skb);
- unsigned int packet_len = skb_tail_pointer(skb) -
- skb_network_header(skb);
- int found_rhdr = 0;
-
- *nexthdr = &ipv6_hdr(skb)->nexthdr;
-
- while (offset + 1 <= packet_len) {
-
- switch (**nexthdr) {
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- if (offset + 3 <= packet_len) {
- struct ipv6_rt_hdr *rt;
- rt = (struct ipv6_rt_hdr *)(nh + offset);
- if (rt->type != 0)
- return offset;
- }
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
- return offset;
-
- if (found_rhdr)
- return offset;
-
- break;
- default:
- return offset;
- }
-
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr *)(nh + offset);
- }
-
- return offset;
-}
-
-static int mip6_rthdr_init_state(struct xfrm_state *x)
+static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
if (x->id.spi) {
- pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+ NL_SET_ERR_MSG(extack, "SPI must be 0");
return -EINVAL;
}
if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
- pr_info("%s: state's mode is not %u: %u\n",
- __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
return -EINVAL;
}
@@ -456,7 +357,6 @@ static void mip6_rthdr_destroy(struct xfrm_state *x)
}
static const struct xfrm_type mip6_rthdr_type = {
- .description = "MIP6RT",
.owner = THIS_MODULE,
.proto = IPPROTO_ROUTING,
.flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
@@ -464,7 +364,6 @@ static const struct xfrm_type mip6_rthdr_type = {
.destructor = mip6_rthdr_destroy,
.input = mip6_rthdr_input,
.output = mip6_rthdr_output,
- .hdr_offset = mip6_rthdr_offset,
};
static int __init mip6_init(void)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 53caf59c591e..3a553494ff16 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -81,6 +81,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
static int pndisc_constructor(struct pneigh_entry *n);
static void pndisc_destructor(struct pneigh_entry *n);
static void pndisc_redo(struct sk_buff *skb);
+static int ndisc_is_multicast(const void *pkey);
static const struct neigh_ops ndisc_generic_ops = {
.family = AF_INET6,
@@ -115,6 +116,7 @@ struct neigh_table nd_tbl = {
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
.proxy_redo = pndisc_redo,
+ .is_multicast = ndisc_is_multicast,
.allow_add = ndisc_allow_add,
.id = "ndisc_cache",
.parms = {
@@ -126,6 +128,7 @@ struct neigh_table nd_tbl = {
[NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
[NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
@@ -140,7 +143,7 @@ struct neigh_table nd_tbl = {
};
EXPORT_SYMBOL_GPL(nd_tbl);
-void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data,
int data_len, int pad)
{
int space = __ndisc_opt_addr_space(data_len, pad);
@@ -163,7 +166,7 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
- void *data, u8 icmp6_type)
+ const void *data, u8 icmp6_type)
{
__ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
ndisc_addr_option_pad(skb->dev->type));
@@ -197,6 +200,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev,
return opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
+ opt->nd_opt_type == ND_OPT_PREF64 ||
ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
@@ -463,9 +467,8 @@ static void ip6_nd_hdr(struct sk_buff *skb,
hdr->daddr = *daddr;
}
-static void ndisc_send_skb(struct sk_buff *skb,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct dst_entry *dst = skb_dst(skb);
struct net *net = dev_net(skb->dev);
@@ -512,6 +515,7 @@ static void ndisc_send_skb(struct sk_buff *skb,
rcu_read_unlock();
}
+EXPORT_SYMBOL(ndisc_send_skb);
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
const struct in6_addr *solicited_addr,
@@ -595,22 +599,16 @@ static void ndisc_send_unsol_na(struct net_device *dev)
in6_dev_put(idev);
}
-void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
- const struct in6_addr *daddr, const struct in6_addr *saddr,
- u64 nonce)
+struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
+ const struct in6_addr *saddr, u64 nonce)
{
- struct sk_buff *skb;
- struct in6_addr addr_buf;
int inc_opt = dev->addr_len;
- int optlen = 0;
+ struct sk_buff *skb;
struct nd_msg *msg;
+ int optlen = 0;
- if (!saddr) {
- if (ipv6_get_lladdr(dev, &addr_buf,
- (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
- return;
- saddr = &addr_buf;
- }
+ if (!saddr)
+ return NULL;
if (ipv6_addr_any(saddr))
inc_opt = false;
@@ -622,7 +620,7 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
- return;
+ return NULL;
msg = skb_put(skb, sizeof(*msg));
*msg = (struct nd_msg) {
@@ -644,7 +642,28 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
memcpy(opt + 2, &nonce, 6);
}
- ndisc_send_skb(skb, daddr, saddr);
+ return skb;
+}
+EXPORT_SYMBOL(ndisc_ns_create);
+
+void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
+ const struct in6_addr *daddr, const struct in6_addr *saddr,
+ u64 nonce)
+{
+ struct in6_addr addr_buf;
+ struct sk_buff *skb;
+
+ if (!saddr) {
+ if (ipv6_get_lladdr(dev, &addr_buf,
+ (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
+ return;
+ saddr = &addr_buf;
+ }
+
+ skb = ndisc_ns_create(dev, solicit, saddr, nonce);
+
+ if (skb)
+ ndisc_send_skb(skb, daddr, saddr);
}
void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
@@ -948,6 +967,25 @@ out:
in6_dev_put(idev);
}
+static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ switch (idev->cnf.accept_untracked_na) {
+ case 0: /* Don't accept untracked na (absent in neighbor cache) */
+ return 0;
+ case 1: /* Create new entries from na if currently untracked */
+ return 1;
+ case 2: /* Create new entries from untracked na only if saddr is in the
+ * same subnet as an address configured on the interface that
+ * received the na
+ */
+ return !!ipv6_chk_prefix(saddr, dev);
+ default:
+ return 0;
+ }
+}
+
static void ndisc_recv_na(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
@@ -961,6 +999,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
struct inet6_dev *idev = __in6_dev_get(dev);
struct inet6_ifaddr *ifp;
struct neighbour *neigh;
+ u8 new_state;
if (skb->len < sizeof(struct nd_msg)) {
ND_PRINTK(2, warn, "NA: packet too short\n");
@@ -981,6 +1020,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
/* For some 802.11 wireless deployments (and possibly other networks),
* there will be a NA proxy and unsolicitd packets are attacks
* and thus should not be accepted.
+ * drop_unsolicited_na takes precedence over accept_untracked_na
*/
if (!msg->icmph.icmp6_solicited && idev &&
idev->cnf.drop_unsolicited_na)
@@ -1021,9 +1061,33 @@ static void ndisc_recv_na(struct sk_buff *skb)
in6_ifa_put(ifp);
return;
}
+
neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
- if (neigh) {
+ /* RFC 9131 updates original Neighbour Discovery RFC 4861.
+ * NAs with Target LL Address option without a corresponding
+ * entry in the neighbour cache can now create a STALE neighbour
+ * cache entry on routers.
+ *
+ * entry accept fwding solicited behaviour
+ * ------- ------ ------ --------- ----------------------
+ * present X X 0 Set state to STALE
+ * present X X 1 Set state to REACHABLE
+ * absent 0 X X Do nothing
+ * absent 1 0 X Do nothing
+ * absent 1 1 X Add a new STALE entry
+ *
+ * Note that we don't do a (daddr == all-routers-mcast) check.
+ */
+ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
+ if (!neigh && lladdr && idev && idev->cnf.forwarding) {
+ if (accept_untracked_na(dev, saddr)) {
+ neigh = neigh_create(&nd_tbl, &msg->target, dev);
+ new_state = NUD_STALE;
+ }
+ }
+
+ if (neigh && !IS_ERR(neigh)) {
u8 old_flags = neigh->flags;
struct net *net = dev_net(dev);
@@ -1043,7 +1107,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
}
ndisc_update(dev, neigh, lladdr,
- msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
+ new_state,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
(msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
@@ -1170,6 +1234,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
struct neighbour *neigh = NULL;
struct inet6_dev *in6_dev;
struct fib6_info *rt = NULL;
+ u32 defrtr_usr_metric;
struct net *net;
int lifetime;
struct ndisc_options ndopts;
@@ -1300,18 +1365,24 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
}
- if (rt && lifetime == 0) {
- ip6_del_rt(net, rt);
+ /* Set default route metric as specified by user */
+ defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
+ /* delete the route if lifetime is 0 or if metric needs change */
+ if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
+ ip6_del_rt(net, rt, false);
rt = NULL;
}
- ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n",
- rt, lifetime, skb->dev->name);
+ ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n",
+ rt, lifetime, defrtr_usr_metric, skb->dev->name);
if (!rt && lifetime) {
ND_PRINTK(3, info, "RA: adding default router\n");
+ if (neigh)
+ neigh_release(neigh);
+
rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
- skb->dev, pref);
+ skb->dev, pref, defrtr_usr_metric);
if (!rt) {
ND_PRINTK(0, err,
"RA: %s failed to add default route\n",
@@ -1330,8 +1401,12 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
neigh->flags |= NTF_ROUTER;
- } else if (rt) {
+ } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
+ struct nl_info nlinfo = {
+ .nl_net = net,
+ };
rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
}
if (rt)
@@ -1358,8 +1433,8 @@ skip_defrtr:
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
- if (rtime < HZ/10)
- rtime = HZ/10;
+ if (rtime < HZ/100)
+ rtime = HZ/100;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
@@ -1384,12 +1459,6 @@ skip_defrtr:
}
}
- /*
- * Send a notify if RA changed managed/otherconf flags or timer settings
- */
- if (send_ifinfo_notify)
- inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
-
skip_linkparms:
/*
@@ -1489,6 +1558,11 @@ skip_routeinfo:
memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
mtu = ntohl(n);
+ if (in6_dev->ra_mtu != mtu) {
+ in6_dev->ra_mtu = mtu;
+ send_ifinfo_notify = true;
+ }
+
if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
} else if (in6_dev->cnf.mtu6 != mtu) {
@@ -1512,6 +1586,12 @@ skip_routeinfo:
ND_PRINTK(2, warn, "RA: invalid RA options\n");
}
out:
+ /* Send a notify if RA changed managed/otherconf flags or
+ * timer settings or ra_mtu value
+ */
+ if (send_ifinfo_notify)
+ inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+
fib6_info_release(rt);
if (neigh)
neigh_release(neigh);
@@ -1705,6 +1785,11 @@ static void pndisc_redo(struct sk_buff *skb)
kfree_skb(skb);
}
+static int ndisc_is_multicast(const void *pkey)
+{
+ return ipv6_addr_is_multicast((struct in6_addr *)pkey);
+}
+
static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
{
struct inet6_dev *idev = __in6_dev_get(skb->dev);
@@ -1777,12 +1862,13 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
struct netdev_notifier_change_info *change_info;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
+ bool evict_nocarrier;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&nd_tbl, dev);
fib6_run_gc(0, net, false);
- /* fallthrough */
+ fallthrough;
case NETDEV_UP:
idev = in6_dev_get(dev);
if (!idev)
@@ -1793,10 +1879,19 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
in6_dev_put(idev);
break;
case NETDEV_CHANGE:
+ idev = in6_dev_get(dev);
+ if (!idev)
+ evict_nocarrier = true;
+ else {
+ evict_nocarrier = idev->cnf.ndisc_evict_nocarrier &&
+ net->ipv6.devconf_all->ndisc_evict_nocarrier;
+ in6_dev_put(idev);
+ }
+
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&nd_tbl, dev);
- if (!netif_carrier_ok(dev))
+ if (evict_nocarrier && !netif_carrier_ok(dev))
neigh_carrier_down(&nd_tbl, dev);
break;
case NETDEV_DOWN:
@@ -1834,7 +1929,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
}
}
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct net_device *dev = ctl->extra1;
struct inet6_dev *idev;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 409e79b84a83..857713d7a38a 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -20,17 +20,18 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include "../bridge/br_private.h"
-int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
+int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct sock *sk = sk_to_full_sk(skb->sk);
+ struct sock *sk = sk_to_full_sk(sk_partial);
+ struct net_device *dev = skb_dst(skb)->dev;
+ struct flow_keys flkeys;
unsigned int hh_len;
struct dst_entry *dst;
int strict = (ipv6_addr_type(&iph->daddr) &
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
struct flowi6 fl6 = {
- .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if :
- strict ? skb_dst(skb)->dev->ifindex : 0,
+ .flowi6_l3mdev = l3mdev_master_ifindex(dev),
.flowi6_mark = skb->mark,
.flowi6_uid = sock_net_uid(net, sk),
.daddr = iph->daddr,
@@ -38,6 +39,12 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
};
int err;
+ if (sk && sk->sk_bound_dev_if)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ else if (strict)
+ fl6.flowi6_oif = dev->ifindex;
+
+ fib6_rules_early_flow_dissect(net, skb, &fl6, &flkeys);
dst = ip6_route_output(net, sk, &fl6);
err = dst->error;
if (err) {
@@ -84,7 +91,7 @@ static int nf_ip6_reroute(struct sk_buff *skb,
if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
!ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
skb->mark != rt_info->mark)
- return ip6_route_me_harder(entry->state.net, skb);
+ return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
}
return 0;
}
@@ -119,6 +126,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct sk_buff *))
{
int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ bool mono_delivery_time = skb->mono_delivery_time;
ktime_t tstamp = skb->tstamp;
struct ip6_frag_state state;
u8 *prevhdr, nexthdr = 0;
@@ -184,7 +192,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (iter.frag)
ip6_fraglist_prepare(skb, &iter);
- skb->tstamp = tstamp;
+ skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, data, skb);
if (err || !iter.frag)
break;
@@ -217,7 +225,7 @@ slow_path:
goto blackhole;
}
- skb2->tstamp = tstamp;
+ skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
err = output(net, sk, data, skb2);
if (err)
goto blackhole;
@@ -245,9 +253,6 @@ static const struct nf_ipv6_ops ipv6ops = {
.route_input = ip6_route_input,
.fragment = ip6_fragment,
.reroute = nf_ip6_reroute,
-#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
- .br_defrag = nf_ct_frag6_gather,
-#endif
#if IS_MODULE(CONFIG_IPV6)
.br_fragment = br_ip6_fragment,
#endif
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 0594131fa46d..0ba62f4868f9 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -47,14 +47,6 @@ config NFT_FIB_IPV6
endif # NF_TABLES_IPV6
endif # NF_TABLES
-config NF_FLOW_TABLE_IPV6
- tristate "Netfilter flow table IPv6 module"
- depends on NF_FLOW_TABLE
- help
- This option adds the flow table IPv6 support.
-
- To compile it as a module, choose M here.
-
config NF_DUP_IPV6
tristate "Netfilter IPv6 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
@@ -69,7 +61,10 @@ config NF_REJECT_IPV6
config NF_LOG_IPV6
tristate "IPv6 packet logging"
default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
+ select NF_LOG_SYSLOG
+ help
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
config IP6_NF_IPTABLES
tristate "IP6 tables support (required for filtering)"
@@ -127,7 +122,7 @@ config IP6_NF_MATCH_HL
tristate '"hl" hoplimit match support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MATCH_HL
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MATCH_HL.
@@ -153,7 +148,7 @@ config IP6_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
depends on NETFILTER_ADVANCED
depends on IP6_NF_MANGLE || IP6_NF_RAW
- ---help---
+ help
This option allows you to match packets whose replies would
go out via the interface the packet came in.
@@ -183,7 +178,7 @@ config IP6_NF_TARGET_HL
tristate '"HL" hoplimit target support'
depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
select NETFILTER_XT_TARGET_HL
- ---help---
+ help
This is a backwards-compatible option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_HL.
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 731a74c60dca..b8d6dc9aeeb6 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -18,9 +18,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
-# logging
-obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
-
# reject
obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
@@ -31,9 +28,6 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
-# flow table support
-obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o
-
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c973ace208c5..2d816277f2c5 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -51,7 +51,7 @@ ip6_packet_match(const struct sk_buff *skb,
const char *outdev,
const struct ip6t_ip6 *ip6info,
unsigned int *protoff,
- int *fragoff, bool *hotdrop)
+ u16 *fragoff, bool *hotdrop)
{
unsigned long ret;
const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
@@ -247,10 +247,10 @@ ip6t_next_entry(const struct ip6t_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
-ip6t_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct xt_table *table)
+ip6t_do_table(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
/* Initializing verdict to NF_DROP keeps gcc happy. */
@@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb,
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
+ acpar.fragoff = 0;
acpar.hotdrop = false;
acpar.state = state;
@@ -884,7 +885,7 @@ copy_entries_to_user(unsigned int total_size,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -960,8 +961,7 @@ static int compat_table_info(const struct xt_table_info *info,
}
#endif
-static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+static int get_info(struct net *net, void __user *user, const int *len)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -974,18 +974,18 @@ static int get_info(struct net *net, void __user *user,
return -EFAULT;
name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_lock(AF_INET6);
#endif
t = xt_request_find_table_lock(net, AF_INET6, name);
if (!IS_ERR(t)) {
struct ip6t_getinfo info;
const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct xt_table_info tmp;
- if (compat) {
+ if (in_compat_syscall()) {
ret = compat_table_info(private, &tmp);
xt_compat_flush_offsets(AF_INET6);
private = &tmp;
@@ -1010,8 +1010,8 @@ static int get_info(struct net *net, void __user *user,
module_put(t->me);
} else
ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_unlock(AF_INET6);
#endif
return ret;
@@ -1120,7 +1120,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
}
static int
-do_replace(struct net *net, const void __user *user, unsigned int len)
+do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct ip6t_replace tmp;
@@ -1128,7 +1128,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
void *loc_cpu_entry;
struct ip6t_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1144,8 +1144,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
@@ -1169,8 +1169,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
}
static int
-do_add_counters(struct net *net, const void __user *user, unsigned int len,
- int compat)
+do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
unsigned int i;
struct xt_counters_info tmp;
@@ -1181,7 +1180,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
struct ip6t_entry *iter;
unsigned int addend;
- paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+ paddc = xt_copy_counters(arg, len, &tmp);
if (IS_ERR(paddc))
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET6, tmp.name);
@@ -1217,7 +1216,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_ip6t_replace {
char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
@@ -1227,7 +1226,7 @@ struct compat_ip6t_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ip6t_entry entries[0];
+ struct compat_ip6t_entry entries[];
};
static int
@@ -1445,6 +1444,8 @@ translate_compat_table(struct net *net,
if (!newinfo)
goto out_unlock;
+ memset(newinfo->entries, 0, size);
+
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = compatr->hook_entry[i];
@@ -1495,7 +1496,7 @@ out_unlock:
}
static int
-compat_do_replace(struct net *net, void __user *user, unsigned int len)
+compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct compat_ip6t_replace tmp;
@@ -1503,7 +1504,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
void *loc_cpu_entry;
struct ip6t_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1519,8 +1520,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
@@ -1543,35 +1544,10 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}
-static int
-compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
- unsigned int len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IP6T_SO_SET_REPLACE:
- ret = compat_do_replace(sock_net(sk), user, len);
- break;
-
- case IP6T_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 1);
- break;
-
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
struct compat_ip6t_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ip6t_entry entrytable[0];
+ struct compat_ip6t_entry entrytable[];
};
static int
@@ -1643,33 +1619,10 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
xt_compat_unlock(AF_INET6);
return ret;
}
-
-static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *);
-
-static int
-compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IP6T_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 1);
- break;
- case IP6T_SO_GET_ENTRIES:
- ret = compat_get_entries(sock_net(sk), user, len);
- break;
- default:
- ret = do_ip6t_get_ctl(sk, cmd, user, len);
- }
- return ret;
-}
#endif
static int
-do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
int ret;
@@ -1678,11 +1631,16 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
switch (cmd) {
case IP6T_SO_SET_REPLACE:
- ret = do_replace(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case IP6T_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 0);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
@@ -1702,11 +1660,16 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
switch (cmd) {
case IP6T_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 0);
+ ret = get_info(sock_net(sk), user, len);
break;
case IP6T_SO_GET_ENTRIES:
- ret = get_entries(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
case IP6T_SO_GET_REVISION_MATCH:
@@ -1763,10 +1726,11 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
int ip6t_register_table(struct net *net, const struct xt_table *table,
const struct ip6t_replace *repl,
- const struct nf_hook_ops *ops,
- struct xt_table **res)
+ const struct nf_hook_ops *template_ops)
{
- int ret;
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
@@ -1780,39 +1744,62 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
- if (ret != 0)
- goto out_free;
+ if (ret != 0) {
+ xt_free_table_info(newinfo);
+ return ret;
+ }
new_table = xt_register_table(net, table, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
- ret = PTR_ERR(new_table);
- goto out_free;
+ xt_free_table_info(newinfo);
+ return PTR_ERR(new_table);
}
- /* set res now, will see skbs right after nf_register_net_hooks */
- WRITE_ONCE(*res, new_table);
- if (!ops)
+ if (!template_ops)
return 0;
- ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
- if (ret != 0) {
- __ip6t_unregister_table(net, new_table);
- *res = NULL;
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
}
+ ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
+
+ new_table->ops = ops;
+
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
+
return ret;
out_free:
- xt_free_table_info(newinfo);
+ __ip6t_unregister_table(net, new_table);
return ret;
}
-void ip6t_unregister_table(struct net *net, struct xt_table *table,
- const struct nf_hook_ops *ops)
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name)
+{
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
+
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+
+void ip6t_unregister_table_exit(struct net *net, const char *name)
{
- if (ops)
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __ip6t_unregister_table(net, table);
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
+
+ if (table)
+ __ip6t_unregister_table(net, table);
}
/* Returns 1 if the type and code is matched by the range, 0 otherwise */
@@ -1867,7 +1854,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
.name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_IPV6,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(compat_int_t),
.compat_from_user = compat_standard_from_user,
.compat_to_user = compat_standard_to_user,
@@ -1886,15 +1873,9 @@ static struct nf_sockopt_ops ip6t_sockopts = {
.set_optmin = IP6T_BASE_CTL,
.set_optmax = IP6T_SO_SET_MAX+1,
.set = do_ip6t_set_ctl,
-#ifdef CONFIG_COMPAT
- .compat_set = compat_do_ip6t_set_ctl,
-#endif
.get_optmin = IP6T_BASE_CTL,
.get_optmax = IP6T_SO_GET_MAX+1,
.get = do_ip6t_get_ctl,
-#ifdef CONFIG_COMPAT
- .compat_get = compat_do_ip6t_get_ctl,
-#endif
.owner = THIS_MODULE,
};
@@ -1968,7 +1949,8 @@ static void __exit ip6_tables_fini(void)
}
EXPORT_SYMBOL(ip6t_register_table);
-EXPORT_SYMBOL(ip6t_unregister_table);
+EXPORT_SYMBOL(ip6t_unregister_table_pre_exit);
+EXPORT_SYMBOL(ip6t_unregister_table_exit);
EXPORT_SYMBOL(ip6t_do_table);
module_init(ip6_tables_init);
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 9ee077bf4f49..787c74aa85e3 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -77,16 +77,43 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
return true;
}
+static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb,
+ struct ipv6hdr *_bounced_hdr)
+{
+ if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ return NULL;
+
+ if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type))
+ return NULL;
+
+ return skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(struct icmp6hdr),
+ sizeof(struct ipv6hdr),
+ _bounced_hdr);
+}
+
static unsigned int
ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, saddr));
return NF_DROP;
}
+
+ /* rewrite dst addr of bounced packet which was sent to dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->daddr);
+ }
+
return XT_CONTINUE;
}
@@ -94,12 +121,24 @@ static unsigned int
ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, daddr));
return NF_DROP;
}
+
+ /* rewrite src addr of bounced packet which was sent from dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->saddr);
+ }
+
return XT_CONTINUE;
}
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 3ac5485049f0..a35019d2e480 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -61,7 +61,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
/* Do nothing */
break;
case IP6T_TCP_RESET:
- nf_send_reset6(net, skb, xt_hooknum(par));
+ nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par));
break;
case IP6T_ICMP6_POLICY_FAIL:
nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index fd1f52a21bf1..d51d0c3e5fe9 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -121,3 +121,4 @@ module_exit(synproxy_tg6_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies");
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 4e15a14435e4..70da2f2ce064 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -74,8 +74,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
ahinfo->hdrres, ah->reserved,
!(ahinfo->hdrres && ah->reserved));
- return (ah != NULL) &&
- spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ return spi_match(ahinfo->spis[0], ahinfo->spis[1],
ntohl(ah->spi),
!!(ahinfo->invflags & IP6T_AH_INV_SPI)) &&
(!ahinfo->hdrlen ||
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index fb91eeee4a1e..3aad6439386b 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -85,8 +85,7 @@ frag_mt6(const struct sk_buff *skb, struct xt_action_param *par)
!((fraginfo->flags & IP6T_FRAG_NMF) &&
(ntohs(fh->frag_off) & IP6_MF)));
- return (fh != NULL) &&
- id_match(fraginfo->ids[0], fraginfo->ids[1],
+ return id_match(fraginfo->ids[0], fraginfo->ids[1],
ntohl(fh->identification),
!!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) &&
!((fraginfo->flags & IP6T_FRAG_RES) &&
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 467b2a86031b..e7a3fb9355ee 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -86,8 +86,7 @@ hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
((optinfo->hdrlen == hdrlen) ^
!!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
- ret = (oh != NULL) &&
- (!(optinfo->flags & IP6T_OPTS_LEN) ||
+ ret = (!(optinfo->flags & IP6T_OPTS_LEN) ||
((optinfo->hdrlen == hdrlen) ^
!!(optinfo->invflags & IP6T_OPTS_INV_LEN)));
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index d800801a5dd2..a01d9b842bd0 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -37,8 +37,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
bool ret = false;
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev),
.flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
.flowi6_proto = iph->nexthdr,
+ .flowi6_uid = sock_net_uid(net, NULL),
.daddr = iph->saddr,
};
int lookup_flags;
@@ -55,9 +57,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
if (rpfilter_addr_linklocal(&iph->saddr)) {
lookup_flags |= RT6_LOOKUP_F_IFACE;
fl6.flowi6_oif = dev->ifindex;
- /* Set flowi6_oif for vrf devices to lookup route in l3mdev domain. */
- } else if (netif_is_l3_master(dev) || netif_is_l3_slave(dev) ||
- (flags & XT_RPFILTER_LOOSE) == 0)
+ } else if ((flags & XT_RPFILTER_LOOSE) == 0)
fl6.flowi6_oif = dev->ifindex;
rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
@@ -72,9 +72,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
goto out;
}
- if (rt->rt6i_idev->dev == dev ||
- l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex ||
- (flags & XT_RPFILTER_LOOSE))
+ if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE))
ret = true;
out:
ip6_rt_put(rt);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index f633dc84ca3f..4ad8b2032f1f 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -25,12 +25,7 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
static inline bool
segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
{
- bool r;
- pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n",
- invert ? '!' : ' ', min, id, max);
- r = (id >= min && id <= max) ^ invert;
- pr_debug(" result %s\n", r ? "PASS" : "FAILED");
- return r;
+ return (id >= min && id <= max) ^ invert;
}
static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
@@ -65,32 +60,7 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
- pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
- pr_debug("TYPE %04X ", rh->type);
- pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
-
- pr_debug("IPv6 RT segsleft %02X ",
- segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
- rh->segments_left,
- !!(rtinfo->invflags & IP6T_RT_INV_SGS)));
- pr_debug("type %02X %02X %02X ",
- rtinfo->rt_type, rh->type,
- (!(rtinfo->flags & IP6T_RT_TYP) ||
- ((rtinfo->rt_type == rh->type) ^
- !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
- pr_debug("len %02X %04X %02X ",
- rtinfo->hdrlen, hdrlen,
- !(rtinfo->flags & IP6T_RT_LEN) ||
- ((rtinfo->hdrlen == hdrlen) ^
- !!(rtinfo->invflags & IP6T_RT_INV_LEN)));
- pr_debug("res %02X %02X %02X ",
- rtinfo->flags & IP6T_RT_RES,
- ((const struct rt0_hdr *)rh)->reserved,
- !((rtinfo->flags & IP6T_RT_RES) &&
- (((const struct rt0_hdr *)rh)->reserved)));
-
- ret = (rh != NULL) &&
- (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+ ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
rh->segments_left,
!!(rtinfo->invflags & IP6T_RT_INV_SGS))) &&
(!(rtinfo->flags & IP6T_RT_LEN) ||
@@ -108,22 +78,22 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
reserved),
sizeof(_reserved),
&_reserved);
+ if (!rp) {
+ par->hotdrop = true;
+ return false;
+ }
ret = (*rp == 0);
}
- pr_debug("#%d ", rtinfo->addrnr);
if (!(rtinfo->flags & IP6T_RT_FST)) {
return ret;
} else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) {
- pr_debug("Not strict ");
if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
- pr_debug("There isn't enough space\n");
return false;
} else {
unsigned int i = 0;
- pr_debug("#%d ", rtinfo->addrnr);
for (temp = 0;
temp < (unsigned int)((hdrlen - 8) / 16);
temp++) {
@@ -139,26 +109,20 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
- if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
- pr_debug("i=%d temp=%d;\n", i, temp);
+ if (ipv6_addr_equal(ap, &rtinfo->addrs[i]))
i++;
- }
if (i == rtinfo->addrnr)
break;
}
- pr_debug("i=%d #%d\n", i, rtinfo->addrnr);
if (i == rtinfo->addrnr)
return ret;
else
return false;
}
} else {
- pr_debug("Strict ");
if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
- pr_debug("There isn't enough space\n");
return false;
} else {
- pr_debug("#%d ", rtinfo->addrnr);
for (temp = 0; temp < rtinfo->addrnr; temp++) {
ap = skb_header_pointer(skb,
ptr
@@ -174,7 +138,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
break;
}
- pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr);
if (temp == rtinfo->addrnr &&
temp == (unsigned int)((hdrlen - 8) / 16))
return ret;
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 32667f5d5a33..df785ebda0ca 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -19,39 +19,25 @@ MODULE_DESCRIPTION("ip6tables filter table");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
-static int __net_init ip6table_filter_table_init(struct net *net);
-
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_FILTER,
- .table_init = ip6table_filter_table_init,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ip6table_filter_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter);
-}
-
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
static bool forward = true;
module_param(forward, bool, 0000);
-static int __net_init ip6table_filter_table_init(struct net *net)
+static int ip6table_filter_table_init(struct net *net)
{
struct ip6t_replace *repl;
int err;
- if (net->ipv6.ip6table_filter)
- return 0;
-
repl = ip6t_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
@@ -59,44 +45,55 @@ static int __net_init ip6table_filter_table_init(struct net *net)
((struct ip6t_standard *)repl->entries)[1].target.verdict =
forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
- &net->ipv6.ip6table_filter);
+ err = ip6t_register_table(net, &packet_filter, repl, filter_ops);
kfree(repl);
return err;
}
static int __net_init ip6table_filter_net_init(struct net *net)
{
- if (net == &init_net || !forward)
+ if (!forward)
return ip6table_filter_table_init(net);
return 0;
}
+static void __net_exit ip6table_filter_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "filter");
+}
+
static void __net_exit ip6table_filter_net_exit(struct net *net)
{
- if (!net->ipv6.ip6table_filter)
- return;
- ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
- net->ipv6.ip6table_filter = NULL;
+ ip6t_unregister_table_exit(net, "filter");
}
static struct pernet_operations ip6table_filter_net_ops = {
.init = ip6table_filter_net_init,
+ .pre_exit = ip6table_filter_net_pre_exit,
.exit = ip6table_filter_net_exit,
};
static int __init ip6table_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ ip6table_filter_table_init);
- filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
- if (IS_ERR(filter_ops))
+ if (ret < 0)
+ return ret;
+
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table);
+ if (IS_ERR(filter_ops)) {
+ xt_unregister_template(&packet_filter);
return PTR_ERR(filter_ops);
+ }
ret = register_pernet_subsys(&ip6table_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
+ return ret;
+ }
return ret;
}
@@ -104,6 +101,7 @@ static int __init ip6table_filter_init(void)
static void __exit ip6table_filter_fini(void)
{
unregister_pernet_subsys(&ip6table_filter_net_ops);
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 070afb97fa2b..a88b2ce4a3cb 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -20,19 +20,16 @@ MODULE_DESCRIPTION("ip6tables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
-static int __net_init ip6table_mangle_table_init(struct net *net);
-
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_MANGLE,
- .table_init = ip6table_mangle_table_init,
};
static unsigned int
-ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
unsigned int ret;
struct in6_addr saddr, daddr;
@@ -49,7 +46,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* flowlabel and prio (includes version, which shouldn't change either */
flowlabel = *((u_int32_t *)ipv6_hdr(skb));
- ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
+ ret = ip6t_do_table(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
(!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) ||
@@ -57,7 +54,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
- err = ip6_route_me_harder(state->net, skb);
+ err = ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -71,66 +68,67 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
if (state->hook == NF_INET_LOCAL_OUT)
- return ip6t_mangle_out(skb, state);
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
+ return ip6t_mangle_out(priv, skb, state);
+ return ip6t_do_table(priv, skb, state);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_table_init(struct net *net)
+static int ip6table_mangle_table_init(struct net *net)
{
struct ip6t_replace *repl;
int ret;
- if (net->ipv6.ip6table_mangle)
- return 0;
-
repl = ip6t_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
- &net->ipv6.ip6table_mangle);
+ ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops);
kfree(repl);
return ret;
}
-static void __net_exit ip6table_mangle_net_exit(struct net *net)
+static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
{
- if (!net->ipv6.ip6table_mangle)
- return;
+ ip6t_unregister_table_pre_exit(net, "mangle");
+}
- ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
- net->ipv6.ip6table_mangle = NULL;
+static void __net_exit ip6table_mangle_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "mangle");
}
static struct pernet_operations ip6table_mangle_net_ops = {
+ .pre_exit = ip6table_mangle_net_pre_exit,
.exit = ip6table_mangle_net_exit,
};
static int __init ip6table_mangle_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_mangler,
+ ip6table_mangle_table_init);
+
+ if (ret < 0)
+ return ret;
mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
- if (IS_ERR(mangle_ops))
+ if (IS_ERR(mangle_ops)) {
+ xt_unregister_template(&packet_mangler);
return PTR_ERR(mangle_ops);
+ }
ret = register_pernet_subsys(&ip6table_mangle_net_ops);
if (ret < 0) {
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
return ret;
}
- ret = ip6table_mangle_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&ip6table_mangle_net_ops);
- kfree(mangle_ops);
- }
return ret;
}
static void __exit ip6table_mangle_fini(void)
{
unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 0f4875952efc..bf3cb3a13600 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -15,7 +15,11 @@
#include <net/netfilter/nf_nat.h>
-static int __net_init ip6table_nat_table_init(struct net *net);
+struct ip6table_nat_pernet {
+ struct nf_hook_ops *nf_nat_ops;
+};
+
+static unsigned int ip6table_nat_net_id __read_mostly;
static const struct xt_table nf_nat_ipv6_table = {
.name = "nat",
@@ -25,37 +29,29 @@ static const struct xt_table nf_nat_ipv6_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
- .table_init = ip6table_nat_table_init,
};
-static unsigned int ip6table_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
-}
-
static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_NAT_DST,
},
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_NAT_SRC,
},
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST,
},
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
@@ -64,85 +60,110 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
static int ip6t_nat_register_lookups(struct net *net)
{
+ struct ip6table_nat_pernet *xt_nat_net;
+ struct nf_hook_ops *ops;
+ struct xt_table *table;
int i, ret;
+ table = xt_find_table(net, NFPROTO_IPV6, "nat");
+ if (WARN_ON_ONCE(!table))
+ return -ENOENT;
+
+ xt_nat_net = net_generic(net, ip6table_nat_net_id);
+ ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) {
- ret = nf_nat_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]);
+ ops[i].priv = table;
+ ret = nf_nat_ipv6_register_fn(net, &ops[i]);
if (ret) {
while (i)
- nf_nat_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]);
+ nf_nat_ipv6_unregister_fn(net, &ops[--i]);
+ kfree(ops);
return ret;
}
}
+ xt_nat_net->nf_nat_ops = ops;
return 0;
}
static void ip6t_nat_unregister_lookups(struct net *net)
{
+ struct ip6table_nat_pernet *xt_nat_net = net_generic(net, ip6table_nat_net_id);
+ struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops;
int i;
+ if (!ops)
+ return;
+
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
- nf_nat_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]);
+ nf_nat_ipv6_unregister_fn(net, &ops[i]);
+
+ kfree(ops);
}
-static int __net_init ip6table_nat_table_init(struct net *net)
+static int ip6table_nat_table_init(struct net *net)
{
struct ip6t_replace *repl;
int ret;
- if (net->ipv6.ip6table_nat)
- return 0;
-
repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
if (repl == NULL)
return -ENOMEM;
ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
- NULL, &net->ipv6.ip6table_nat);
+ NULL);
if (ret < 0) {
kfree(repl);
return ret;
}
ret = ip6t_nat_register_lookups(net);
- if (ret < 0) {
- ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
- net->ipv6.ip6table_nat = NULL;
- }
+ if (ret < 0)
+ ip6t_unregister_table_exit(net, "nat");
+
kfree(repl);
return ret;
}
-static void __net_exit ip6table_nat_net_exit(struct net *net)
+static void __net_exit ip6table_nat_net_pre_exit(struct net *net)
{
- if (!net->ipv6.ip6table_nat)
- return;
ip6t_nat_unregister_lookups(net);
- ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
- net->ipv6.ip6table_nat = NULL;
+}
+
+static void __net_exit ip6table_nat_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "nat");
}
static struct pernet_operations ip6table_nat_net_ops = {
+ .pre_exit = ip6table_nat_net_pre_exit,
.exit = ip6table_nat_net_exit,
+ .id = &ip6table_nat_net_id,
+ .size = sizeof(struct ip6table_nat_pernet),
};
static int __init ip6table_nat_init(void)
{
- int ret = register_pernet_subsys(&ip6table_nat_net_ops);
+ int ret = xt_register_template(&nf_nat_ipv6_table,
+ ip6table_nat_table_init);
- if (ret)
+ if (ret < 0)
return ret;
- ret = ip6table_nat_table_init(&init_net);
+ ret = register_pernet_subsys(&ip6table_nat_net_ops);
if (ret)
- unregister_pernet_subsys(&ip6table_nat_net_ops);
+ xt_unregister_template(&nf_nat_ipv6_table);
+
return ret;
}
static void __exit ip6table_nat_exit(void)
{
unregister_pernet_subsys(&ip6table_nat_net_ops);
+ xt_unregister_template(&nf_nat_ipv6_table);
}
module_init(ip6table_nat_init);
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index a22100b1cf2c..08861d5d1f4d 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -11,8 +11,6 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-static int __net_init ip6table_raw_table_init(struct net *net);
-
static bool raw_before_defrag __read_mostly;
MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
module_param(raw_before_defrag, bool, 0000);
@@ -23,7 +21,6 @@ static const struct xt_table packet_raw = {
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_RAW,
- .table_init = ip6table_raw_table_init,
};
static const struct xt_table packet_raw_before_defrag = {
@@ -32,20 +29,11 @@ static const struct xt_table packet_raw_before_defrag = {
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG,
- .table_init = ip6table_raw_table_init,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ip6table_raw_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw);
-}
-
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init ip6table_raw_table_init(struct net *net)
+static int ip6table_raw_table_init(struct net *net)
{
struct ip6t_replace *repl;
const struct xt_table *table = &packet_raw;
@@ -54,63 +42,64 @@ static int __net_init ip6table_raw_table_init(struct net *net)
if (raw_before_defrag)
table = &packet_raw_before_defrag;
- if (net->ipv6.ip6table_raw)
- return 0;
-
repl = ip6t_alloc_initial_table(table);
if (repl == NULL)
return -ENOMEM;
- ret = ip6t_register_table(net, table, repl, rawtable_ops,
- &net->ipv6.ip6table_raw);
+ ret = ip6t_register_table(net, table, repl, rawtable_ops);
kfree(repl);
return ret;
}
+static void __net_exit ip6table_raw_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "raw");
+}
+
static void __net_exit ip6table_raw_net_exit(struct net *net)
{
- if (!net->ipv6.ip6table_raw)
- return;
- ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
- net->ipv6.ip6table_raw = NULL;
+ ip6t_unregister_table_exit(net, "raw");
}
static struct pernet_operations ip6table_raw_net_ops = {
+ .pre_exit = ip6table_raw_net_pre_exit,
.exit = ip6table_raw_net_exit,
};
static int __init ip6table_raw_init(void)
{
- int ret;
const struct xt_table *table = &packet_raw;
+ int ret;
if (raw_before_defrag) {
table = &packet_raw_before_defrag;
-
pr_info("Enabling raw table before defrag\n");
}
+ ret = xt_register_template(table, ip6table_raw_table_init);
+ if (ret < 0)
+ return ret;
+
/* Register hooks */
- rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook);
- if (IS_ERR(rawtable_ops))
+ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table);
+ if (IS_ERR(rawtable_ops)) {
+ xt_unregister_template(table);
return PTR_ERR(rawtable_ops);
+ }
ret = register_pernet_subsys(&ip6table_raw_net_ops);
if (ret < 0) {
kfree(rawtable_ops);
+ xt_unregister_template(table);
return ret;
}
- ret = ip6table_raw_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&ip6table_raw_net_ops);
- kfree(rawtable_ops);
- }
return ret;
}
static void __exit ip6table_raw_fini(void)
{
unregister_pernet_subsys(&ip6table_raw_net_ops);
+ xt_unregister_template(&packet_raw);
kfree(rawtable_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index a74335fe2bd9..4df14a9bae78 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -24,80 +24,72 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
-static int __net_init ip6table_security_table_init(struct net *net);
-
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_SECURITY,
- .table_init = ip6table_security_table_init,
};
-static unsigned int
-ip6table_security_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security);
-}
-
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init ip6table_security_table_init(struct net *net)
+static int ip6table_security_table_init(struct net *net)
{
struct ip6t_replace *repl;
int ret;
- if (net->ipv6.ip6table_security)
- return 0;
-
repl = ip6t_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
- &net->ipv6.ip6table_security);
+ ret = ip6t_register_table(net, &security_table, repl, sectbl_ops);
kfree(repl);
return ret;
}
+static void __net_exit ip6table_security_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "security");
+}
+
static void __net_exit ip6table_security_net_exit(struct net *net)
{
- if (!net->ipv6.ip6table_security)
- return;
- ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
- net->ipv6.ip6table_security = NULL;
+ ip6t_unregister_table_exit(net, "security");
}
static struct pernet_operations ip6table_security_net_ops = {
+ .pre_exit = ip6table_security_net_pre_exit,
.exit = ip6table_security_net_exit,
};
static int __init ip6table_security_init(void)
{
- int ret;
+ int ret = xt_register_template(&security_table,
+ ip6table_security_table_init);
+
+ if (ret < 0)
+ return ret;
- sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
- if (IS_ERR(sectbl_ops))
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table);
+ if (IS_ERR(sectbl_ops)) {
+ xt_unregister_template(&security_table);
return PTR_ERR(sectbl_ops);
+ }
ret = register_pernet_subsys(&ip6table_security_net_ops);
if (ret < 0) {
kfree(sectbl_ops);
+ xt_unregister_template(&security_table);
return ret;
}
- ret = ip6table_security_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&ip6table_security_net_ops);
- kfree(sectbl_ops);
- }
return ret;
}
static void __exit ip6table_security_fini(void)
{
unregister_pernet_subsys(&ip6table_security_net_ops);
+ xt_unregister_template(&security_table);
kfree(sectbl_ops);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fed9666a2f7d..38db0064d661 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -15,28 +15,13 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/jiffies.h>
#include <linux/net.h>
-#include <linux/list.h>
#include <linux/netdevice.h>
-#include <linux/in6.h>
#include <linux/ipv6.h>
-#include <linux/icmpv6.h>
-#include <linux/random.h>
#include <linux/slab.h>
-#include <net/sock.h>
-#include <net/snmp.h>
#include <net/ipv6_frag.h>
-#include <net/protocol.h>
-#include <net/transp_v6.h>
-#include <net/rawv6.h>
-#include <net/ndisc.h>
-#include <net/addrconf.h>
-#include <net/inet_ecn.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
@@ -44,11 +29,18 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netns/generic.h>
static const char nf_frags_cache_name[] = "nf-frags";
+static unsigned int nf_frag_pernet_id __read_mostly;
static struct inet_frags nf_frags;
+static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net)
+{
+ return net_generic(net, nf_frag_pernet_id);
+}
+
#ifdef CONFIG_SYSCTL
static struct ctl_table nf_ct_frag6_sysctl_table[] = {
@@ -75,6 +67,7 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
static int nf_ct_frag6_sysctl_register(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag;
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -86,18 +79,19 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
goto err_alloc;
}
- table[0].data = &net->nf_frag.fqdir->timeout;
- table[1].data = &net->nf_frag.fqdir->low_thresh;
- table[1].extra2 = &net->nf_frag.fqdir->high_thresh;
- table[2].data = &net->nf_frag.fqdir->high_thresh;
- table[2].extra1 = &net->nf_frag.fqdir->low_thresh;
- table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh;
+ nf_frag = nf_frag_pernet(net);
+
+ table[0].data = &nf_frag->fqdir->timeout;
+ table[1].data = &nf_frag->fqdir->low_thresh;
+ table[1].extra2 = &nf_frag->fqdir->high_thresh;
+ table[2].data = &nf_frag->fqdir->high_thresh;
+ table[2].extra1 = &nf_frag->fqdir->low_thresh;
hdr = register_net_sysctl(net, "net/netfilter", table);
if (hdr == NULL)
goto err_reg;
- net->nf_frag_frags_hdr = hdr;
+ nf_frag->nf_frag_frags_hdr = hdr;
return 0;
err_reg:
@@ -109,10 +103,11 @@ err_alloc:
static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
struct ctl_table *table;
- table = net->nf_frag_frags_hdr->ctl_table_arg;
- unregister_net_sysctl_table(net->nf_frag_frags_hdr);
+ table = nf_frag->nf_frag_frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr);
if (!net_eq(net, &init_net))
kfree(table);
}
@@ -149,6 +144,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
const struct ipv6hdr *hdr, int iif)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
@@ -158,7 +154,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
};
struct inet_frag_queue *q;
- q = inet_frag_find(net->nf_frag.fqdir, &key);
+ q = inet_frag_find(nf_frag->fqdir, &key);
if (!q)
return NULL;
@@ -267,6 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->iif = dev->ifindex;
fq->q.stamp = skb->tstamp;
+ fq->q.mono_delivery_time = skb->mono_delivery_time;
fq->q.meat += skb->len;
fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
@@ -355,6 +352,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
ipv6_hdr(skb)->payload_len = htons(payload_len);
ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
/* Yes, and fold redundant checksum back. 8) */
if (skb->ip_summed == CHECKSUM_COMPLETE)
@@ -439,6 +437,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
u16 savethdr = skb->transport_header;
+ u8 nexthdr = NEXTHDR_FRAGMENT;
int fhoff, nhoff, ret;
struct frag_hdr *fhdr;
struct frag_queue *fq;
@@ -454,6 +453,14 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
return 0;
+ /* Discard the first fragment if it does not include all headers
+ * RFC 8200, Section 4.5
+ */
+ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) {
+ pr_debug("Drop incomplete fragment\n");
+ return 0;
+ }
+
if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
return -ENOMEM;
@@ -485,37 +492,44 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
static int nf_ct_net_init(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
int res;
- res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net);
+ res = fqdir_init(&nf_frag->fqdir, &nf_frags, net);
if (res < 0)
return res;
- net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
- net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+ nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT;
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
- fqdir_exit(net->nf_frag.fqdir);
+ fqdir_exit(nf_frag->fqdir);
return res;
}
static void nf_ct_net_pre_exit(struct net *net)
{
- fqdir_pre_exit(net->nf_frag.fqdir);
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+
+ fqdir_pre_exit(nf_frag->fqdir);
}
static void nf_ct_net_exit(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+
nf_ct_frags6_sysctl_unregister(net);
- fqdir_exit(net->nf_frag.fqdir);
+ fqdir_exit(nf_frag->fqdir);
}
static struct pernet_operations nf_ct_net_ops = {
.init = nf_ct_net_init,
.pre_exit = nf_ct_net_pre_exit,
.exit = nf_ct_net_exit,
+ .id = &nf_frag_pernet_id,
+ .size = sizeof(struct nft_ct_frag6_pernet),
};
static const struct rhashtable_params nfct_rhash_params = {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 6646a87fb5dc..cb4eb1d2c620 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -89,10 +89,10 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = {
static void __net_exit defrag6_net_exit(struct net *net)
{
- if (net->nf.defrag_ipv6) {
+ if (net->nf.defrag_ipv6_users) {
nf_unregister_net_hooks(net, ipv6_defrag_ops,
ARRAY_SIZE(ipv6_defrag_ops));
- net->nf.defrag_ipv6 = false;
+ net->nf.defrag_ipv6_users = 0;
}
}
@@ -132,19 +132,21 @@ int nf_defrag_ipv6_enable(struct net *net)
{
int err = 0;
- might_sleep();
-
- if (net->nf.defrag_ipv6)
- return 0;
-
mutex_lock(&defrag6_mutex);
- if (net->nf.defrag_ipv6)
+ if (net->nf.defrag_ipv6_users == UINT_MAX) {
+ err = -EOVERFLOW;
+ goto out_unlock;
+ }
+
+ if (net->nf.defrag_ipv6_users) {
+ net->nf.defrag_ipv6_users++;
goto out_unlock;
+ }
err = nf_register_net_hooks(net, ipv6_defrag_ops,
ARRAY_SIZE(ipv6_defrag_ops));
if (err == 0)
- net->nf.defrag_ipv6 = true;
+ net->nf.defrag_ipv6_users = 1;
out_unlock:
mutex_unlock(&defrag6_mutex);
@@ -152,6 +154,19 @@ int nf_defrag_ipv6_enable(struct net *net)
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+void nf_defrag_ipv6_disable(struct net *net)
+{
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6_users) {
+ net->nf.defrag_ipv6_users--;
+ if (net->nf.defrag_ipv6_users == 0)
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ }
+ mutex_unlock(&defrag6_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable);
+
module_init(nf_defrag_init);
module_exit(nf_defrag_fini);
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
deleted file mode 100644
index a8566ee12e83..000000000000
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_tables.h>
-
-static struct nf_flowtable_type flowtable_ipv6 = {
- .family = NFPROTO_IPV6,
- .init = nf_flow_table_init,
- .setup = nf_flow_table_offload_setup,
- .action = nf_flow_rule_route_ipv6,
- .free = nf_flow_table_free,
- .hook = nf_flow_offload_ipv6_hook,
- .owner = THIS_MODULE,
-};
-
-static int __init nf_flow_ipv6_module_init(void)
-{
- nft_register_flowtable_type(&flowtable_ipv6);
-
- return 0;
-}
-
-static void __exit nf_flow_ipv6_module_exit(void)
-{
- nft_unregister_flowtable_type(&flowtable_ipv6);
-}
-
-module_init(nf_flow_ipv6_module_init);
-module_exit(nf_flow_ipv6_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
deleted file mode 100644
index 22b80db6d882..000000000000
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ /dev/null
@@ -1,425 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/ipv6.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-static const struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_DEFAULT_MASK,
- },
- },
-};
-
-/* One level of recursion won't kill us */
-static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb, unsigned int ip6hoff,
- int recurse)
-{
- u_int8_t currenthdr;
- int fragment;
- struct ipv6hdr _ip6h;
- const struct ipv6hdr *ih;
- unsigned int ptr;
- unsigned int hdrlen = 0;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_DEFAULT_MASK;
-
- ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
- if (ih == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
- nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
-
- /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
- nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
- ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
- (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
- ih->hop_limit,
- (ntohl(*(__be32 *)ih) & 0x000fffff));
-
- fragment = 0;
- ptr = ip6hoff + sizeof(struct ipv6hdr);
- currenthdr = ih->nexthdr;
- while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) {
- struct ipv6_opt_hdr _hdr;
- const struct ipv6_opt_hdr *hp;
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 48 "OPT (...) " */
- if (logflags & NF_LOG_IPOPT)
- nf_log_buf_add(m, "OPT ( ");
-
- switch (currenthdr) {
- case IPPROTO_FRAGMENT: {
- struct frag_hdr _fhdr;
- const struct frag_hdr *fh;
-
- nf_log_buf_add(m, "FRAG:");
- fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
- &_fhdr);
- if (fh == NULL) {
- nf_log_buf_add(m, "TRUNCATED ");
- return;
- }
-
- /* Max length: 6 "65535 " */
- nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
-
- /* Max length: 11 "INCOMPLETE " */
- if (fh->frag_off & htons(0x0001))
- nf_log_buf_add(m, "INCOMPLETE ");
-
- nf_log_buf_add(m, "ID:%08x ",
- ntohl(fh->identification));
-
- if (ntohs(fh->frag_off) & 0xFFF8)
- fragment = 1;
-
- hdrlen = 8;
-
- break;
- }
- case IPPROTO_DSTOPTS:
- case IPPROTO_ROUTING:
- case IPPROTO_HOPOPTS:
- if (fragment) {
- if (logflags & NF_LOG_IPOPT)
- nf_log_buf_add(m, ")");
- return;
- }
- hdrlen = ipv6_optlen(hp);
- break;
- /* Max Length */
- case IPPROTO_AH:
- if (logflags & NF_LOG_IPOPT) {
- struct ip_auth_hdr _ahdr;
- const struct ip_auth_hdr *ah;
-
- /* Max length: 3 "AH " */
- nf_log_buf_add(m, "AH ");
-
- if (fragment) {
- nf_log_buf_add(m, ")");
- return;
- }
-
- ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
- &_ahdr);
- if (ah == NULL) {
- /*
- * Max length: 26 "INCOMPLETE [65535
- * bytes] )"
- */
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
- skb->len - ptr);
- return;
- }
-
- /* Length: 15 "SPI=0xF1234567 */
- nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
-
- }
-
- hdrlen = ipv6_authlen(hp);
- break;
- case IPPROTO_ESP:
- if (logflags & NF_LOG_IPOPT) {
- struct ip_esp_hdr _esph;
- const struct ip_esp_hdr *eh;
-
- /* Max length: 4 "ESP " */
- nf_log_buf_add(m, "ESP ");
-
- if (fragment) {
- nf_log_buf_add(m, ")");
- return;
- }
-
- /*
- * Max length: 26 "INCOMPLETE [65535 bytes] )"
- */
- eh = skb_header_pointer(skb, ptr, sizeof(_esph),
- &_esph);
- if (eh == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
- skb->len - ptr);
- return;
- }
-
- /* Length: 16 "SPI=0xF1234567 )" */
- nf_log_buf_add(m, "SPI=0x%x )",
- ntohl(eh->spi));
- }
- return;
- default:
- /* Max length: 20 "Unknown Ext Hdr 255" */
- nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
- return;
- }
- if (logflags & NF_LOG_IPOPT)
- nf_log_buf_add(m, ") ");
-
- currenthdr = hp->nexthdr;
- ptr += hdrlen;
- }
-
- switch (currenthdr) {
- case IPPROTO_TCP:
- if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
- ptr, logflags))
- return;
- break;
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
- return;
- break;
- case IPPROTO_ICMPV6: {
- struct icmp6hdr _icmp6h;
- const struct icmp6hdr *ic;
-
- /* Max length: 13 "PROTO=ICMPv6 " */
- nf_log_buf_add(m, "PROTO=ICMPv6 ");
-
- if (fragment)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
- if (ic == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - ptr);
- return;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- nf_log_buf_add(m, "TYPE=%u CODE=%u ",
- ic->icmp6_type, ic->icmp6_code);
-
- switch (ic->icmp6_type) {
- case ICMPV6_ECHO_REQUEST:
- case ICMPV6_ECHO_REPLY:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- nf_log_buf_add(m, "ID=%u SEQ=%u ",
- ntohs(ic->icmp6_identifier),
- ntohs(ic->icmp6_sequence));
- break;
- case ICMPV6_MGM_QUERY:
- case ICMPV6_MGM_REPORT:
- case ICMPV6_MGM_REDUCTION:
- break;
-
- case ICMPV6_PARAMPROB:
- /* Max length: 17 "POINTER=ffffffff " */
- nf_log_buf_add(m, "POINTER=%08x ",
- ntohl(ic->icmp6_pointer));
- /* Fall through */
- case ICMPV6_DEST_UNREACH:
- case ICMPV6_PKT_TOOBIG:
- case ICMPV6_TIME_EXCEED:
- /* Max length: 3+maxlen */
- if (recurse) {
- nf_log_buf_add(m, "[");
- dump_ipv6_packet(net, m, info, skb,
- ptr + sizeof(_icmp6h), 0);
- nf_log_buf_add(m, "] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
- nf_log_buf_add(m, "MTU=%u ",
- ntohl(ic->icmp6_mtu));
- }
- }
- break;
- }
- /* Max length: 10 "PROTO=255 " */
- default:
- nf_log_buf_add(m, "PROTO=%u ", currenthdr);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & NF_LOG_UID) && recurse)
- nf_log_dump_sk_uid_gid(net, m, skb->sk);
-
- /* Max length: 16 "MARK=0xFFFFFFFF " */
- if (recurse && skb->mark)
- nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
-}
-
-static void dump_ipv6_mac_header(struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- unsigned int logflags = 0;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
-
- if (!(logflags & NF_LOG_MACDECODE))
- goto fallback;
-
- switch (dev->type) {
- case ARPHRD_ETHER:
- nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
- ntohs(eth_hdr(skb)->h_proto));
- return;
- default:
- break;
- }
-
-fallback:
- nf_log_buf_add(m, "MAC=");
- if (dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- const unsigned char *p = skb_mac_header(skb);
- unsigned int len = dev->hard_header_len;
- unsigned int i;
-
- if (dev->type == ARPHRD_SIT) {
- p -= ETH_HLEN;
-
- if (p < skb->head)
- p = NULL;
- }
-
- if (p != NULL) {
- nf_log_buf_add(m, "%02x", *p++);
- for (i = 1; i < len; i++)
- nf_log_buf_add(m, ":%02x", *p++);
- }
- nf_log_buf_add(m, " ");
-
- if (dev->type == ARPHRD_SIT) {
- const struct iphdr *iph =
- (struct iphdr *)skb_mac_header(skb);
- nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr,
- &iph->daddr);
- }
- } else {
- nf_log_buf_add(m, " ");
- }
-}
-
-static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- struct nf_log_buf *m;
-
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
- return;
-
- m = nf_log_buf_open();
-
- if (!loginfo)
- loginfo = &default_loginfo;
-
- nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
- loginfo, prefix);
-
- if (in != NULL)
- dump_ipv6_mac_header(m, loginfo, skb);
-
- dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
-
- nf_log_buf_close(m);
-}
-
-static struct nf_logger nf_ip6_logger __read_mostly = {
- .name = "nf_log_ipv6",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_ip6_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_ipv6_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
-}
-
-static void __net_exit nf_log_ipv6_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_ip6_logger);
-}
-
-static struct pernet_operations nf_log_ipv6_net_ops = {
- .init = nf_log_ipv6_net_init,
- .exit = nf_log_ipv6_net_exit,
-};
-
-static int __init nf_log_ipv6_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&nf_log_ipv6_net_ops);
- if (ret < 0)
- return ret;
-
- ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
- if (ret < 0) {
- pr_err("failed to register logger\n");
- goto err1;
- }
-
- return 0;
-
-err1:
- unregister_pernet_subsys(&nf_log_ipv6_net_ops);
- return ret;
-}
-
-static void __exit nf_log_ipv6_exit(void)
-{
- unregister_pernet_subsys(&nf_log_ipv6_net_ops);
- nf_log_unregister(&nf_ip6_logger);
-}
-
-module_init(nf_log_ipv6_init);
-module_exit(nf_log_ipv6_exit);
-
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter IPv6 packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 5fae66f66671..f61d4f18e1cf 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -12,6 +12,140 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
+static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto = ip6h->nexthdr;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ if (ip6h->payload_len &&
+ pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ if (!nf_reject_verify_csum(skb, thoff, proto))
+ return true;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+static int nf_reject_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+ unsigned int otcplen;
+ struct ipv6hdr *nip6h;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+ net->ipv6.devconf_all->hop_limit);
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset);
+
+struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *nip6h;
+ struct icmp6hdr *icmp6h;
+ unsigned int len;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ /* Include "As much of invoking packet as possible without the ICMPv6
+ * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
+ */
+ len = min_t(unsigned int, 1220, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (!nf_reject_v6_csum_ok(oldskb, hook))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
+ net->ipv6.devconf_all->hop_limit);
+
+ skb_reset_transport_header(nskb);
+ icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
+ icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
+ icmp6h->icmp6_code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ icmp6h->icmp6_cksum =
+ csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
+ nskb->len - sizeof(struct ipv6hdr),
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ nskb->len - sizeof(struct ipv6hdr),
+ 0));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);
+
const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
struct tcphdr *otcph,
unsigned int *otcplen, int hook)
@@ -126,7 +260,23 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
}
EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
-void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
+static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(struct flowi));
+ fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr;
+ nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false);
+ if (!dst)
+ return -1;
+
+ skb_dst_set(skb_in, dst);
+ return 0;
+}
+
+void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
{
struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
@@ -154,9 +304,17 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.daddr = oip6h->saddr;
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
+
+ if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) {
+ nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+ if (!dst)
+ return;
+ skb_dst_set(oldskb, dst);
+ }
+
fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
- security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
@@ -210,7 +368,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip6_local_out(net, nskb->sk, nskb);
+ ip6_local_out(net, sk, nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset6);
@@ -230,7 +388,7 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook)
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
- if (!nf_reject_verify_csum(proto))
+ if (!nf_reject_verify_csum(skb, thoff, proto))
return true;
return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
@@ -245,6 +403,10 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
skb_in->dev = net->loopback_dev;
+ if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) &&
+ nf_reject6_fill_skb_dst(skb_in) < 0)
+ return;
+
icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach6);
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index b9df879c48d3..a7690ec62325 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -83,8 +83,8 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet6_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
+ return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo,
+ skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp6_lib_lookup(net, saddr, sport, daddr, dport,
@@ -97,9 +97,9 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
const struct net_device *indev)
{
- __be16 uninitialized_var(dport), uninitialized_var(sport);
+ __be16 dport, sport;
const struct in6_addr *daddr = NULL, *saddr = NULL;
- struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
struct sk_buff *data_skb = NULL;
int doff = 0;
int thoff = 0, tproto;
@@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
thoff + sizeof(*hp);
} else if (tproto == IPPROTO_ICMPV6) {
- struct ipv6hdr ipv6_var;
-
if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
&sport, &dport, &ipv6_var))
return NULL;
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index 6bac68fb27a3..929502e51203 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -80,6 +80,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -93,7 +94,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+ sk = inet6_lookup_listener(net, hinfo, skb,
thoff + __tcp_hdrlen(hp),
saddr, sport,
daddr, ntohs(dport),
@@ -108,9 +109,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, ntohs(dport),
- in->ifindex, 0);
+ sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr,
+ ntohs(dport), in->ifindex, 0);
break;
default:
BUG();
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index 2af32200507d..70a405b4006f 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -13,8 +13,8 @@
#include <net/netfilter/ipv6/nf_dup_ipv6.h>
struct nft_dup_ipv6 {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_dev:8;
+ u8 sreg_addr;
+ u8 sreg_dev;
};
static void nft_dup_ipv6_eval(const struct nft_expr *expr,
@@ -38,16 +38,16 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;
- priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr));
+ err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in6_addr));
if (err < 0)
return err;
- if (tb[NFTA_DUP_SREG_DEV] != NULL) {
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
- }
- return 0;
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
}
static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -73,6 +73,7 @@ static const struct nft_expr_ops nft_dup_ipv6_ops = {
.eval = nft_dup_ipv6_eval,
.init = nft_dup_ipv6_init,
.dump = nft_dup_ipv6_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = {
@@ -105,3 +106,4 @@ module_exit(nft_dup_ipv6_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup");
+MODULE_DESCRIPTION("IPv6 nftables packet duplication support");
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 7ece86afd079..36dc14b34388 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -30,6 +30,10 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
fl6->daddr = iph->daddr;
fl6->saddr = iph->saddr;
} else {
+ if (nft_hook(pkt) == NF_INET_FORWARD &&
+ priv->flags & NFTA_FIB_F_IIF)
+ fl6->flowi6_iif = nft_out(pkt)->ifindex;
+
fl6->daddr = iph->saddr;
fl6->saddr = iph->daddr;
}
@@ -37,6 +41,8 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) {
lookup_flags |= RT6_LOOKUP_F_IFACE;
fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev);
+ } else if (priv->flags & NFTA_FIB_F_IIF) {
+ fl6->flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev);
}
if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST)
@@ -60,6 +66,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
};
u32 ret = 0;
@@ -135,6 +142,17 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
}
EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
+static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const struct ipv6hdr *iph)
+{
+ if (likely(next != IPPROTO_ICMPV6))
+ return false;
+
+ if (ipv6_addr_type(&iph->saddr) != IPV6_ADDR_ANY)
+ return false;
+
+ return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL;
+}
+
void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
@@ -146,6 +164,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
};
struct rt6_info *rt;
int lookup_flags;
@@ -163,10 +182,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
- if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
- nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
- nft_fib_store_result(dest, priv, nft_in(pkt));
- return;
+ if (nft_hook(pkt) == NF_INET_PRE_ROUTING ||
+ nft_hook(pkt) == NF_INET_INGRESS) {
+ if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) ||
+ nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
}
*dest = 0;
@@ -179,7 +201,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
goto put_rt_err;
- if (oif && oif != rt->rt6i_idev->dev)
+ if (oif && oif != rt->rt6i_idev->dev &&
+ l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex)
goto put_rt_err;
nft_fib_store_result(dest, priv, rt->rt6i_idev->dev);
@@ -197,6 +220,7 @@ static const struct nft_expr_ops nft_fib6_type_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops nft_fib6_ops = {
@@ -206,6 +230,7 @@ static const struct nft_expr_ops nft_fib6_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops *
@@ -255,3 +280,4 @@ module_exit(nft_fib6_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_ALIAS_NFT_AF_EXPR(10, "fib");
+MODULE_DESCRIPTION("nftables fib / ipv6 route lookup support");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 680a28ce29fd..5c61294f410e 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -28,7 +28,8 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb,
+ nft_hook(pkt));
break;
default:
break;
@@ -45,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = {
.init = nft_reject_init,
.dump = nft_reject_dump,
.validate = nft_reject_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
@@ -72,3 +74,4 @@ module_exit(nft_reject_ipv6_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject");
+MODULE_DESCRIPTION("IPv6 packet rejection for nftables");
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index af36acc1a644..2685c3f15e9d 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -15,29 +15,11 @@ static u32 __ipv6_select_ident(struct net *net,
const struct in6_addr *dst,
const struct in6_addr *src)
{
- const struct {
- struct in6_addr dst;
- struct in6_addr src;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .dst = *dst,
- .src = *src,
- };
- u32 hash, id;
-
- /* Note the following code is not safe, but this is okay. */
- if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
- get_random_bytes(&net->ipv4.ip_id_key,
- sizeof(net->ipv4.ip_id_key));
-
- hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key);
-
- /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
- * set the hight order instead thus minimizing possible future
- * collisions.
- */
- id = ip_idents_reserve(hash, 1);
- if (unlikely(!id))
- id = 1 << 31;
+ u32 id;
+
+ do {
+ id = get_random_u32();
+ } while (!id);
return id;
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 98ac32b49d8c..86c26e48d065 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -20,8 +20,14 @@
#include <net/udp.h>
#include <net/transp_v6.h>
#include <linux/proc_fs.h>
+#include <linux/bpf-cgroup.h>
#include <net/ping.h>
+static void ping_v6_destroy(struct sock *sk)
+{
+ inet6_destroy_sock(sk);
+}
+
/* Compatibility glue so we can support IPv6 when it's compiled as a module */
static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
int *addr_len)
@@ -44,6 +50,20 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
return 0;
}
+static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from __ip6_datagram_connect() and
+ * intended to prevent BPF program called below from accessing
+ * bytes that are out of the bound specified by user in addr_len.
+ */
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+}
+
static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
@@ -59,13 +79,13 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct pingfakehdr pfh;
struct ipcm6_cookie ipc6;
- pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
-
err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
sizeof(user_icmph));
if (err)
return err;
+ memset(&fl6, 0, sizeof(fl6));
+
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name);
if (msg->msg_namelen < sizeof(*u))
@@ -74,12 +94,15 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EAFNOSUPPORT;
}
daddr = &(u->sin6_addr);
+ if (np->sndflow)
+ fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK;
if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
oif = u->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
}
if (!oif)
@@ -99,21 +122,37 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
return -EINVAL;
- /* TODO: use ip6_datagram_send_ctl to get options from cmsg */
+ ipcm6_init_sk(&ipc6, np);
+ ipc6.sockc.tsflags = sk->sk_tsflags;
+ ipc6.sockc.mark = sk->sk_mark;
- memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = oif;
+
+ if (msg->msg_controllen) {
+ struct ipv6_txoptions opt = {};
+
+ opt.tot_len = sizeof(opt);
+ ipc6.opt = &opt;
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
+ if (err < 0)
+ return err;
+
+ /* Changes to txoptions and flow info are not implemented, yet.
+ * Drop the options.
+ */
+ ipc6.opt = NULL;
+ }
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.saddr = np->saddr;
fl6.daddr = *daddr;
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_mark = ipc6.sockc.mark;
fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- ipcm6_init_sk(&ipc6, np);
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
@@ -135,11 +174,12 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.wcheck = 0;
pfh.family = AF_INET6;
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
lock_sock(sk);
err = ip6_append_data(sk, ping_getfrag, &pfh, len,
- 0, &ipc6, &fl6, rt,
+ sizeof(struct icmp6hdr), &ipc6, &fl6, rt,
MSG_DONTWAIT);
if (err) {
@@ -165,6 +205,8 @@ struct proto pingv6_prot = {
.owner = THIS_MODULE,
.init = ping_init_sock,
.close = ping_close,
+ .destroy = ping_v6_destroy,
+ .pre_connect = ping_v6_pre_connect,
.connect = ip6_datagram_connect_v6_only,
.disconnect = __udp_disconnect,
.setsockopt = ipv6_setsockopt,
@@ -176,6 +218,7 @@ struct proto pingv6_prot = {
.hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
+ .put_port = ping_unhash,
.obj_size = sizeof(struct raw6_sock),
};
EXPORT_SYMBOL_GPL(pingv6_prot);
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index bbff3e02e302..d6306aa46bb1 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -126,6 +126,7 @@ static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS),
SNMP_MIB_SENTINEL
};
@@ -137,6 +138,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index dfe5e603ffe1..722de9dd0ff7 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -61,46 +61,30 @@
#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
-struct raw_hashinfo raw_v6_hashinfo = {
- .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
-};
+struct raw_hashinfo raw_v6_hashinfo;
EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
-struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
- unsigned short num, const struct in6_addr *loc_addr,
- const struct in6_addr *rmt_addr, int dif, int sdif)
+bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+ const struct in6_addr *loc_addr,
+ const struct in6_addr *rmt_addr, int dif, int sdif)
{
- bool is_multicast = ipv6_addr_is_multicast(loc_addr);
-
- sk_for_each_from(sk)
- if (inet_sk(sk)->inet_num == num) {
-
- if (!net_eq(sock_net(sk), net))
- continue;
-
- if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
- !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
- continue;
-
- if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
- dif, sdif))
- continue;
-
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
- goto found;
- if (is_multicast &&
- inet6_mc_check(sk, loc_addr, rmt_addr))
- goto found;
- continue;
- }
- goto found;
- }
- sk = NULL;
-found:
- return sk;
+ if (inet_sk(sk)->inet_num != num ||
+ !net_eq(sock_net(sk), net) ||
+ (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+ !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif))
+ return false;
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+ ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) ||
+ (ipv6_addr_is_multicast(loc_addr) &&
+ inet6_mc_check(sk, loc_addr, rmt_addr)))
+ return true;
+
+ return false;
}
-EXPORT_SYMBOL_GPL(__raw_v6_lookup);
+EXPORT_SYMBOL_GPL(raw_v6_match);
/*
* 0 - deliver
@@ -156,31 +140,27 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
*/
static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
+ struct net *net = dev_net(skb->dev);
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
const struct in6_addr *saddr;
const struct in6_addr *daddr;
struct sock *sk;
bool delivered = false;
__u8 hash;
- struct net *net;
saddr = &ipv6_hdr(skb)->saddr;
daddr = saddr + 1;
hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-
- read_lock(&raw_v6_hashinfo.lock);
- sk = sk_head(&raw_v6_hashinfo.ht[hash]);
-
- if (!sk)
- goto out;
-
- net = dev_net(skb->dev);
- sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr,
- inet6_iif(skb), inet6_sdif(skb));
-
- while (sk) {
+ hlist = &raw_v6_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_nulls_for_each(sk, hnode, hlist) {
int filtered;
+ if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
+ inet6_iif(skb), inet6_sdif(skb)))
+ continue;
delivered = true;
switch (nexthdr) {
case IPPROTO_ICMPV6:
@@ -219,23 +199,14 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
rawv6_rcv(sk, clone);
}
}
- sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
- inet6_iif(skb), inet6_sdif(skb));
}
-out:
- read_unlock(&raw_v6_hashinfo.lock);
+ rcu_read_unlock();
return delivered;
}
bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
{
- struct sock *raw_sk;
-
- raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
- if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
- raw_sk = NULL;
-
- return raw_sk != NULL;
+ return ipv6_raw_deliver(skb, nexthdr);
}
/* This cleans up af_inet6 a bit. -DaveM */
@@ -298,7 +269,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST) &&
- !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) {
+ !ipv6_can_nonlocal_bind(sock_net(sk), inet)) {
err = -EADDRNOTAVAIL;
if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
dev, 0)) {
@@ -354,37 +325,32 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
if (np->recverr || harderr) {
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
}
void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
u8 type, u8 code, int inner_offset, __be32 info)
{
+ struct net *net = dev_net(skb->dev);
+ struct hlist_nulls_head *hlist;
+ struct hlist_nulls_node *hnode;
struct sock *sk;
int hash;
- const struct in6_addr *saddr, *daddr;
- struct net *net;
hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-
- read_lock(&raw_v6_hashinfo.lock);
- sk = sk_head(&raw_v6_hashinfo.ht[hash]);
- if (sk) {
+ hlist = &raw_v6_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_nulls_for_each(sk, hnode, hlist) {
/* Note: ipv6_hdr(skb) != skb->data */
const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
- saddr = &ip6h->saddr;
- daddr = &ip6h->daddr;
- net = dev_net(skb->dev);
-
- while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
- inet6_iif(skb), inet6_iif(skb)))) {
- rawv6_err(sk, skb, NULL, type, code,
- inner_offset, info);
- sk = sk_next(sk);
- }
+
+ if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr,
+ inet6_iif(skb), inet6_iif(skb)))
+ continue;
+ rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
}
- read_unlock(&raw_v6_hashinfo.lock);
+ rcu_read_unlock();
}
static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -460,7 +426,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
*/
static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
@@ -477,7 +443,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (np->rxpmtu && np->rxopt.bits.rxpmtu)
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -512,7 +478,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
*addr_len = sizeof(*sin6);
}
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (np->rxopt.all)
ip6_datagram_recv_ctl(sk, msg, skb);
@@ -746,7 +712,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
skb->csum = csum_block_add(
skb->csum,
csum_partial_copy_nocheck(rfv->c + offset,
- to, copy, 0),
+ to, copy),
odd);
odd = 0;
@@ -915,7 +881,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_oif = np->mcast_oif;
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (hdrincl)
fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
@@ -972,13 +938,13 @@ do_confirm:
}
static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, int optlen)
{
switch (optname) {
case ICMPV6_FILTER:
if (optlen > sizeof(struct icmp6_filter))
optlen = sizeof(struct icmp6_filter);
- if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen))
+ if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen))
return -EFAULT;
return 0;
default:
@@ -1015,12 +981,15 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct raw6_sock *rp = raw6_sk(sk);
int val;
- if (get_user(val, (int __user *)optval))
+ if (optlen < sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
switch (optname) {
@@ -1062,7 +1031,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
}
static int rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
switch (level) {
case SOL_RAW:
@@ -1076,7 +1045,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return ipv6_setsockopt(sk, level, optname, optval, optlen);
}
@@ -1084,30 +1053,6 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- switch (level) {
- case SOL_RAW:
- break;
- case SOL_ICMPV6:
- if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
- return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
- case SOL_IPV6:
- if (optname == IPV6_CHECKSUM ||
- optname == IPV6_HDRINCL)
- break;
- /* fall through */
- default:
- return compat_ipv6_setsockopt(sk, level, optname,
- optval, optlen);
- }
- return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1161,7 +1106,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return ipv6_getsockopt(sk, level, optname, optval, optlen);
}
@@ -1169,30 +1114,6 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- switch (level) {
- case SOL_RAW:
- break;
- case SOL_ICMPV6:
- if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
- return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
- case SOL_IPV6:
- if (optname == IPV6_CHECKSUM ||
- optname == IPV6_HDRINCL)
- break;
- /* fall through */
- default:
- return compat_ipv6_getsockopt(sk, level, optname,
- optval, optlen);
- }
- return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1297,8 +1218,6 @@ struct proto rawv6_prot = {
.usersize = sizeof_field(struct raw6_sock, filter),
.h.raw_hash = &raw_v6_hashinfo,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_rawv6_setsockopt,
- .compat_getsockopt = compat_rawv6_getsockopt,
.compat_ioctl = compat_rawv6_ioctl,
#endif
.diag_destroy = raw_abort,
@@ -1377,8 +1296,7 @@ const struct proto_ops inet6_sockraw_ops = {
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
};
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 1f5d4d196dcc..ff866f2a879e 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -42,6 +42,8 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -192,6 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->iif = dev->ifindex;
fq->q.stamp = skb->tstamp;
+ fq->q.mono_delivery_time = skb->mono_delivery_time;
fq->q.meat += skb->len;
fq->ecn |= ecn;
add_frag_mem_limit(fq->q.fqdir, skb->truesize);
@@ -322,6 +325,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
struct frag_queue *fq;
const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct net *net = dev_net(skb_dst(skb)->dev);
+ u8 nexthdr;
int iif;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
@@ -340,7 +344,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
- if (!(fhdr->frag_off & htons(0xFFF9))) {
+ if (!(fhdr->frag_off & htons(IP6_OFFSET | IP6_MF))) {
/* It is not a fragmented frame */
skb->transport_header += sizeof(struct frag_hdr);
__IP6_INC_STATS(net,
@@ -348,9 +352,25 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(skb)->frag_max_size = ntohs(hdr->payload_len) +
+ sizeof(struct ipv6hdr);
return 1;
}
+ /* RFC 8200, Section 4.5 Fragment Header:
+ * If the first fragment does not include all headers through an
+ * Upper-Layer header, then that fragment should be discarded and
+ * an ICMP Parameter Problem, Code 3, message should be sent to
+ * the source of the fragment, with the Pointer field set to zero.
+ */
+ nexthdr = hdr->nexthdr;
+ if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) {
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0);
+ return -1;
+ }
+
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2931224b674e..2f355f0ec32a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -41,6 +41,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
@@ -61,6 +62,7 @@
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
+#include <linux/btf_ids.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -80,9 +82,11 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
-static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
-static unsigned int ip6_mtu(const struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
@@ -126,6 +130,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
struct uncached_list {
spinlock_t lock;
struct list_head head;
+ struct list_head quarantine;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
@@ -145,42 +150,46 @@ void rt6_uncached_list_del(struct rt6_info *rt)
{
if (!list_empty(&rt->rt6i_uncached)) {
struct uncached_list *ul = rt->rt6i_uncached_list;
- struct net *net = dev_net(rt->dst.dev);
spin_lock_bh(&ul->lock);
- list_del(&rt->rt6i_uncached);
- atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
+ list_del_init(&rt->rt6i_uncached);
spin_unlock_bh(&ul->lock);
}
}
-static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
+static void rt6_uncached_list_flush_dev(struct net_device *dev)
{
- struct net_device *loopback_dev = net->loopback_dev;
int cpu;
- if (dev == loopback_dev)
- return;
-
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
- struct rt6_info *rt;
+ struct rt6_info *rt, *safe;
+
+ if (list_empty(&ul->head))
+ continue;
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
struct inet6_dev *rt_idev = rt->rt6i_idev;
struct net_device *rt_dev = rt->dst.dev;
+ bool handled = false;
if (rt_idev->dev == dev) {
- rt->rt6i_idev = in6_dev_get(loopback_dev);
+ rt->rt6i_idev = in6_dev_get(blackhole_netdev);
in6_dev_put(rt_idev);
+ handled = true;
}
if (rt_dev == dev) {
rt->dst.dev = blackhole_netdev;
- dev_hold(rt->dst.dev);
- dev_put(rt_dev);
+ netdev_ref_replace(rt_dev, blackhole_netdev,
+ &rt->dst.dev_tracker,
+ GFP_ATOMIC);
+ handled = true;
}
+ if (handled)
+ list_move(&rt->rt6i_uncached,
+ &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
}
@@ -257,34 +266,16 @@ static struct dst_ops ip6_dst_ops_template = {
.confirm_neigh = ip6_confirm_neigh,
};
-static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
-{
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
-}
-
-static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu,
- bool confirm_neigh)
-{
-}
-
-static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
-{
-}
-
static struct dst_ops ip6_dst_blackhole_ops = {
- .family = AF_INET6,
- .destroy = ip6_dst_destroy,
- .check = ip6_dst_check,
- .mtu = ip6_blackhole_mtu,
- .default_advmss = ip6_default_advmss,
- .update_pmtu = ip6_rt_blackhole_update_pmtu,
- .redirect = ip6_rt_blackhole_redirect,
- .cow_metrics = dst_cow_metrics_generic,
- .neigh_lookup = ip6_dst_neigh_lookup,
+ .family = AF_INET6,
+ .default_advmss = ip6_default_advmss,
+ .neigh_lookup = ip6_dst_neigh_lookup,
+ .check = ip6_dst_check,
+ .destroy = ip6_dst_destroy,
+ .cow_metrics = dst_cow_metrics_generic,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
@@ -342,9 +333,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
static void rt6_info_init(struct rt6_info *rt)
{
- struct dst_entry *dst = &rt->dst;
-
- memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
+ memset_after(rt, 0, dst);
INIT_LIST_HEAD(&rt->rt6i_uncached);
}
@@ -388,13 +377,12 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
struct rt6_info *rt = (struct rt6_info *)dst;
struct inet6_dev *idev = rt->rt6i_idev;
- struct net_device *loopback_dev =
- dev_net(dev)->loopback_dev;
- if (idev && idev->dev != loopback_dev) {
- struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
- if (loopback_idev) {
- rt->rt6i_idev = loopback_idev;
+ if (idev && idev->dev != blackhole_netdev) {
+ struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
+
+ if (blackhole_idev) {
+ rt->rt6i_idev = blackhole_idev;
in6_dev_put(idev);
}
}
@@ -431,9 +419,12 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
- if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
+ if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
goto out;
+ if (match->nh && have_oif_match && res->nh)
+ return;
+
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
@@ -605,6 +596,7 @@ struct __rt6_probe_work {
struct work_struct work;
struct in6_addr target;
struct net_device *dev;
+ netdevice_tracker dev_tracker;
};
static void rt6_probe_deferred(struct work_struct *w)
@@ -615,7 +607,7 @@ static void rt6_probe_deferred(struct work_struct *w)
addrconf_addr_solict_mult(&work->target, &mcaddr);
ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
- dev_put(work->dev);
+ netdev_put(work->dev, &work->dev_tracker);
kfree(work);
}
@@ -669,7 +661,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
} else {
INIT_WORK(&work->work, rt6_probe_deferred);
work->target = *nh_gw;
- dev_hold(dev);
+ netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
work->dev = dev;
schedule_work(&work->work);
}
@@ -984,7 +976,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
gwaddr, dev);
if (rt && !lifetime) {
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
rt = NULL;
}
@@ -1062,8 +1054,6 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
flags |= DST_NOCOUNT;
if (rt->dst_nopolicy)
flags |= DST_NOPOLICY;
- if (rt->dst_host)
- flags |= DST_HOST;
return flags;
}
@@ -1209,7 +1199,7 @@ fallback:
return nrt;
}
-static struct rt6_info *ip6_pol_route_lookup(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
@@ -1219,9 +1209,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_node *fn;
struct rt6_info *rt;
- if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
- flags &= ~RT6_LOOKUP_F_IFACE;
-
rcu_read_lock();
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
@@ -1349,7 +1336,6 @@ static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
ip6_rt_copy_init(rt, res);
rt->rt6i_flags |= RTF_CACHE;
- rt->dst.flags |= DST_HOST;
rt->rt6i_dst.addr = *daddr;
rt->rt6i_dst.plen = 128;
@@ -1380,7 +1366,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
rcu_read_lock();
dev = ip6_rt_get_dev_rcu(res);
- pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
+ pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
rcu_read_unlock();
if (!pcpu_rt) {
fib6_info_release(f6i);
@@ -1388,9 +1374,18 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
}
ip6_rt_copy_init(pcpu_rt, res);
pcpu_rt->rt6i_flags |= RTF_PCPU;
+
+ if (f6i->nh)
+ pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
+
return pcpu_rt;
}
+static bool rt6_is_valid(const struct rt6_info *rt6)
+{
+ return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
+}
+
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
@@ -1398,6 +1393,19 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
+ if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
+ struct rt6_info *prev, **p;
+
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
+ prev = xchg(p, NULL);
+ if (prev) {
+ dst_dev_put(&prev->dst);
+ dst_release(&prev->dst);
+ }
+
+ pcpu_rt = NULL;
+ }
+
return pcpu_rt;
}
@@ -1477,17 +1485,24 @@ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
static u32 rt6_exception_hash(const struct in6_addr *dst,
const struct in6_addr *src)
{
- static u32 seed __read_mostly;
- u32 val;
+ static siphash_aligned_key_t rt6_exception_key;
+ struct {
+ struct in6_addr dst;
+ struct in6_addr src;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .dst = *dst,
+ };
+ u64 val;
- net_get_random_once(&seed, sizeof(seed));
- val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed);
+ net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
#ifdef CONFIG_IPV6_SUBTREES
if (src)
- val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val);
+ combined.src = *src;
#endif
- return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+ val = siphash(&combined, sizeof(combined), &rt6_exception_key);
+
+ return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
/* Helper function to find the cached rt in the hash table
@@ -1642,6 +1657,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
struct fib6_nh *nh = res->nh;
+ int max_depth;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
@@ -1696,7 +1712,9 @@ static int rt6_insert_exception(struct rt6_info *nrt,
bucket->depth++;
net->ipv6.rt6_stats->fib_rt_cache++;
- if (bucket->depth > FIB6_MAX_DEPTH)
+ /* Randomize max depth to avoid some side channels attacks. */
+ max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
+ while (bucket->depth > max_depth)
rt6_exception_remove_oldest(bucket);
out:
@@ -2078,13 +2096,10 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
if (rt->rt6i_flags & RTF_GATEWAY) {
struct neighbour *neigh;
- __u8 neigh_flags = 0;
neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
- if (neigh)
- neigh_flags = neigh->flags;
- if (!(neigh_flags & NTF_ROUTER)) {
+ if (!(neigh && (neigh->flags & NTF_ROUTER))) {
RT6_TRACE("purging route %p via non-router but gateway\n",
rt);
rt6_remove_exception(bucket, rt6_ex);
@@ -2163,9 +2178,6 @@ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
- if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
- oif = 0;
-
redo_rt6_select:
rt6_select(net, fn, oif, res, strict);
if (res->f6i == net->ipv6.fib6_null_entry) {
@@ -2229,7 +2241,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
* if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
rt6_uncached_list_add(rt);
- atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
rcu_read_unlock();
return rt;
@@ -2255,7 +2266,7 @@ out:
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
-static struct rt6_info *ip6_pol_route_input(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
@@ -2322,12 +2333,131 @@ out:
}
}
+static u32 rt6_multipath_custom_hash_outer(const struct net *net,
+ const struct sk_buff *skb,
+ bool *p_has_inner)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys keys, hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
+ return flow_hash_from_keys(&hash_keys);
+}
+
+static u32 rt6_multipath_custom_hash_inner(const struct net *net,
+ const struct sk_buff *skb,
+ bool has_inner)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys keys, hash_keys;
+
+ /* We assume the packet carries an encapsulation, but if none was
+ * encountered during dissection of the outer flow, then there is no
+ * point in calling the flow dissector again.
+ */
+ if (!has_inner)
+ return 0;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+ if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
+ return 0;
+
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ }
+
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ return flow_hash_from_keys(&hash_keys);
+}
+
+static u32 rt6_multipath_custom_hash_skb(const struct net *net,
+ const struct sk_buff *skb)
+{
+ u32 mhash, mhash_inner;
+ bool has_inner = true;
+
+ mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
+ mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
+
+ return jhash_2words(mhash, mhash_inner, 0);
+}
+
+static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
+ const struct flowi6 *fl6)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+ hash_keys.ports.src = fl6->fl6_sport;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = fl6->fl6_dport;
+
+ return flow_hash_from_keys(&hash_keys);
+}
+
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
const struct sk_buff *skb, struct flow_keys *flkeys)
{
struct flow_keys hash_keys;
- u32 mhash;
+ u32 mhash = 0;
switch (ip6_multipath_hash_policy(net)) {
case 0:
@@ -2341,6 +2471,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
+ mhash = flow_hash_from_keys(&hash_keys);
break;
case 1:
if (skb) {
@@ -2353,7 +2484,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
memset(&hash_keys, 0, sizeof(hash_keys));
- if (!flkeys) {
+ if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
}
@@ -2372,6 +2503,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.ports.dst = fl6->fl6_dport;
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
+ mhash = flow_hash_from_keys(&hash_keys);
break;
case 2:
memset(&hash_keys, 0, sizeof(hash_keys));
@@ -2408,9 +2540,15 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
+ mhash = flow_hash_from_keys(&hash_keys);
+ break;
+ case 3:
+ if (skb)
+ mhash = rt6_multipath_custom_hash_skb(net, skb);
+ else
+ mhash = rt6_multipath_custom_hash_fl6(net, fl6);
break;
}
- mhash = flow_hash_from_keys(&hash_keys);
return mhash >> 1;
}
@@ -2446,7 +2584,7 @@ void ip6_route_input(struct sk_buff *skb)
&fl6, skb, flags));
}
-static struct rt6_info *ip6_pol_route_output(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
@@ -2493,20 +2631,20 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
struct flowi6 *fl6,
int flags)
{
- struct dst_entry *dst;
- struct rt6_info *rt6;
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
- rcu_read_lock();
- dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
- rt6 = (struct rt6_info *)dst;
- /* For dst cached in uncached_list, refcnt is already taken. */
- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
- dst = &net->ipv6.ip6_null_entry->dst;
- dst_hold(dst);
- }
- rcu_read_unlock();
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = (struct rt6_info *)dst;
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
- return dst;
+ return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
@@ -2588,7 +2726,8 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
return NULL;
}
-static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
+ u32 cookie)
{
struct dst_entry *dst_ret;
struct fib6_info *from;
@@ -2596,6 +2735,9 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt = container_of(dst, struct rt6_info, dst);
+ if (rt->sernum)
+ return rt6_is_valid(rt) ? dst : NULL;
+
rcu_read_lock();
/* All IPV6 dsts are created with ->obsolete set to the value
@@ -2615,6 +2757,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
return dst_ret;
}
+EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
@@ -2655,7 +2798,7 @@ static void ip6_link_failure(struct sk_buff *skb)
if (from) {
fn = rcu_dereference(from->fib6_node);
if (fn && (rt->rt6i_flags & RTF_DEFAULT))
- fn->fn_sernum = -1;
+ WRITE_ONCE(fn->fn_sernum, -1);
}
}
rcu_read_unlock();
@@ -2700,8 +2843,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
const struct in6_addr *daddr, *saddr;
struct rt6_info *rt6 = (struct rt6_info *)dst;
- if (dst_metric_locked(dst, RTAX_MTU))
- return;
+ /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
+ * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
+ * [see also comment in rt6_mtu_change_route()]
+ */
if (iph) {
daddr = &iph->daddr;
@@ -2717,7 +2862,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (confirm_neigh)
dst_confirm_neigh(dst, daddr);
- mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+ if (mtu < IPV6_MIN_MTU)
+ return;
if (mtu >= dst_mtu(dst))
return;
@@ -2888,7 +3034,7 @@ struct ip6rd_flowi {
struct in6_addr gateway;
};
-static struct rt6_info *__ip6_route_redirect(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
@@ -2906,12 +3052,6 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_info *rt;
struct fib6_node *fn;
- /* l3mdev_update_flow overrides oif if the device is enslaved; in
- * this case we must match on the real ingress device, so reset it
- */
- if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
- fl6->flowi6_oif = skb->dev->ifindex;
-
/* Get the "current" route for this destination and
* check if the redirect has come from appropriate router.
*
@@ -3060,28 +3200,11 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
return mtu;
}
-static unsigned int ip6_mtu(const struct dst_entry *dst)
+INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
{
- struct inet6_dev *idev;
- unsigned int mtu;
-
- mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu)
- goto out;
-
- mtu = IPV6_MIN_MTU;
-
- rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
- if (idev)
- mtu = idev->cnf.mtu6;
- rcu_read_unlock();
-
-out:
- mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
-
- return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+ return ip6_dst_mtu_maybe_forward(dst, false);
}
+EXPORT_INDIRECT_CALLABLE(ip6_mtu);
/* MTU selection:
* 1. mtu on route is locked - use it
@@ -3142,7 +3265,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
goto out;
}
- rt->dst.flags |= DST_HOST;
rt->dst.input = ip6_input;
rt->dst.output = ip6_output;
rt->rt6i_gateway = fl6->daddr;
@@ -3155,7 +3277,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
* do proper release of the net_device
*/
rt6_uncached_list_add(rt);
- atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
@@ -3171,20 +3292,24 @@ static int ip6_dst_gc(struct dst_ops *ops)
int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ unsigned int val;
int entries;
entries = dst_entries_get_fast(ops);
+ if (entries > rt_max_size)
+ entries = dst_entries_get_slow(ops);
+
if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
entries <= rt_max_size)
goto out;
- net->ipv6.ip6_rt_gc_expire++;
- fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
+ fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
entries = dst_entries_get_slow(ops);
if (entries < ops->gc_thresh)
- net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
out:
- net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
+ val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
return entries > rt_max_size;
}
@@ -3376,7 +3501,7 @@ static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
if ((flags & RTF_REJECT) ||
(dev && (dev->flags & IFF_LOOPBACK) &&
!(addr_type & IPV6_ADDR_LOOPBACK) &&
- !(flags & RTF_LOCAL)))
+ !(flags & (RTF_ANYCAST | RTF_LOCAL))))
return true;
return false;
@@ -3395,6 +3520,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
#ifdef CONFIG_IPV6_ROUTER_PREF
fib6_nh->last_probe = jiffies;
#endif
+ if (cfg->fc_is_fdb) {
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
+ return 0;
+ }
err = -ENODEV;
if (cfg->fc_ifindex) {
@@ -3475,7 +3605,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
!netif_carrier_ok(dev))
fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
+ err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
goto out;
@@ -3488,6 +3618,8 @@ pcpu_alloc:
}
fib6_nh->fib_nh_dev = dev;
+ netdev_tracker_alloc(dev, &fib6_nh->fib_nh_dev_tracker, gfp_flags);
+
fib6_nh->fib_nh_oif = dev->ifindex;
err = 0;
out:
@@ -3497,8 +3629,7 @@ out:
if (err) {
lwtstate_put(fib6_nh->fib_nh_lws);
fib6_nh->fib_nh_lws = NULL;
- if (dev)
- dev_put(dev);
+ dev_put(dev);
}
return err;
@@ -3519,26 +3650,29 @@ void fib6_nh_release(struct fib6_nh *fib6_nh)
rcu_read_unlock();
- if (fib6_nh->rt6i_pcpu) {
- int cpu;
+ fib6_nh_release_dsts(fib6_nh);
+ free_percpu(fib6_nh->rt6i_pcpu);
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
+ fib_nh_common_release(&fib6_nh->nh_common);
+}
- ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
+void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
+{
+ int cpu;
- free_percpu(fib6_nh->rt6i_pcpu);
- }
+ if (!fib6_nh->rt6i_pcpu)
+ return;
- fib_nh_common_release(&fib6_nh->nh_common);
+ for_each_possible_cpu(cpu) {
+ struct rt6_info *pcpu_rt, **ppcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+ pcpu_rt = xchg(ppcpu_rt, NULL);
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ }
+ }
}
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
@@ -3622,7 +3756,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
err = PTR_ERR(rt->fib6_metrics);
/* Do not leave garbage there. */
rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
- goto out;
+ goto out_free;
}
if (cfg->fc_flags & RTF_ADDRCONF)
@@ -3645,21 +3779,19 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->fib6_dst.plen = cfg->fc_dst_len;
- if (rt->fib6_dst.plen == 128)
- rt->dst_host = true;
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
if (nh) {
- if (!nexthop_get(nh)) {
- NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
- goto out;
- }
if (rt->fib6_src.plen) {
NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
- goto out;
+ goto out_free;
+ }
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ goto out_free;
}
rt->nh = nh;
fib6_nh = nexthop_fib6_nh(rt->nh);
@@ -3696,6 +3828,10 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
out:
fib6_info_release(rt);
return ERR_PTR(err);
+out_free:
+ ip_fib_metrics_put(rt->fib6_metrics);
+ kfree(rt);
+ return ERR_PTR(err);
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
@@ -3735,9 +3871,12 @@ out:
return err;
}
-int ip6_del_rt(struct net *net, struct fib6_info *rt)
+int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
- struct nl_info info = { .nl_net = net };
+ struct nl_info info = {
+ .nl_net = net,
+ .skip_notify = skip_notify
+ };
return __ip6_del_rt(rt, &info);
}
@@ -4166,7 +4305,7 @@ static struct fib6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
- cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
+ cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -4215,11 +4354,12 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
struct fib6_info *rt6_add_dflt_router(struct net *net,
const struct in6_addr *gwaddr,
struct net_device *dev,
- unsigned int pref)
+ unsigned int pref,
+ u32 defrtr_usr_metric)
{
struct fib6_config cfg = {
.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
- .fc_metric = IP6_RT_PRIO_USER,
+ .fc_metric = defrtr_usr_metric,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
@@ -4258,7 +4398,7 @@ restart:
(!idev || idev->cnf.accept_ra != 2) &&
fib6_info_hold_safe(rt)) {
rcu_read_unlock();
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
goto restart;
}
}
@@ -4309,41 +4449,29 @@ static void rtmsg_to_fib6_config(struct net *net,
};
}
-int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
struct fib6_config cfg;
- struct in6_rtmsg rtmsg;
int err;
- switch (cmd) {
- case SIOCADDRT: /* Add a route */
- case SIOCDELRT: /* Delete a route */
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- err = copy_from_user(&rtmsg, arg,
- sizeof(struct in6_rtmsg));
- if (err)
- return -EFAULT;
-
- rtmsg_to_fib6_config(net, &rtmsg, &cfg);
+ if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+ return -EINVAL;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
- rtnl_lock();
- switch (cmd) {
- case SIOCADDRT:
- err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
- break;
- case SIOCDELRT:
- err = ip6_route_del(&cfg, NULL);
- break;
- default:
- err = -EINVAL;
- }
- rtnl_unlock();
+ rtmsg_to_fib6_config(net, rtmsg, &cfg);
- return err;
+ rtnl_lock();
+ switch (cmd) {
+ case SIOCADDRT:
+ err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
+ break;
+ case SIOCDELRT:
+ err = ip6_route_del(&cfg, NULL);
+ break;
}
-
- return -EINVAL;
+ rtnl_unlock();
+ return err;
}
/*
@@ -4355,9 +4483,10 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
struct dst_entry *dst = skb_dst(skb);
struct net *net = dev_net(dst->dev);
struct inet6_dev *idev;
+ SKB_DR(reason);
int type;
- if (netif_is_l3_master(skb->dev) &&
+ if (netif_is_l3_master(skb->dev) ||
dst->dev == net->loopback_dev)
idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
else
@@ -4367,11 +4496,14 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
case IPSTATS_MIB_INNOROUTES:
type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
if (type == IPV6_ADDR_ANY) {
+ SKB_DR_SET(reason, IP_INADDRERRORS);
IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
break;
}
- /* FALLTHROUGH */
+ SKB_DR_SET(reason, IP_INNOROUTES);
+ fallthrough;
case IPSTATS_MIB_OUTNOROUTES:
+ SKB_DR_OR(reason, IP_OUTNOROUTES);
IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
break;
}
@@ -4381,7 +4513,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
skb_dst_drop(skb);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
}
@@ -4437,8 +4569,15 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
}
f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
- if (!IS_ERR(f6i))
+ if (!IS_ERR(f6i)) {
f6i->dst_nocount = true;
+
+ if (!anycast &&
+ (net->ipv6.devconf_all->disable_policy ||
+ idev->cnf.disable_policy))
+ f6i->dst_nopolicy = true;
+ }
+
return f6i;
}
@@ -4758,7 +4897,7 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
rt6_sync_down_dev(dev, event);
- rt6_uncached_list_flush_dev(dev_net(dev), dev);
+ rt6_uncached_list_flush_dev(dev);
neigh_ifdown(&nd_tbl, dev);
}
@@ -4871,6 +5010,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
err = -EINVAL;
rtm = nlmsg_data(nlh);
+ if (rtm->rtm_tos) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): option not available for IPv6");
+ goto errout;
+ }
+
*cfg = (struct fib6_config){
.fc_table = rtm->rtm_table,
.fc_dst_len = rtm->rtm_dst_len,
@@ -5072,6 +5217,19 @@ out:
return should_notify;
}
+static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ if (nla_len(nla) < sizeof(*gw)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ *gw = nla_get_in6_addr(nla);
+
+ return 0;
+}
+
static int ip6_route_multipath_add(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -5112,10 +5270,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- r_cfg.fc_gateway = nla_get_in6_addr(nla);
+ err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
+ extack);
+ if (err)
+ goto cleanup;
+
r_cfg.fc_flags |= RTF_GATEWAY;
}
r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+
+ /* RTA_ENCAP_TYPE length checked in
+ * lwtunnel_valid_encap_type_attr
+ */
nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla)
r_cfg.fc_encap_type = nla_get_u16(nla);
@@ -5196,9 +5362,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
* nexthops have been replaced by first new, the rest should
* be added to it.
*/
- cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
- NLM_F_REPLACE);
- cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
+ if (cfg->fc_nlinfo.nlh) {
+ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+ NLM_F_REPLACE);
+ cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
+ }
nhn++;
}
@@ -5260,9 +5428,10 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
{
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
+ int last_err = 0;
int remaining;
int attrlen;
- int err = 1, last_err = 0;
+ int err;
remaining = cfg->fc_mp_len;
rtnh = (struct rtnexthop *)cfg->fc_mp;
@@ -5279,7 +5448,13 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+ err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
+ extack);
+ if (err) {
+ last_err = err;
+ goto next_rtnh;
+ }
+
r_cfg.fc_flags |= RTF_GATEWAY;
}
}
@@ -5287,6 +5462,7 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
if (err)
last_err = err;
+next_rtnh:
rtnh = rtnh_next(rtnh, &remaining);
}
@@ -5532,6 +5708,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
goto nla_put_failure;
+
+ if (dst->lwtstate &&
+ lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
} else if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
struct nlattr *mp;
@@ -5541,14 +5721,15 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
- rt->fib6_nh->fib_nh_weight, AF_INET6) < 0)
+ rt->fib6_nh->fib_nh_weight, AF_INET6,
+ 0) < 0)
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings) {
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
sibling->fib6_nh->fib_nh_weight,
- AF_INET6) < 0)
+ AF_INET6, 0) < 0)
goto nla_put_failure;
}
@@ -5560,7 +5741,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (nexthop_is_blackhole(rt->nh))
rtm->rtm_type = RTN_BLACKHOLE;
- if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+ if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
+ rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
goto nla_put_failure;
rtm->rtm_flags |= nh_flags;
@@ -5578,10 +5760,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
}
if (!dst) {
- if (rt->offload)
+ if (READ_ONCE(rt->offload))
rtm->rtm_flags |= RTM_F_OFFLOAD;
- if (rt->trap)
+ if (READ_ONCE(rt->trap))
rtm->rtm_flags |= RTM_F_TRAP;
+ if (READ_ONCE(rt->offload_failed))
+ rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
}
if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
@@ -5757,7 +5941,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
rcu_read_unlock();
if (err)
- return count += w.count;
+ return count + w.count;
}
return -1;
@@ -6012,11 +6196,6 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt,
struct sk_buff *skb;
int err = -ENOBUFS;
- /* call_fib6_entry_notifiers will be removed when in-kernel notifier
- * is implemented and supported for nexthop objects
- */
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
-
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (!skb)
goto errout;
@@ -6037,6 +6216,59 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
+void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
+ bool offload, bool trap, bool offload_failed)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (READ_ONCE(f6i->offload) == offload &&
+ READ_ONCE(f6i->trap) == trap &&
+ READ_ONCE(f6i->offload_failed) == offload_failed)
+ return;
+
+ WRITE_ONCE(f6i->offload, offload);
+ WRITE_ONCE(f6i->trap, trap);
+
+ /* 2 means send notifications only if offload_failed was changed. */
+ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
+ READ_ONCE(f6i->offload_failed) == offload_failed)
+ return;
+
+ WRITE_ONCE(f6i->offload_failed, offload_failed);
+
+ if (!rcu_access_pointer(f6i->fib6_node))
+ /* The route was removed from the tree, do not send
+ * notification.
+ */
+ return;
+
+ if (!net->ipv6.sysctl.fib_notify_on_flag_change)
+ return;
+
+ skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
+ 0, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+EXPORT_SYMBOL(fib6_info_hw_flags_set);
+
static int ip6_route_dev_notify(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -6094,9 +6326,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_SYSCTL
-static
-int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int delay;
@@ -6116,11 +6347,11 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
static struct ctl_table ipv6_route_table_template[] = {
{
- .procname = "flush",
- .data = &init_net.ipv6.sysctl.flush_delay,
+ .procname = "max_size",
+ .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
.maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = ipv6_sysctl_rtcache_flush
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
{
.procname = "gc_thresh",
@@ -6130,11 +6361,11 @@ static struct ctl_table ipv6_route_table_template[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "max_size",
- .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
+ .procname = "flush",
+ .data = &init_net.ipv6.sysctl.flush_delay,
.maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
+ .mode = 0200,
+ .proc_handler = ipv6_sysctl_rtcache_flush
},
{
.procname = "gc_min_interval",
@@ -6206,10 +6437,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
GFP_KERNEL);
if (table) {
- table[0].data = &net->ipv6.sysctl.flush_delay;
- table[0].extra1 = net;
+ table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
- table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
+ table[2].data = &net->ipv6.sysctl.flush_delay;
+ table[2].extra1 = net;
table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
@@ -6221,7 +6452,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ table[1].procname = NULL;
}
return table;
@@ -6290,7 +6521,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
net->ipv6.sysctl.skip_notify_on_dev_down = 0;
- net->ipv6.ip6_rt_gc_expire = 30*HZ;
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
ret = 0;
out:
@@ -6324,10 +6555,16 @@ static void __net_exit ip6_route_net_exit(struct net *net)
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
- sizeof(struct ipv6_route_iter));
- proc_create_net_single("rt6_stats", 0444, net->proc_net,
- rt6_stats_seq_show, NULL);
+ if (!proc_create_net("ipv6_route", 0, net->proc_net,
+ &ipv6_route_seq_ops,
+ sizeof(struct ipv6_route_iter)))
+ return -ENOMEM;
+
+ if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
+ rt6_stats_seq_show, NULL)) {
+ remove_proc_entry("ipv6_route", net->proc_net);
+ return -ENOMEM;
+ }
#endif
return 0;
}
@@ -6396,6 +6633,43 @@ void __init ip6_route_init_special_entries(void)
#endif
}
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
+
+BTF_ID_LIST(btf_fib6_info_id)
+BTF_ID(struct, fib6_info)
+
+static const struct bpf_iter_seq_info ipv6_route_seq_info = {
+ .seq_ops = &ipv6_route_seq_ops,
+ .init_seq_private = bpf_iter_init_seq_net,
+ .fini_seq_private = bpf_iter_fini_seq_net,
+ .seq_priv_size = sizeof(struct ipv6_route_iter),
+};
+
+static struct bpf_iter_reg ipv6_route_reg_info = {
+ .target = "ipv6_route",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ipv6_route, rt),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ipv6_route_seq_info,
+};
+
+static int __init bpf_iter_register(void)
+{
+ ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
+ return bpf_iter_reg_target(&ipv6_route_reg_info);
+}
+
+static void bpf_iter_unregister(void)
+{
+ bpf_iter_unreg_target(&ipv6_route_reg_info);
+}
+#endif
+#endif
+
int __init ip6_route_init(void)
{
int ret;
@@ -6404,7 +6678,7 @@ int __init ip6_route_init(void)
ret = -ENOMEM;
ip6_dst_ops_template.kmem_cachep =
kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!ip6_dst_ops_template.kmem_cachep)
goto out;
@@ -6458,10 +6732,19 @@ int __init ip6_route_init(void)
if (ret)
goto out_register_late_subsys;
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ ret = bpf_iter_register();
+ if (ret)
+ goto out_register_late_subsys;
+#endif
+#endif
+
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
+ INIT_LIST_HEAD(&ul->quarantine);
spin_lock_init(&ul->lock);
}
@@ -6490,6 +6773,11 @@ out_kmem_cache:
void ip6_route_cleanup(void)
{
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_unregister();
+#endif
+#endif
unregister_netdevice_notifier(&ip6_route_dev_notifier);
unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_cleanup();
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
new file mode 100644
index 000000000000..488aec9e1a74
--- /dev/null
+++ b/net/ipv6/rpl.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x))
+#define IPV6_RPL_BEST_ADDR_COMPRESSION 15
+
+static void ipv6_rpl_addr_decompress(struct in6_addr *dst,
+ const struct in6_addr *daddr,
+ const void *post, unsigned char pfx)
+{
+ memcpy(dst, daddr, pfx);
+ memcpy(&dst->s6_addr[pfx], post, IPV6_PFXTAIL_LEN(pfx));
+}
+
+static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr,
+ unsigned char pfx)
+{
+ memcpy(dst, &addr->s6_addr[pfx], IPV6_PFXTAIL_LEN(pfx));
+}
+
+static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
+{
+ return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)];
+}
+
+size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri,
+ unsigned char cmpre)
+{
+ return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+}
+
+void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{
+ int i;
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3);
+ outhdr->pad = 0;
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = 0;
+ outhdr->cmpre = 0;
+
+ for (i = 0; i < n; i++)
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
+ ipv6_rpl_segdata_pos(inhdr, i),
+ inhdr->cmpri);
+
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr,
+ ipv6_rpl_segdata_pos(inhdr, n),
+ inhdr->cmpre);
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr,
+ unsigned char n)
+{
+ unsigned char plen;
+ int i;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ for (i = 0; i < n; i++) {
+ if (daddr->s6_addr[plen] !=
+ inhdr->rpl_segaddr[i].s6_addr[plen])
+ return plen;
+ }
+ }
+
+ return IPV6_RPL_BEST_ADDR_COMPRESSION;
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
+ const struct in6_addr *last_segment)
+{
+ unsigned int plen;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ if (daddr->s6_addr[plen] != last_segment->s6_addr[plen])
+ return plen;
+ }
+
+ return IPV6_RPL_BEST_ADDR_COMPRESSION;
+}
+
+void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{
+ unsigned char cmpri, cmpre;
+ size_t seglen;
+ int i;
+
+ cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n);
+ cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]);
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+ outhdr->hdrlen = seglen >> 3;
+ if (seglen & 0x7) {
+ outhdr->hdrlen++;
+ outhdr->pad = 8 - (seglen & 0x7);
+ } else {
+ outhdr->pad = 0;
+ }
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = cmpri;
+ outhdr->cmpre = cmpre;
+
+ for (i = 0; i < n; i++)
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
+ &inhdr->rpl_segaddr[i], cmpri);
+
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n),
+ &inhdr->rpl_segaddr[n], cmpre);
+}
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
new file mode 100644
index 000000000000..ff691d9f4a04
--- /dev/null
+++ b/net/ipv6/rpl_iptunnel.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <linux/rpl_iptunnel.h>
+
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+struct rpl_iptunnel_encap {
+ struct ipv6_rpl_sr_hdr srh[0];
+};
+
+struct rpl_lwt {
+ struct dst_cache cache;
+ struct rpl_iptunnel_encap tuninfo;
+};
+
+static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct rpl_lwt *)lwt->data;
+}
+
+static inline struct rpl_iptunnel_encap *
+rpl_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return &rpl_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = {
+ [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+};
+
+static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh,
+ size_t seglen)
+{
+ int err;
+
+ if ((srh->hdrlen << 3) != seglen)
+ return false;
+
+ /* check at least one segment and seglen fit with segments_left */
+ if (!srh->segments_left ||
+ (srh->segments_left * sizeof(struct in6_addr)) != seglen)
+ return false;
+
+ if (srh->cmpri || srh->cmpre)
+ return false;
+
+ err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr,
+ srh->segments_left);
+ if (err)
+ return false;
+
+ if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) &
+ IPV6_ADDR_MULTICAST)
+ return false;
+
+ return true;
+}
+
+static int rpl_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RPL_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct ipv6_rpl_sr_hdr *srh;
+ struct rpl_lwt *rlwt;
+ int err, srh_len;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla,
+ rpl_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[RPL_IPTUNNEL_SRH])
+ return -EINVAL;
+
+ srh = nla_data(tb[RPL_IPTUNNEL_SRH]);
+ srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]);
+
+ if (srh_len < sizeof(*srh))
+ return -EINVAL;
+
+ /* verify that SRH is consistent */
+ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh)))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt));
+ if (!newts)
+ return -ENOMEM;
+
+ rlwt = rpl_lwt_lwtunnel(newts);
+
+ err = dst_cache_init(&rlwt->cache, GFP_ATOMIC);
+ if (err) {
+ kfree(newts);
+ return err;
+ }
+
+ memcpy(&rlwt->tuninfo.srh, srh, srh_len);
+
+ newts->type = LWTUNNEL_ENCAP_RPL;
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+}
+
+static void rpl_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache);
+}
+
+static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+ const struct ipv6_rpl_sr_hdr *srh)
+{
+ struct ipv6_rpl_sr_hdr *isrh, *csrh;
+ const struct ipv6hdr *oldhdr;
+ struct ipv6hdr *hdr;
+ unsigned char *buf;
+ size_t hdrlen;
+ int err;
+
+ oldhdr = ipv6_hdr(skb);
+
+ buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC);
+ if (!buf)
+ return -ENOMEM;
+
+ isrh = (struct ipv6_rpl_sr_hdr *)buf;
+ csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3));
+
+ memcpy(isrh, srh, sizeof(*isrh));
+ memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1],
+ (srh->segments_left - 1) * 16);
+ isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr;
+
+ ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0],
+ isrh->segments_left - 1);
+
+ hdrlen = ((csrh->hdrlen + 1) << 3);
+
+ err = skb_cow_head(skb, hdrlen + skb->mac_len);
+ if (unlikely(err)) {
+ kfree(buf);
+ return err;
+ }
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ memmove(hdr, oldhdr, sizeof(*hdr));
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, csrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ hdr->daddr = srh->rpl_segaddr[0];
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ kfree(buf);
+
+ return 0;
+}
+
+static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct rpl_iptunnel_encap *tinfo;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ tinfo = rpl_encap_lwtunnel(dst->lwtstate);
+
+ return rpl_do_srh_inline(skb, rlwt, tinfo->srh);
+}
+
+static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+ err = rpl_do_srh(skb, rlwt);
+ if (unlikely(err))
+ goto drop;
+
+ preempt_disable();
+ dst = dst_cache_get(&rlwt->cache);
+ preempt_enable();
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ dst_release(dst);
+ goto drop;
+ }
+
+ preempt_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr);
+ preempt_enable();
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ goto drop;
+
+ return dst_output(net, sk, skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int rpl_input(struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+ err = rpl_do_srh(skb, rlwt);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ preempt_disable();
+ dst = dst_cache_get(&rlwt->cache);
+ preempt_enable();
+
+ skb_dst_drop(skb);
+
+ if (!dst) {
+ ip6_route_input(skb);
+ dst = skb_dst(skb);
+ if (!dst->error) {
+ preempt_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst,
+ &ipv6_hdr(skb)->saddr);
+ preempt_enable();
+ }
+ } else {
+ skb_dst_set(skb, dst);
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ return err;
+
+ return dst_input(skb);
+}
+
+static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype,
+ struct rpl_iptunnel_encap *tuninfo)
+{
+ struct rpl_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh);
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo->srh, len);
+
+ return 0;
+}
+
+static int rpl_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh));
+}
+
+static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a);
+ struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b);
+ int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh);
+
+ if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh))
+ return 1;
+
+ return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops rpl_ops = {
+ .build_state = rpl_build_state,
+ .destroy_state = rpl_destroy_state,
+ .output = rpl_output,
+ .input = rpl_input,
+ .fill_encap = rpl_fill_encap_info,
+ .get_encap_size = rpl_encap_nlsize,
+ .cmp_encap = rpl_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init rpl_init(void)
+{
+ int err;
+
+ err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+ if (err)
+ goto out;
+
+ pr_info("RPL Segment Routing with IPv6\n");
+
+ return 0;
+
+out:
+ return err;
+}
+
+void rpl_exit(void)
+{
+ lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+}
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 75421a472d25..29346a6eec9f 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -25,10 +25,11 @@
#include <net/seg6_hmac.h>
#endif
-bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
+bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced)
{
- int trailing;
unsigned int tlv_offset;
+ int max_last_entry;
+ int trailing;
if (srh->type != IPV6_SRCRT_TYPE_4)
return false;
@@ -36,8 +37,17 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
if (((srh->hdrlen + 1) << 3) != len)
return false;
- if (srh->segments_left > srh->first_segment)
+ if (!reduced && srh->segments_left > srh->first_segment) {
return false;
+ } else {
+ max_last_entry = (srh->hdrlen / 2) - 1;
+
+ if (srh->first_segment > max_last_entry)
+ return false;
+
+ if (srh->segments_left > srh->first_segment + 1)
+ return false;
+ }
tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
@@ -65,6 +75,65 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
return true;
}
+struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags)
+{
+ struct ipv6_sr_hdr *srh;
+ int len, srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0)
+ return NULL;
+
+ if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
+ return NULL;
+
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ len = (srh->hdrlen + 1) << 3;
+
+ if (!pskb_may_pull(skb, srhoff + len))
+ return NULL;
+
+ /* note that pskb_may_pull may change pointers in header;
+ * for this reason it is necessary to reload them when needed.
+ */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ if (!seg6_validate_srh(srh, len, true))
+ return NULL;
+
+ return srh;
+}
+
+/* Determine if an ICMP invoking packet contains a segment routing
+ * header. If it does, extract the offset to the true destination
+ * address, which is in the first segment address.
+ */
+void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt)
+{
+ __u16 network_header = skb->network_header;
+ struct ipv6_sr_hdr *srh;
+
+ /* Update network header to point to the invoking packet
+ * inside the ICMP packet, so we can use the seg6_get_srh()
+ * helper.
+ */
+ skb_reset_network_header(skb);
+
+ srh = seg6_get_srh(skb, 0);
+ if (!srh)
+ goto out;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ goto out;
+
+ opt->flags |= IP6SKB_SEG6;
+ opt->srhoff = (unsigned char *)srh - skb->data;
+
+out:
+ /* Restore the network header back to the ICMP packet */
+ skb->network_header = network_header;
+}
+
static struct genl_family seg6_genl_family;
static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
@@ -112,9 +181,6 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
hinfo = seg6_hmac_info_lookup(net, hmackeyid);
if (!slen) {
- if (!hinfo)
- err = -ENOENT;
-
err = seg6_hmac_info_del(net, hmackeyid);
goto out_unlock;
@@ -125,6 +191,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
goto out_unlock;
}
+ if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
if (hinfo) {
err = seg6_hmac_info_del(net, hmackeyid);
if (err)
@@ -367,7 +438,11 @@ static int __net_init seg6_net_init(struct net *net)
net->ipv6.seg6_data = sdata;
#ifdef CONFIG_IPV6_SEG6_HMAC
- seg6_hmac_net_init(net);
+ if (seg6_hmac_net_init(net)) {
+ kfree(rcu_dereference_raw(sdata->tun_src));
+ kfree(sdata);
+ return -ENOMEM;
+ }
#endif
return 0;
@@ -381,7 +456,7 @@ static void __net_exit seg6_net_exit(struct net *net)
seg6_hmac_net_exit(net);
#endif
- kfree(sdata->tun_src);
+ kfree(rcu_dereference_raw(sdata->tun_src));
kfree(sdata);
}
@@ -429,12 +504,13 @@ static struct genl_family seg6_genl_family __ro_after_init = {
.parallel_ops = true,
.ops = seg6_genl_ops,
.n_ops = ARRAY_SIZE(seg6_genl_ops),
+ .resv_start_op = SEG6_CMD_GET_TUNSRC + 1,
.module = THIS_MODULE,
};
int __init seg6_init(void)
{
- int err = -ENOMEM;
+ int err;
err = genl_register_family(&seg6_genl_family);
if (err)
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index ffcfcd2b128f..d43c50a7310d 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -34,9 +34,7 @@
#include <net/addrconf.h>
#include <net/xfrm.h>
-#include <linux/cryptohash.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
#include <net/seg6.h>
#include <net/genetlink.h>
#include <net/seg6_hmac.h>
@@ -401,17 +399,13 @@ int __init seg6_hmac_init(void)
{
return seg6_hmac_init_algo();
}
-EXPORT_SYMBOL(seg6_hmac_init);
int __net_init seg6_hmac_net_init(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
- rhashtable_init(&sdata->hmac_infos, &rht_params);
-
- return 0;
+ return rhashtable_init(&sdata->hmac_infos, &rht_params);
}
-EXPORT_SYMBOL(seg6_hmac_net_init);
void seg6_hmac_exit(void)
{
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 8c52efe299cc..34db881204d2 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -26,10 +26,30 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <linux/netfilter.h>
+
+static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
+{
+ int head = 0;
+
+ switch (tuninfo->mode) {
+ case SEG6_IPTUN_MODE_INLINE:
+ break;
+ case SEG6_IPTUN_MODE_ENCAP:
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ head = sizeof(struct ipv6hdr);
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP:
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ return 0;
+ }
+
+ return ((tuninfo->srh->hdrlen + 1) << 3) + head;
+}
struct seg6_lwt {
struct dst_cache cache;
- struct seg6_iptunnel_encap tuninfo[0];
+ struct seg6_iptunnel_encap tuninfo[];
};
static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
@@ -143,6 +163,14 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+
+ /* the control block has been erased, so we have to set the
+ * iif once again.
+ * We read the receiving interface index directly from the
+ * skb->skb_iif as it is done in the IPv4 receiving path (i.e.:
+ * ip_rcv_core(...)).
+ */
+ IP6CB(skb)->iif = skb->skb_iif;
}
hdr->nexthdr = NEXTHDR_ROUTING;
@@ -163,12 +191,132 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
}
#endif
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
skb_postpush_rcsum(skb, hdr, tot_len);
return 0;
}
EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
+/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */
+static int seg6_do_srh_encap_red(struct sk_buff *skb,
+ struct ipv6_sr_hdr *osrh, int proto)
+{
+ __u8 first_seg = osrh->first_segment;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net *net = dev_net(dst->dev);
+ struct ipv6hdr *hdr, *inner_hdr;
+ int hdrlen = ipv6_optlen(osrh);
+ int red_tlv_offset, tlv_offset;
+ struct ipv6_sr_hdr *isrh;
+ bool skip_srh = false;
+ __be32 flowlabel;
+ int tot_len, err;
+ int red_hdrlen;
+ int tlvs_len;
+
+ if (first_seg > 0) {
+ red_hdrlen = hdrlen - sizeof(struct in6_addr);
+ } else {
+ /* NOTE: if tag/flags and/or other TLVs are introduced in the
+ * seg6_iptunnel infrastructure, they should be considered when
+ * deciding to skip the SRH.
+ */
+ skip_srh = !sr_has_hmac(osrh);
+
+ red_hdrlen = skip_srh ? 0 : hdrlen;
+ }
+
+ tot_len = red_hdrlen + sizeof(struct ipv6hdr);
+
+ err = skb_cow_head(skb, tot_len + skb->mac_len);
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+ flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
+
+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ hdr = ipv6_hdr(skb);
+
+ /* based on seg6_do_srh_encap() */
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+ flowlabel);
+ hdr->hop_limit = inner_hdr->hop_limit;
+ } else {
+ ip6_flow_hdr(hdr, 0, flowlabel);
+ hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ IP6CB(skb)->iif = skb->skb_iif;
+ }
+
+ /* no matter if we have to skip the SRH or not, the first segment
+ * always comes in the pushed IPv6 header.
+ */
+ hdr->daddr = osrh->segments[first_seg];
+
+ if (skip_srh) {
+ hdr->nexthdr = proto;
+
+ set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+ goto out;
+ }
+
+ /* we cannot skip the SRH, slow path */
+
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ isrh = (void *)hdr + sizeof(struct ipv6hdr);
+
+ if (unlikely(!first_seg)) {
+ /* this is a very rare case; we have only one SID but
+ * we cannot skip the SRH since we are carrying some
+ * other info.
+ */
+ memcpy(isrh, osrh, hdrlen);
+ goto srcaddr;
+ }
+
+ tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr);
+ red_tlv_offset = tlv_offset - sizeof(struct in6_addr);
+
+ memcpy(isrh, osrh, red_tlv_offset);
+
+ tlvs_len = hdrlen - tlv_offset;
+ if (unlikely(tlvs_len > 0)) {
+ const void *s = (const void *)osrh + tlv_offset;
+ void *d = (void *)isrh + red_tlv_offset;
+
+ memcpy(d, s, tlvs_len);
+ }
+
+ --isrh->first_segment;
+ isrh->hdrlen -= 2;
+
+srcaddr:
+ isrh->nexthdr = proto;
+ set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (unlikely(!skip_srh && sr_has_hmac(isrh))) {
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+out:
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, tot_len);
+
+ return 0;
+}
+
/* insert an SRH within an IPv6 packet, just after the IPv6 header */
int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
{
@@ -215,6 +363,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
}
#endif
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
return 0;
@@ -239,6 +389,7 @@ static int seg6_do_srh(struct sk_buff *skb)
return err;
break;
case SEG6_IPTUN_MODE_ENCAP:
+ case SEG6_IPTUN_MODE_ENCAP_RED:
err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
if (err)
return err;
@@ -250,7 +401,11 @@ static int seg6_do_srh(struct sk_buff *skb)
else
return -EINVAL;
- err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+ if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP)
+ err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+ else
+ err = seg6_do_srh_encap_red(skb, tinfo->srh, proto);
+
if (err)
return err;
@@ -259,6 +414,7 @@ static int seg6_do_srh(struct sk_buff *skb)
skb->protocol = htons(ETH_P_IPV6);
break;
case SEG6_IPTUN_MODE_L2ENCAP:
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
if (!skb_mac_header_was_set(skb))
return -EINVAL;
@@ -268,7 +424,13 @@ static int seg6_do_srh(struct sk_buff *skb)
skb_mac_header_rebuild(skb);
skb_push(skb, skb->mac_len);
- err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET);
+ if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP)
+ err = seg6_do_srh_encap(skb, tinfo->srh,
+ IPPROTO_ETHERNET);
+ else
+ err = seg6_do_srh_encap_red(skb, tinfo->srh,
+ IPPROTO_ETHERNET);
+
if (err)
return err;
@@ -276,13 +438,20 @@ static int seg6_do_srh(struct sk_buff *skb)
break;
}
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+ nf_reset_ct(skb);
return 0;
}
-static int seg6_input(struct sk_buff *skb)
+static int seg6_input_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dst_input(skb);
+}
+
+static int seg6_input_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
@@ -320,15 +489,46 @@ static int seg6_input(struct sk_buff *skb)
if (unlikely(err))
return err;
- return dst_input(skb);
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ dev_net(skb->dev), NULL, skb, NULL,
+ skb_dst(skb)->dev, seg6_input_finish);
+
+ return seg6_input_finish(dev_net(skb->dev), NULL, skb);
}
-static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int seg6_input_nf(struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+ struct net *net = dev_net(skb->dev);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL,
+ skb, NULL, dev, seg6_input_core);
+ case htons(ETH_P_IPV6):
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL,
+ skb, NULL, dev, seg6_input_core);
+ }
+
+ return -EINVAL;
+}
+
+static int seg6_input(struct sk_buff *skb)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return seg6_input_nf(skb);
+
+ return seg6_input_core(dev_net(skb->dev), NULL, skb);
+}
+
+static int seg6_output_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
struct seg6_lwt *slwt;
- int err = -EINVAL;
+ int err;
err = seg6_do_srh(skb);
if (unlikely(err))
@@ -370,13 +570,41 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (unlikely(err))
goto drop;
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
+ NULL, skb_dst(skb)->dev, dst_output);
+
return dst_output(net, sk, skb);
drop:
kfree_skb(skb);
return err;
}
-static int seg6_build_state(struct nlattr *nla,
+static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
+ NULL, dev, seg6_output_core);
+ case htons(ETH_P_IPV6):
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
+ NULL, dev, seg6_output_core);
+ }
+
+ return -EINVAL;
+}
+
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return seg6_output_nf(net, sk, skb);
+
+ return seg6_output_core(net, sk, skb);
+}
+
+static int seg6_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -421,12 +649,16 @@ static int seg6_build_state(struct nlattr *nla,
break;
case SEG6_IPTUN_MODE_L2ENCAP:
break;
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ break;
default:
return -EINVAL;
}
/* verify that SRH is consistent */
- if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
+ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false))
return -EINVAL;
newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 8165802d8e05..8370726ae7bf 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -7,6 +7,7 @@
* eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
*/
+#include <linux/filter.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/net.h>
@@ -30,14 +31,41 @@
#include <net/seg6_local.h>
#include <linux/etherdevice.h>
#include <linux/bpf.h>
+#include <linux/netfilter.h>
+
+#define SEG6_F_ATTR(i) BIT(i)
struct seg6_local_lwt;
+/* callbacks used for customizing the creation and destruction of a behavior */
+struct seg6_local_lwtunnel_ops {
+ int (*build_state)(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack);
+ void (*destroy_state)(struct seg6_local_lwt *slwt);
+};
+
struct seg6_action_desc {
int action;
unsigned long attrs;
+
+ /* The optattrs field is used for specifying all the optional
+ * attributes supported by a specific behavior.
+ * It means that if one of these attributes is not provided in the
+ * netlink message during the behavior creation, no errors will be
+ * returned to the userspace.
+ *
+ * Each attribute can be only of two types (mutually exclusive):
+ * 1) required or 2) optional.
+ * Every user MUST obey to this rule! If you set an attribute as
+ * required the same attribute CANNOT be set as optional and vice
+ * versa.
+ */
+ unsigned long optattrs;
+
int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
int static_headroom;
+
+ struct seg6_local_lwtunnel_ops slwt_ops;
};
struct bpf_lwt_prog {
@@ -45,6 +73,106 @@ struct bpf_lwt_prog {
char *name;
};
+/* default length values (expressed in bits) for both Locator-Block and
+ * Locator-Node Function.
+ *
+ * Both SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS *must* be:
+ * i) greater than 0;
+ * ii) evenly divisible by 8. In other terms, the lengths of the
+ * Locator-Block and Locator-Node Function must be byte-aligned (we can
+ * relax this constraint in the future if really needed).
+ *
+ * Moreover, a third condition must hold:
+ * iii) SEG6_LOCAL_LCBLOCK_DBITS + SEG6_LOCAL_LCNODE_FN_DBITS <= 128.
+ *
+ * The correctness of SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS
+ * values are checked during the kernel compilation. If the compilation stops,
+ * check the value of these parameters to see if they meet conditions (i), (ii)
+ * and (iii).
+ */
+#define SEG6_LOCAL_LCBLOCK_DBITS 32
+#define SEG6_LOCAL_LCNODE_FN_DBITS 16
+
+/* The following next_csid_chk_{cntr,lcblock,lcblock_fn}_bits macros can be
+ * used directly to check whether the lengths (in bits) of Locator-Block and
+ * Locator-Node Function are valid according to (i), (ii), (iii).
+ */
+#define next_csid_chk_cntr_bits(blen, flen) \
+ ((blen) + (flen) > 128)
+
+#define next_csid_chk_lcblock_bits(blen) \
+({ \
+ typeof(blen) __tmp = blen; \
+ (!__tmp || __tmp > 120 || (__tmp & 0x07)); \
+})
+
+#define next_csid_chk_lcnode_fn_bits(flen) \
+ next_csid_chk_lcblock_bits(flen)
+
+/* Supported Flavor operations are reported in this bitmask */
+#define SEG6_LOCAL_FLV_SUPP_OPS (BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID))
+
+struct seg6_flavors_info {
+ /* Flavor operations */
+ __u32 flv_ops;
+
+ /* Locator-Block length, expressed in bits */
+ __u8 lcblock_bits;
+ /* Locator-Node Function length, expressed in bits*/
+ __u8 lcnode_func_bits;
+};
+
+enum seg6_end_dt_mode {
+ DT_INVALID_MODE = -EINVAL,
+ DT_LEGACY_MODE = 0,
+ DT_VRF_MODE = 1,
+};
+
+struct seg6_end_dt_info {
+ enum seg6_end_dt_mode mode;
+
+ struct net *net;
+ /* VRF device associated to the routing table used by the SRv6
+ * End.DT4/DT6 behavior for routing IPv4/IPv6 packets.
+ */
+ int vrf_ifindex;
+ int vrf_table;
+
+ /* tunneled packet family (IPv4 or IPv6).
+ * Protocol and header length are inferred from family.
+ */
+ u16 family;
+};
+
+struct pcpu_seg6_local_counters {
+ u64_stats_t packets;
+ u64_stats_t bytes;
+ u64_stats_t errors;
+
+ struct u64_stats_sync syncp;
+};
+
+/* This struct groups all the SRv6 Behavior counters supported so far.
+ *
+ * put_nla_counters() makes use of this data structure to collect all counter
+ * values after the per-CPU counter evaluation has been performed.
+ * Finally, each counter value (in seg6_local_counters) is stored in the
+ * corresponding netlink attribute and sent to user space.
+ *
+ * NB: we don't want to expose this structure to user space!
+ */
+struct seg6_local_counters {
+ __u64 packets;
+ __u64 bytes;
+ __u64 errors;
+};
+
+#define seg6_local_alloc_pcpu_counters(__gfp) \
+ __netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters, \
+ ((__gfp) | __GFP_ZERO))
+
+#define SEG6_F_LOCAL_COUNTERS SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
+
struct seg6_local_lwt {
int action;
struct ipv6_sr_hdr *srh;
@@ -54,9 +182,19 @@ struct seg6_local_lwt {
int iif;
int oif;
struct bpf_lwt_prog bpf;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ struct seg6_end_dt_info dt_info;
+#endif
+ struct seg6_flavors_info flv_info;
+
+ struct pcpu_seg6_local_counters __percpu *pcpu_counters;
int headroom;
struct seg6_action_desc *desc;
+ /* unlike the required attrs, we have to track the optional attributes
+ * that have been effectively parsed.
+ */
+ unsigned long parsed_optattrs;
};
static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
@@ -64,46 +202,14 @@ static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
return (struct seg6_local_lwt *)lwt->data;
}
-static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
-{
- struct ipv6_sr_hdr *srh;
- int len, srhoff = 0;
-
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
- return NULL;
-
- if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
- return NULL;
-
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
-
- len = (srh->hdrlen + 1) << 3;
-
- if (!pskb_may_pull(skb, srhoff + len))
- return NULL;
-
- /* note that pskb_may_pull may change pointers in header;
- * for this reason it is necessary to reload them when needed.
- */
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
-
- if (!seg6_validate_srh(srh, len))
- return NULL;
-
- return srh;
-}
-
static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
{
struct ipv6_sr_hdr *srh;
- srh = get_srh(skb);
+ srh = seg6_get_srh(skb, IP6_FH_F_SKIP_RH);
if (!srh)
return NULL;
- if (srh->segments_left == 0)
- return NULL;
-
#ifdef CONFIG_IPV6_SEG6_HMAC
if (!seg6_hmac_validate_skb(skb))
return NULL;
@@ -117,7 +223,7 @@ static bool decap_and_validate(struct sk_buff *skb, int proto)
struct ipv6_sr_hdr *srh;
unsigned int off = 0;
- srh = get_srh(skb);
+ srh = seg6_get_srh(skb, 0);
if (srh && srh->segments_left > 0)
return false;
@@ -163,6 +269,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
struct flowi6 fl6;
int dev_flags = 0;
+ memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_iif = skb->dev->ifindex;
fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
fl6.saddr = hdr->saddr;
@@ -215,8 +322,50 @@ int seg6_lookup_nexthop(struct sk_buff *skb,
return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false);
}
-/* regular endpoint function */
-static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo)
+{
+ return finfo->lcblock_bits >> 3;
+}
+
+static __u8 seg6_flv_lcnode_func_octects(const struct seg6_flavors_info *finfo)
+{
+ return finfo->lcnode_func_bits >> 3;
+}
+
+static bool seg6_next_csid_is_arg_zero(const struct in6_addr *addr,
+ const struct seg6_flavors_info *finfo)
+{
+ __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo);
+ __u8 blk_octects = seg6_flv_lcblock_octects(finfo);
+ __u8 arg_octects;
+ int i;
+
+ arg_octects = 16 - blk_octects - fnc_octects;
+ for (i = 0; i < arg_octects; ++i) {
+ if (addr->s6_addr[blk_octects + fnc_octects + i] != 0x00)
+ return false;
+ }
+
+ return true;
+}
+
+/* assume that DA.Argument length > 0 */
+static void seg6_next_csid_advance_arg(struct in6_addr *addr,
+ const struct seg6_flavors_info *finfo)
+{
+ __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo);
+ __u8 blk_octects = seg6_flv_lcblock_octects(finfo);
+
+ /* advance DA.Argument */
+ memmove(&addr->s6_addr[blk_octects],
+ &addr->s6_addr[blk_octects + fnc_octects],
+ 16 - blk_octects - fnc_octects);
+
+ memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects);
+}
+
+static int input_action_end_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
{
struct ipv6_sr_hdr *srh;
@@ -235,6 +384,38 @@ drop:
return -EINVAL;
}
+static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+
+ if (seg6_next_csid_is_arg_zero(daddr, finfo))
+ return input_action_end_core(skb, slwt);
+
+ /* update DA */
+ seg6_next_csid_advance_arg(daddr, finfo);
+
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+}
+
+static bool seg6_next_csid_enabled(__u32 fops)
+{
+ return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID);
+}
+
+/* regular endpoint function */
+static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+
+ if (seg6_next_csid_enabled(finfo->flv_ops))
+ return end_next_csid_core(skb, slwt);
+
+ return input_action_end_core(skb, slwt);
+}
+
/* regular endpoint, and forward to specified nexthop */
static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
@@ -331,12 +512,33 @@ drop:
return -EINVAL;
}
+static int input_action_end_dx6_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct in6_addr *nhaddr = NULL;
+ struct seg6_local_lwt *slwt;
+
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+
+ /* The inner packet is not associated to any local interface,
+ * so we do not call netif_rx().
+ *
+ * If slwt->nh6 is set to ::, then lookup the nexthop for the
+ * inner packet's DA. Otherwise, use the specified nexthop.
+ */
+ if (!ipv6_addr_any(&slwt->nh6))
+ nhaddr = &slwt->nh6;
+
+ seg6_lookup_nexthop(skb, nhaddr, 0);
+
+ return dst_input(skb);
+}
+
/* decapsulate and forward to specified nexthop */
static int input_action_end_dx6(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
- struct in6_addr *nhaddr = NULL;
-
/* this function accepts IPv6 encapsulated packets, with either
* an SRH with SL=0, or no SRH.
*/
@@ -347,33 +549,49 @@ static int input_action_end_dx6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- /* The inner packet is not associated to any local interface,
- * so we do not call netif_rx().
- *
- * If slwt->nh6 is set to ::, then lookup the nexthop for the
- * inner packet's DA. Otherwise, use the specified nexthop.
- */
-
- if (!ipv6_addr_any(&slwt->nh6))
- nhaddr = &slwt->nh6;
-
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+ nf_reset_ct(skb);
- seg6_lookup_nexthop(skb, nhaddr, 0);
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, NULL,
+ skb_dst(skb)->dev, input_action_end_dx6_finish);
- return dst_input(skb);
+ return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb);
drop:
kfree_skb(skb);
return -EINVAL;
}
-static int input_action_end_dx4(struct sk_buff *skb,
- struct seg6_local_lwt *slwt)
+static int input_action_end_dx4_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct seg6_local_lwt *slwt;
struct iphdr *iph;
__be32 nhaddr;
int err;
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+
+ iph = ip_hdr(skb);
+
+ nhaddr = slwt->nh4.s_addr ?: iph->daddr;
+
+ skb_dst_drop(skb);
+
+ err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
+ if (err) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst_input(skb);
+}
+
+static int input_action_end_dx4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
if (!decap_and_validate(skb, IPPROTO_IPIP))
goto drop;
@@ -381,17 +599,215 @@ static int input_action_end_dx4(struct sk_buff *skb,
goto drop;
skb->protocol = htons(ETH_P_IP);
+ skb_set_transport_header(skb, sizeof(struct iphdr));
+ nf_reset_ct(skb);
- iph = ip_hdr(skb);
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, NULL,
+ skb_dst(skb)->dev, input_action_end_dx4_finish);
- nhaddr = slwt->nh4.s_addr ?: iph->daddr;
+ return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb);
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg)
+{
+ const struct nl_info *nli = &fib6_cfg->fc_nlinfo;
+
+ return nli->nl_net;
+}
+
+static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg,
+ u16 family, struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ int vrf_ifindex;
+ struct net *net;
+
+ net = fib6_config_get_net(cfg);
+
+ /* note that vrf_table was already set by parse_nla_vrftable() */
+ vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net,
+ info->vrf_table);
+ if (vrf_ifindex < 0) {
+ if (vrf_ifindex == -EPERM) {
+ NL_SET_ERR_MSG(extack,
+ "Strict mode for VRF is disabled");
+ } else if (vrf_ifindex == -ENODEV) {
+ NL_SET_ERR_MSG(extack,
+ "Table has no associated VRF device");
+ } else {
+ pr_debug("seg6local: SRv6 End.DT* creation error=%d\n",
+ vrf_ifindex);
+ }
+
+ return vrf_ifindex;
+ }
+
+ info->net = net;
+ info->vrf_ifindex = vrf_ifindex;
+
+ info->family = family;
+ info->mode = DT_VRF_MODE;
+
+ return 0;
+}
+
+/* The SRv6 End.DT4/DT6 behavior extracts the inner (IPv4/IPv6) packet and
+ * routes the IPv4/IPv6 packet by looking at the configured routing table.
+ *
+ * In the SRv6 End.DT4/DT6 use case, we can receive traffic (IPv6+Segment
+ * Routing Header packets) from several interfaces and the outer IPv6
+ * destination address (DA) is used for retrieving the specific instance of the
+ * End.DT4/DT6 behavior that should process the packets.
+ *
+ * However, the inner IPv4/IPv6 packet is not really bound to any receiving
+ * interface and thus the End.DT4/DT6 sets the VRF (associated with the
+ * corresponding routing table) as the *receiving* interface.
+ * In other words, the End.DT4/DT6 processes a packet as if it has been received
+ * directly by the VRF (and not by one of its slave devices, if any).
+ * In this way, the VRF interface is used for routing the IPv4/IPv6 packet in
+ * according to the routing table configured by the End.DT4/DT6 instance.
+ *
+ * This design allows you to get some interesting features like:
+ * 1) the statistics on rx packets;
+ * 2) the possibility to install a packet sniffer on the receiving interface
+ * (the VRF one) for looking at the incoming packets;
+ * 3) the possibility to leverage the netfilter prerouting hook for the inner
+ * IPv4 packet.
+ *
+ * This function returns:
+ * - the sk_buff* when the VRF rcv handler has processed the packet correctly;
+ * - NULL when the skb is consumed by the VRF rcv handler;
+ * - a pointer which encodes a negative error number in case of error.
+ * Note that in this case, the function takes care of freeing the skb.
+ */
+static struct sk_buff *end_dt_vrf_rcv(struct sk_buff *skb, u16 family,
+ struct net_device *dev)
+{
+ /* based on l3mdev_ip_rcv; we are only interested in the master */
+ if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev)))
+ goto drop;
+
+ if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv))
+ goto drop;
+
+ /* the decap packet IPv4/IPv6 does not come with any mac header info.
+ * We must unset the mac header to allow the VRF device to rebuild it,
+ * just in case there is a sniffer attached on the device.
+ */
+ skb_unset_mac_header(skb);
+
+ skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, family);
+ if (!skb)
+ /* the skb buffer was consumed by the handler */
+ return NULL;
+
+ /* when a packet is received by a VRF or by one of its slaves, the
+ * master device reference is set into the skb.
+ */
+ if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex))
+ goto drop;
+
+ return skb;
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct net_device *end_dt_get_vrf_rcu(struct sk_buff *skb,
+ struct seg6_end_dt_info *info)
+{
+ int vrf_ifindex = info->vrf_ifindex;
+ struct net *net = info->net;
+
+ if (unlikely(vrf_ifindex < 0))
+ goto error;
+
+ if (unlikely(!net_eq(dev_net(skb->dev), net)))
+ goto error;
+
+ return dev_get_by_index_rcu(net, vrf_ifindex);
+
+error:
+ return NULL;
+}
+
+static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt, u16 family)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ struct net_device *vrf;
+ __be16 protocol;
+ int hdrlen;
+
+ vrf = end_dt_get_vrf_rcu(skb, info);
+ if (unlikely(!vrf))
+ goto drop;
+
+ switch (family) {
+ case AF_INET:
+ protocol = htons(ETH_P_IP);
+ hdrlen = sizeof(struct iphdr);
+ break;
+ case AF_INET6:
+ protocol = htons(ETH_P_IPV6);
+ hdrlen = sizeof(struct ipv6hdr);
+ break;
+ case AF_UNSPEC:
+ fallthrough;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(info->family != AF_UNSPEC && info->family != family)) {
+ pr_warn_once("seg6local: SRv6 End.DT* family mismatch");
+ goto drop;
+ }
+
+ skb->protocol = protocol;
skb_dst_drop(skb);
- skb_set_transport_header(skb, sizeof(struct iphdr));
+ skb_set_transport_header(skb, hdrlen);
+ nf_reset_ct(skb);
- err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
- if (err)
+ return end_dt_vrf_rcv(skb, family, vrf);
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static int input_action_end_dt4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct iphdr *iph;
+ int err;
+
+ if (!decap_and_validate(skb, IPPROTO_IPIP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ skb = end_dt_vrf_core(skb, slwt, AF_INET);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ iph = ip_hdr(skb);
+
+ err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev);
+ if (unlikely(err))
goto drop;
return dst_input(skb);
@@ -401,6 +817,54 @@ drop:
return -EINVAL;
}
+static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET, extack);
+}
+
+static enum
+seg6_end_dt_mode seg6_end_dt6_parse_mode(struct seg6_local_lwt *slwt)
+{
+ unsigned long parsed_optattrs = slwt->parsed_optattrs;
+ bool legacy, vrfmode;
+
+ legacy = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE));
+ vrfmode = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE));
+
+ if (!(legacy ^ vrfmode))
+ /* both are absent or present: invalid DT6 mode */
+ return DT_INVALID_MODE;
+
+ return legacy ? DT_LEGACY_MODE : DT_VRF_MODE;
+}
+
+static enum seg6_end_dt_mode seg6_end_dt6_get_mode(struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ return info->mode;
+}
+
+static int seg6_end_dt6_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ enum seg6_end_dt_mode mode = seg6_end_dt6_parse_mode(slwt);
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ switch (mode) {
+ case DT_LEGACY_MODE:
+ info->mode = DT_LEGACY_MODE;
+ return 0;
+ case DT_VRF_MODE:
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET6, extack);
+ default:
+ NL_SET_ERR_MSG(extack, "table or vrftable must be specified");
+ return -EINVAL;
+ }
+}
+#endif
+
static int input_action_end_dt6(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
@@ -410,6 +874,28 @@ static int input_action_end_dt6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ if (seg6_end_dt6_get_mode(slwt) == DT_LEGACY_MODE)
+ goto legacy_mode;
+
+ /* DT6_VRF_MODE */
+ skb = end_dt_vrf_core(skb, slwt, AF_INET6);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* note: this time we do not need to specify the table because the VRF
+ * takes care of selecting the correct table.
+ */
+ seg6_lookup_any_nexthop(skb, NULL, 0, true);
+
+ return dst_input(skb);
+
+legacy_mode:
+#endif
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
seg6_lookup_any_nexthop(skb, NULL, slwt->table, true);
@@ -421,6 +907,36 @@ drop:
return -EINVAL;
}
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int seg6_end_dt46_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_UNSPEC, extack);
+}
+
+static int input_action_end_dt46(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ unsigned int off = 0;
+ int nexthdr;
+
+ nexthdr = ipv6_find_hdr(skb, &off, -1, NULL, NULL);
+ if (unlikely(nexthdr < 0))
+ goto drop;
+
+ switch (nexthdr) {
+ case IPPROTO_IPIP:
+ return input_action_end_dt4(skb, slwt);
+ case IPPROTO_IPV6:
+ return input_action_end_dt6(skb, slwt);
+ }
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+#endif
+
/* push an SRH on top of the current one */
static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
@@ -435,7 +951,6 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
if (err)
goto drop;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
seg6_lookup_nexthop(skb, NULL, 0);
@@ -467,7 +982,6 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
if (err)
goto drop;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
seg6_lookup_nexthop(skb, NULL, 0);
@@ -495,7 +1009,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
return false;
srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
- if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))
+ if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true))
return false;
srh_state->valid = true;
@@ -562,52 +1076,95 @@ static struct seg6_action_desc seg6_action_table[] = {
{
.action = SEG6_LOCAL_ACTION_END,
.attrs = 0,
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_ATTR(SEG6_LOCAL_FLAVORS),
.input = input_action_end,
},
{
.action = SEG6_LOCAL_ACTION_END_X,
- .attrs = (1 << SEG6_LOCAL_NH6),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_x,
},
{
.action = SEG6_LOCAL_ACTION_END_T,
- .attrs = (1 << SEG6_LOCAL_TABLE),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_t,
},
{
.action = SEG6_LOCAL_ACTION_END_DX2,
- .attrs = (1 << SEG6_LOCAL_OIF),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_OIF),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_dx2,
},
{
.action = SEG6_LOCAL_ACTION_END_DX6,
- .attrs = (1 << SEG6_LOCAL_NH6),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_dx6,
},
{
.action = SEG6_LOCAL_ACTION_END_DX4,
- .attrs = (1 << SEG6_LOCAL_NH4),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH4),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_dx4,
},
{
+ .action = SEG6_LOCAL_ACTION_END_DT4,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt4,
+ .slwt_ops = {
+ .build_state = seg6_end_dt4_build,
+ },
+#endif
+ },
+ {
.action = SEG6_LOCAL_ACTION_END_DT6,
- .attrs = (1 << SEG6_LOCAL_TABLE),
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .attrs = 0,
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
+ SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .slwt_ops = {
+ .build_state = seg6_end_dt6_build,
+ },
+#else
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#endif
.input = input_action_end_dt6,
},
{
+ .action = SEG6_LOCAL_ACTION_END_DT46,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt46,
+ .slwt_ops = {
+ .build_state = seg6_end_dt46_build,
+ },
+#endif
+ },
+ {
.action = SEG6_LOCAL_ACTION_END_B6,
- .attrs = (1 << SEG6_LOCAL_SRH),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_b6,
},
{
.action = SEG6_LOCAL_ACTION_END_B6_ENCAP,
- .attrs = (1 << SEG6_LOCAL_SRH),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_b6_encap,
.static_headroom = sizeof(struct ipv6hdr),
},
{
.action = SEG6_LOCAL_ACTION_END_BPF,
- .attrs = (1 << SEG6_LOCAL_BPF),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_BPF),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_bpf,
},
@@ -628,27 +1185,71 @@ static struct seg6_action_desc *__get_action_desc(int action)
return NULL;
}
-static int seg6_local_input(struct sk_buff *skb)
+static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
+{
+ return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
+}
+
+static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
+ unsigned int len, int err)
+{
+ struct pcpu_seg6_local_counters *pcounters;
+
+ pcounters = this_cpu_ptr(slwt->pcpu_counters);
+ u64_stats_update_begin(&pcounters->syncp);
+
+ if (likely(!err)) {
+ u64_stats_inc(&pcounters->packets);
+ u64_stats_add(&pcounters->bytes, len);
+ } else {
+ u64_stats_inc(&pcounters->errors);
+ }
+
+ u64_stats_update_end(&pcounters->syncp);
+}
+
+static int seg6_local_input_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct seg6_action_desc *desc;
struct seg6_local_lwt *slwt;
+ unsigned int len = skb->len;
+ int rc;
+
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+ desc = slwt->desc;
+
+ rc = desc->input(skb, slwt);
+
+ if (!seg6_lwtunnel_counters_enabled(slwt))
+ return rc;
+
+ seg6_local_update_counters(slwt, len, rc);
+ return rc;
+}
+
+static int seg6_local_input(struct sk_buff *skb)
+{
if (skb->protocol != htons(ETH_P_IPV6)) {
kfree_skb(skb);
return -EINVAL;
}
- slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
- desc = slwt->desc;
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ seg6_local_input_core);
- return desc->input(skb, slwt);
+ return seg6_local_input_core(dev_net(skb->dev), NULL, skb);
}
static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_ACTION] = { .type = NLA_U32 },
[SEG6_LOCAL_SRH] = { .type = NLA_BINARY },
[SEG6_LOCAL_TABLE] = { .type = NLA_U32 },
+ [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 },
[SEG6_LOCAL_NH4] = { .type = NLA_BINARY,
.len = sizeof(struct in_addr) },
[SEG6_LOCAL_NH6] = { .type = NLA_BINARY,
@@ -656,9 +1257,12 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_IIF] = { .type = NLA_U32 },
[SEG6_LOCAL_OIF] = { .type = NLA_U32 },
[SEG6_LOCAL_BPF] = { .type = NLA_NESTED },
+ [SEG6_LOCAL_COUNTERS] = { .type = NLA_NESTED },
+ [SEG6_LOCAL_FLAVORS] = { .type = NLA_NESTED },
};
-static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
struct ipv6_sr_hdr *srh;
int len;
@@ -670,7 +1274,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
if (len < sizeof(*srh) + sizeof(struct in6_addr))
return -EINVAL;
- if (!seg6_validate_srh(srh, len))
+ if (!seg6_validate_srh(srh, len, false))
return -EINVAL;
slwt->srh = kmemdup(srh, len, GFP_KERNEL);
@@ -710,7 +1314,13 @@ static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(a->srh, b->srh, len);
}
-static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static void destroy_attr_srh(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->srh);
+}
+
+static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]);
@@ -733,7 +1343,56 @@ static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return 0;
}
-static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static struct
+seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ return &slwt->dt_info;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+static int parse_nla_vrftable(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ info->vrf_table = nla_get_u32(attrs[SEG6_LOCAL_VRFTABLE]);
+
+ return 0;
+}
+
+static int put_nla_vrftable(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ if (nla_put_u32(skb, SEG6_LOCAL_VRFTABLE, info->vrf_table))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_end_dt_info *info_a = seg6_possible_end_dt_info(a);
+ struct seg6_end_dt_info *info_b = seg6_possible_end_dt_info(b);
+
+ if (info_a->vrf_table != info_b->vrf_table)
+ return 1;
+
+ return 0;
+}
+
+static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]),
sizeof(struct in_addr));
@@ -759,7 +1418,8 @@ static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr));
}
-static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]),
sizeof(struct in6_addr));
@@ -785,7 +1445,8 @@ static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr));
}
-static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]);
@@ -808,7 +1469,8 @@ static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return 0;
}
-static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]);
@@ -838,7 +1500,8 @@ static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
.len = MAX_PROG_NAME },
};
-static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
struct bpf_prog *p;
@@ -901,16 +1564,324 @@ static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return strcmp(a->bpf.name, b->bpf.name);
}
+static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->bpf.name);
+ if (slwt->bpf.prog)
+ bpf_prog_put(slwt->bpf.prog);
+}
+
+static const struct
+nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
+ [SEG6_LOCAL_CNT_PACKETS] = { .type = NLA_U64 },
+ [SEG6_LOCAL_CNT_BYTES] = { .type = NLA_U64 },
+ [SEG6_LOCAL_CNT_ERRORS] = { .type = NLA_U64 },
+};
+
+static int parse_nla_counters(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct pcpu_seg6_local_counters __percpu *pcounters;
+ struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
+ int ret;
+
+ ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
+ attrs[SEG6_LOCAL_COUNTERS],
+ seg6_local_counters_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ /* basic support for SRv6 Behavior counters requires at least:
+ * packets, bytes and errors.
+ */
+ if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
+ !tb[SEG6_LOCAL_CNT_ERRORS])
+ return -EINVAL;
+
+ /* counters are always zero initialized */
+ pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
+ if (!pcounters)
+ return -ENOMEM;
+
+ slwt->pcpu_counters = pcounters;
+
+ return 0;
+}
+
+static int seg6_local_fill_nla_counters(struct sk_buff *skb,
+ struct seg6_local_counters *counters)
+{
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_local_counters counters = { 0, 0, 0 };
+ struct nlattr *nest;
+ int rc, i;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for_each_possible_cpu(i) {
+ struct pcpu_seg6_local_counters *pcounters;
+ u64 packets, bytes, errors;
+ unsigned int start;
+
+ pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
+ do {
+ start = u64_stats_fetch_begin_irq(&pcounters->syncp);
+
+ packets = u64_stats_read(&pcounters->packets);
+ bytes = u64_stats_read(&pcounters->bytes);
+ errors = u64_stats_read(&pcounters->errors);
+
+ } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
+
+ counters.packets += packets;
+ counters.bytes += bytes;
+ counters.errors += errors;
+ }
+
+ rc = seg6_local_fill_nla_counters(skb, &counters);
+ if (rc < 0) {
+ nla_nest_cancel(skb, nest);
+ return rc;
+ }
+
+ return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ /* a and b are equal if both have pcpu_counters set or not */
+ return (!!((unsigned long)a->pcpu_counters)) ^
+ (!!((unsigned long)b->pcpu_counters));
+}
+
+static void destroy_attr_counters(struct seg6_local_lwt *slwt)
+{
+ free_percpu(slwt->pcpu_counters);
+}
+
+static const
+struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = {
+ [SEG6_LOCAL_FLV_OPERATION] = { .type = NLA_U32 },
+ [SEG6_LOCAL_FLV_LCBLOCK_BITS] = { .type = NLA_U8 },
+ [SEG6_LOCAL_FLV_LCNODE_FN_BITS] = { .type = NLA_U8 },
+};
+
+/* check whether the lengths of the Locator-Block and Locator-Node Function
+ * are compatible with the dimension of a C-SID container.
+ */
+static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len)
+{
+ /* Locator-Block and Locator-Node Function cannot exceed 128 bits
+ * (i.e. C-SID container lenghts).
+ */
+ if (next_csid_chk_cntr_bits(block_len, func_len))
+ return -EINVAL;
+
+ /* Locator-Block length must be greater than zero and evenly divisible
+ * by 8. There must be room for a Locator-Node Function, at least.
+ */
+ if (next_csid_chk_lcblock_bits(block_len))
+ return -EINVAL;
+
+ /* Locator-Node Function length must be greater than zero and evenly
+ * divisible by 8. There must be room for the Locator-Block.
+ */
+ if (next_csid_chk_lcnode_fn_bits(func_len))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int seg6_parse_nla_next_csid_cfg(struct nlattr **tb,
+ struct seg6_flavors_info *finfo,
+ struct netlink_ext_ack *extack)
+{
+ __u8 func_len = SEG6_LOCAL_LCNODE_FN_DBITS;
+ __u8 block_len = SEG6_LOCAL_LCBLOCK_DBITS;
+ int rc;
+
+ if (tb[SEG6_LOCAL_FLV_LCBLOCK_BITS])
+ block_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]);
+
+ if (tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS])
+ func_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]);
+
+ rc = seg6_chk_next_csid_cfg(block_len, func_len);
+ if (rc < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid Locator Block/Node Function lengths");
+ return rc;
+ }
+
+ finfo->lcblock_bits = block_len;
+ finfo->lcnode_func_bits = func_len;
+
+ return 0;
+}
+
+static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1];
+ unsigned long fops;
+ int rc;
+
+ rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX,
+ attrs[SEG6_LOCAL_FLAVORS],
+ seg6_local_flavors_policy, NULL);
+ if (rc < 0)
+ return rc;
+
+ /* this attribute MUST always be present since it represents the Flavor
+ * operation(s) to be carried out.
+ */
+ if (!tb[SEG6_LOCAL_FLV_OPERATION])
+ return -EINVAL;
+
+ fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]);
+ if (fops & ~SEG6_LOCAL_FLV_SUPP_OPS) {
+ NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)");
+ return -EOPNOTSUPP;
+ }
+
+ finfo->flv_ops = fops;
+
+ if (seg6_next_csid_enabled(fops)) {
+ /* Locator-Block and Locator-Node Function lengths can be
+ * provided by the user space. Otherwise, default values are
+ * applied.
+ */
+ rc = seg6_parse_nla_next_csid_cfg(tb, finfo, extack);
+ if (rc < 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int seg6_fill_nla_next_csid_cfg(struct sk_buff *skb,
+ struct seg6_flavors_info *finfo)
+{
+ if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCBLOCK_BITS, finfo->lcblock_bits))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCNODE_FN_BITS,
+ finfo->lcnode_func_bits))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int put_nla_flavors(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+ struct nlattr *nest;
+ int rc;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_FLAVORS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, SEG6_LOCAL_FLV_OPERATION, fops)) {
+ rc = -EMSGSIZE;
+ goto err;
+ }
+
+ if (seg6_next_csid_enabled(fops)) {
+ rc = seg6_fill_nla_next_csid_cfg(skb, finfo);
+ if (rc < 0)
+ goto err;
+ }
+
+ return nla_nest_end(skb, nest);
+
+err:
+ nla_nest_cancel(skb, nest);
+ return rc;
+}
+
+static int seg6_cmp_nla_next_csid_cfg(struct seg6_flavors_info *finfo_a,
+ struct seg6_flavors_info *finfo_b)
+{
+ if (finfo_a->lcblock_bits != finfo_b->lcblock_bits)
+ return 1;
+
+ if (finfo_a->lcnode_func_bits != finfo_b->lcnode_func_bits)
+ return 1;
+
+ return 0;
+}
+
+static int cmp_nla_flavors(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_flavors_info *finfo_a = &a->flv_info;
+ struct seg6_flavors_info *finfo_b = &b->flv_info;
+
+ if (finfo_a->flv_ops != finfo_b->flv_ops)
+ return 1;
+
+ if (seg6_next_csid_enabled(finfo_a->flv_ops)) {
+ if (seg6_cmp_nla_next_csid_cfg(finfo_a, finfo_b))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int encap_size_flavors(struct seg6_local_lwt *slwt)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ int nlsize;
+
+ nlsize = nla_total_size(0) + /* nest SEG6_LOCAL_FLAVORS */
+ nla_total_size(4); /* SEG6_LOCAL_FLV_OPERATION */
+
+ if (seg6_next_csid_enabled(finfo->flv_ops))
+ nlsize += nla_total_size(1) + /* SEG6_LOCAL_FLV_LCBLOCK_BITS */
+ nla_total_size(1); /* SEG6_LOCAL_FLV_LCNODE_FN_BITS */
+
+ return nlsize;
+}
+
struct seg6_action_param {
- int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
+ int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack);
int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b);
+
+ /* optional destroy() callback useful for releasing resources which
+ * have been previously acquired in the corresponding parse()
+ * function.
+ */
+ void (*destroy)(struct seg6_local_lwt *slwt);
};
static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_SRH] = { .parse = parse_nla_srh,
.put = put_nla_srh,
- .cmp = cmp_nla_srh },
+ .cmp = cmp_nla_srh,
+ .destroy = destroy_attr_srh },
[SEG6_LOCAL_TABLE] = { .parse = parse_nla_table,
.put = put_nla_table,
@@ -934,14 +1905,140 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf,
.put = put_nla_bpf,
- .cmp = cmp_nla_bpf },
+ .cmp = cmp_nla_bpf,
+ .destroy = destroy_attr_bpf },
+
+ [SEG6_LOCAL_VRFTABLE] = { .parse = parse_nla_vrftable,
+ .put = put_nla_vrftable,
+ .cmp = cmp_nla_vrftable },
+
+ [SEG6_LOCAL_COUNTERS] = { .parse = parse_nla_counters,
+ .put = put_nla_counters,
+ .cmp = cmp_nla_counters,
+ .destroy = destroy_attr_counters },
+ [SEG6_LOCAL_FLAVORS] = { .parse = parse_nla_flavors,
+ .put = put_nla_flavors,
+ .cmp = cmp_nla_flavors },
};
-static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+/* call the destroy() callback (if available) for each set attribute in
+ * @parsed_attrs, starting from the first attribute up to the @max_parsed
+ * (excluded) attribute.
+ */
+static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_param *param;
+ int i;
+
+ /* Every required seg6local attribute is identified by an ID which is
+ * encoded as a flag (i.e: 1 << ID) in the 'attrs' bitmask;
+ *
+ * We scan the 'parsed_attrs' bitmask, starting from the first attribute
+ * up to the @max_parsed (excluded) attribute.
+ * For each set attribute, we retrieve the corresponding destroy()
+ * callback. If the callback is not available, then we skip to the next
+ * attribute; otherwise, we call the destroy() callback.
+ */
+ for (i = SEG6_LOCAL_SRH; i < max_parsed; ++i) {
+ if (!(parsed_attrs & SEG6_F_ATTR(i)))
+ continue;
+
+ param = &seg6_action_params[i];
+
+ if (param->destroy)
+ param->destroy(slwt);
+ }
+}
+
+/* release all the resources that may have been acquired during parsing
+ * operations.
+ */
+static void destroy_attrs(struct seg6_local_lwt *slwt)
+{
+ unsigned long attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ __destroy_attrs(attrs, SEG6_LOCAL_MAX + 1, slwt);
+}
+
+static int parse_nla_optional_attrs(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ unsigned long parsed_optattrs = 0;
+ struct seg6_action_param *param;
+ int err, i;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; ++i) {
+ if (!(desc->optattrs & SEG6_F_ATTR(i)) || !attrs[i])
+ continue;
+
+ /* once here, the i-th attribute is provided by the
+ * userspace AND it is identified optional as well.
+ */
+ param = &seg6_action_params[i];
+
+ err = param->parse(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_optattrs_err;
+
+ /* current attribute has been correctly parsed */
+ parsed_optattrs |= SEG6_F_ATTR(i);
+ }
+
+ /* store in the tunnel state all the optional attributed successfully
+ * parsed.
+ */
+ slwt->parsed_optattrs = parsed_optattrs;
+
+ return 0;
+
+parse_optattrs_err:
+ __destroy_attrs(parsed_optattrs, i, slwt);
+
+ return err;
+}
+
+/* call the custom constructor of the behavior during its initialization phase
+ * and after that all its attributes have been parsed successfully.
+ */
+static int
+seg6_local_lwtunnel_build_state(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->build_state)
+ return 0;
+
+ return ops->build_state(slwt, cfg, extack);
+}
+
+/* call the custom destructor of the behavior which is invoked before the
+ * tunnel is going to be destroyed.
+ */
+static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->destroy_state)
+ return;
+
+ ops->destroy_state(slwt);
+}
+
+static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
struct seg6_action_param *param;
struct seg6_action_desc *desc;
+ unsigned long invalid_attrs;
int i, err;
desc = __get_action_desc(slwt->action);
@@ -954,24 +2051,58 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
slwt->desc = desc;
slwt->headroom += desc->static_headroom;
- for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (desc->attrs & (1 << i)) {
+ /* Forcing the desc->optattrs *set* and the desc->attrs *set* to be
+ * disjoined, this allow us to release acquired resources by optional
+ * attributes and by required attributes independently from each other
+ * without any interference.
+ * In other terms, we are sure that we do not release some the acquired
+ * resources twice.
+ *
+ * Note that if an attribute is configured both as required and as
+ * optional, it means that the user has messed something up in the
+ * seg6_action_table. Therefore, this check is required for SRv6
+ * behaviors to work properly.
+ */
+ invalid_attrs = desc->attrs & desc->optattrs;
+ if (invalid_attrs) {
+ WARN_ONCE(1,
+ "An attribute cannot be both required AND optional");
+ return -EINVAL;
+ }
+
+ /* parse the required attributes */
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (desc->attrs & SEG6_F_ATTR(i)) {
if (!attrs[i])
return -EINVAL;
param = &seg6_action_params[i];
- err = param->parse(attrs, slwt);
+ err = param->parse(attrs, slwt, extack);
if (err < 0)
- return err;
+ goto parse_attrs_err;
}
}
+ /* parse the optional attributes, if any */
+ err = parse_nla_optional_attrs(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_attrs_err;
+
return 0;
+
+parse_attrs_err:
+ /* release any resource that may have been acquired during the i-1
+ * parse() operations.
+ */
+ __destroy_attrs(desc->attrs, i, slwt);
+
+ return err;
}
-static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
- const void *cfg, struct lwtunnel_state **ts,
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[SEG6_LOCAL_MAX + 1];
@@ -998,10 +2129,14 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
slwt = seg6_local_lwtunnel(newts);
slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]);
- err = parse_nla_action(tb, slwt);
+ err = parse_nla_action(tb, slwt, extack);
if (err < 0)
goto out_free;
+ err = seg6_local_lwtunnel_build_state(slwt, cfg, extack);
+ if (err < 0)
+ goto out_destroy_attrs;
+
newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
newts->headroom = slwt->headroom;
@@ -1010,8 +2145,9 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
return 0;
+out_destroy_attrs:
+ destroy_attrs(slwt);
out_free:
- kfree(slwt->srh);
kfree(newts);
return err;
}
@@ -1020,12 +2156,9 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
{
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
- kfree(slwt->srh);
+ seg6_local_lwtunnel_destroy_state(slwt);
- if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
- kfree(slwt->bpf.name);
- bpf_prog_put(slwt->bpf.prog);
- }
+ destroy_attrs(slwt);
return;
}
@@ -1035,13 +2168,16 @@ static int seg6_local_fill_encap(struct sk_buff *skb,
{
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
struct seg6_action_param *param;
+ unsigned long attrs;
int i, err;
if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
return -EMSGSIZE;
- for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (slwt->desc->attrs & (1 << i)) {
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (attrs & SEG6_F_ATTR(i)) {
param = &seg6_action_params[i];
err = param->put(skb, slwt);
if (err < 0)
@@ -1060,31 +2196,46 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
nlsize = nla_total_size(4); /* action */
- attrs = slwt->desc->attrs;
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
- if (attrs & (1 << SEG6_LOCAL_SRH))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_SRH))
nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
- if (attrs & (1 << SEG6_LOCAL_TABLE))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_NH4))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH4))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_NH6))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH6))
nlsize += nla_total_size(16);
- if (attrs & (1 << SEG6_LOCAL_IIF))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_IIF))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_OIF))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_OIF))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_BPF))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_BPF))
nlsize += nla_total_size(sizeof(struct nlattr)) +
nla_total_size(MAX_PROG_NAME) +
nla_total_size(4);
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_LOCAL_COUNTERS)
+ nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
+ /* SEG6_LOCAL_CNT_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* SEG6_LOCAL_CNT_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* SEG6_LOCAL_CNT_ERRORS */
+ nla_total_size_64bit(sizeof(__u64));
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_FLAVORS))
+ nlsize += encap_size_flavors(slwt);
+
return nlsize;
}
@@ -1093,6 +2244,7 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a,
{
struct seg6_local_lwt *slwt_a, *slwt_b;
struct seg6_action_param *param;
+ unsigned long attrs_a, attrs_b;
int i;
slwt_a = seg6_local_lwtunnel(a);
@@ -1101,11 +2253,14 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a,
if (slwt_a->action != slwt_b->action)
return 1;
- if (slwt_a->desc->attrs != slwt_b->desc->attrs)
+ attrs_a = slwt_a->desc->attrs | slwt_a->parsed_optattrs;
+ attrs_b = slwt_b->desc->attrs | slwt_b->parsed_optattrs;
+
+ if (attrs_a != attrs_b)
return 1;
- for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (slwt_a->desc->attrs & (1 << i)) {
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (attrs_a & SEG6_F_ATTR(i)) {
param = &seg6_action_params[i];
if (param->cmp(slwt_a, slwt_b))
return 1;
@@ -1127,6 +2282,24 @@ static const struct lwtunnel_encap_ops seg6_local_ops = {
int __init seg6_local_init(void)
{
+ /* If the max total number of defined attributes is reached, then your
+ * kernel build stops here.
+ *
+ * This check is required to avoid arithmetic overflows when processing
+ * behavior attributes and the maximum number of defined attributes
+ * exceeds the allowed value.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long));
+
+ /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in
+ * bits) have been changed with invalid values, kernel build stops
+ * here.
+ */
+ BUILD_BUG_ON(next_csid_chk_cntr_bits(SEG6_LOCAL_LCBLOCK_DBITS,
+ SEG6_LOCAL_LCNODE_FN_DBITS));
+ BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS));
+ BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS));
+
return lwtunnel_encap_add_ops(&seg6_local_ops,
LWTUNNEL_ENCAP_SEG6_LOCAL);
}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 98954830c40b..5703d3cbea9b 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -83,6 +83,13 @@ struct sit_net {
struct net_device *fb_tunnel_dev;
};
+static inline struct sit_net *dev_to_sit_net(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ return net_generic(t->net, sit_net_id);
+}
+
/*
* Must be invoked with rcu_read_lock
*/
@@ -197,7 +204,7 @@ static int ipip6_tunnel_create(struct net_device *dev)
struct sit_net *sitn = net_generic(net, sit_net_id);
int err;
- memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
+ __dev_addr_set(dev, &t->parms.iph.saddr, 4);
memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
if ((__force u16)t->parms.i_flags & SIT_ISATAP)
@@ -211,8 +218,6 @@ static int ipip6_tunnel_create(struct net_device *dev)
ipip6_tunnel_clone_6rd(dev, sitn);
- dev_hold(dev);
-
ipip6_tunnel_link(sitn, t);
return 0;
@@ -249,7 +254,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
- strlcpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name, IFNAMSIZ);
} else {
strcpy(name, "sit%d");
}
@@ -266,6 +271,9 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
if (ipip6_tunnel_create(dev) < 0)
goto failed_free;
+ if (!parms->name[0])
+ strcpy(parms->name, dev->name);
+
return nt;
failed_free:
@@ -291,14 +299,17 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
}
-static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
- struct ip_tunnel_prl __user *a)
+static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a)
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_prl kprl, *kp;
struct ip_tunnel_prl_entry *prl;
unsigned int cmax, c = 0, ca, len;
int ret = 0;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ return -EINVAL;
+
if (copy_from_user(&kprl, a, sizeof(kprl)))
return -EFAULT;
cmax = kprl.datalen / sizeof(kprl);
@@ -309,12 +320,10 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
* we try harder to allocate.
*/
kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
- kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) :
+ kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) :
NULL;
- rcu_read_lock();
-
- ca = t->prl_count < cmax ? t->prl_count : cmax;
+ ca = min(t->prl_count, cmax);
if (!kp) {
/* We don't try hard to allocate much memory for
@@ -322,14 +331,15 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
* For root users, retry allocating enough memory for
* the answer.
*/
- kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
+ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT |
+ __GFP_NOWARN);
if (!kp) {
ret = -ENOMEM;
goto out;
}
}
- c = 0;
+ rcu_read_lock();
for_each_prl_rcu(t->prl) {
if (c >= cmax)
break;
@@ -341,7 +351,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
if (kprl.addr != htonl(INADDR_ANY))
break;
}
-out:
+
rcu_read_unlock();
len = sizeof(*kp) * c;
@@ -350,7 +360,7 @@ out:
ret = -EFAULT;
kfree(kp);
-
+out:
return ret;
}
@@ -441,6 +451,35 @@ out:
return err;
}
+static int ipip6_tunnel_prl_ctl(struct net_device *dev,
+ struct ip_tunnel_prl __user *data, int cmd)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_prl prl;
+ int err;
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ return -EINVAL;
+
+ if (copy_from_user(&prl, data, sizeof(prl)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case SIOCDELPRL:
+ err = ipip6_tunnel_del_prl(t, &prl);
+ break;
+ case SIOCADDPRL:
+ case SIOCCHGPRL:
+ err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
+ break;
+ }
+ dst_cache_reset(&t->dst_cache);
+ netdev_state_change(dev);
+ return err;
+}
+
static int
isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t)
{
@@ -480,7 +519,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
ipip6_tunnel_del_prl(tunnel, NULL);
}
dst_cache_reset(&tunnel->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &tunnel->dev_tracker);
}
static int ipip6_err(struct sk_buff *skb, u32 info)
@@ -645,8 +684,6 @@ static int ipip6_rcv(struct sk_buff *skb)
tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
iph->saddr, iph->daddr, sifindex);
if (tunnel) {
- struct pcpu_sw_netstats *tstats;
-
if (tunnel->parms.iph.protocol != IPPROTO_IPV6 &&
tunnel->parms.iph.protocol != 0)
goto out;
@@ -669,6 +706,8 @@ static int ipip6_rcv(struct sk_buff *skb)
* old iph is no longer valid
*/
iph = (const struct iphdr *)skb_mac_header(skb);
+ skb_reset_mac_header(skb);
+
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
@@ -681,11 +720,7 @@ static int ipip6_rcv(struct sk_buff *skb)
}
}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
netif_rx(skb);
@@ -739,6 +774,8 @@ static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tpi = &ipip_tpi;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
+ skb_reset_mac_header(skb);
+
return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
}
@@ -911,7 +948,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr);
}
- if (rt->rt_type != RTN_UNICAST) {
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
ip_rt_put(rt);
dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
@@ -932,7 +969,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
if (df) {
mtu = dst_mtu(&rt->dst) - t_hlen;
- if (mtu < 68) {
+ if (mtu < IPV4_MIN_MTU) {
dev->stats.collisions++;
ip_rt_put(rt);
goto tx_error;
@@ -947,7 +984,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->len > mtu && !skb_is_gso(skb)) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
ip_rt_put(rt);
goto tx_error;
}
@@ -1087,11 +1124,12 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
if (tdev && !netif_is_l3_master(tdev)) {
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ int mtu;
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - t_hlen;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ mtu = tdev->mtu - t_hlen;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
@@ -1105,7 +1143,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p,
synchronize_net();
t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr;
- memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
+ __dev_addr_set(t->dev, &p->iph.saddr, 4);
memcpy(t->dev->broadcast, &p->iph.daddr, 4);
ipip6_tunnel_link(sitn, t);
t->parms.iph.ttl = p->iph.ttl;
@@ -1151,7 +1189,54 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
netdev_state_change(t->dev);
return 0;
}
-#endif
+
+static int
+ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_6rd ip6rd;
+ struct ip_tunnel_parm p;
+
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ if (copy_from_user(&p, data, sizeof(p)))
+ return -EFAULT;
+ t = ipip6_tunnel_locate(t->net, &p, 0);
+ }
+ if (!t)
+ t = netdev_priv(dev);
+
+ ip6rd.prefix = t->ip6rd.prefix;
+ ip6rd.relay_prefix = t->ip6rd.relay_prefix;
+ ip6rd.prefixlen = t->ip6rd.prefixlen;
+ ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
+ if (copy_to_user(data, &ip6rd, sizeof(ip6rd)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data,
+ int cmd)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_6rd ip6rd;
+ int err;
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&ip6rd, data, sizeof(ip6rd)))
+ return -EFAULT;
+
+ if (cmd != SIOCDEL6RD) {
+ err = ipip6_tunnel_update_6rd(t, &ip6rd);
+ if (err < 0)
+ return err;
+ } else
+ ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev));
+ return 0;
+}
+
+#endif /* CONFIG_IPV6_SIT_6RD */
static bool ipip6_valid_ip_proto(u8 ipproto)
{
@@ -1164,194 +1249,156 @@ static bool ipip6_valid_ip_proto(u8 ipproto)
}
static int
-ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p)
+{
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!ipip6_valid_ip_proto(p->iph.protocol))
+ return -EINVAL;
+ if (p->iph.version != 4 ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)))
+ return -EINVAL;
+
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ return 0;
+}
+
+static int
+ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p)
{
- int err = 0;
- struct ip_tunnel_parm p;
- struct ip_tunnel_prl prl;
struct ip_tunnel *t = netdev_priv(dev);
- struct net *net = t->net;
- struct sit_net *sitn = net_generic(net, sit_net_id);
-#ifdef CONFIG_IPV6_SIT_6RD
- struct ip_tunnel_6rd ip6rd;
-#endif
- switch (cmd) {
- case SIOCGETTUNNEL:
-#ifdef CONFIG_IPV6_SIT_6RD
- case SIOCGET6RD:
-#endif
- if (dev == sitn->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip6_tunnel_locate(net, &p, 0);
- if (!t)
- t = netdev_priv(dev);
- }
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (!t)
+ t = netdev_priv(dev);
+ memcpy(p, &t->parms, sizeof(*p));
+ return 0;
+}
- err = -EFAULT;
- if (cmd == SIOCGETTUNNEL) {
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
- sizeof(p)))
- goto done;
-#ifdef CONFIG_IPV6_SIT_6RD
+static int
+ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = __ipip6_tunnel_ioctl_validate(t->net, p);
+ if (err)
+ return err;
+
+ t = ipip6_tunnel_locate(t->net, p, 1);
+ if (!t)
+ return -ENOBUFS;
+ return 0;
+}
+
+static int
+ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = __ipip6_tunnel_ioctl_validate(t->net, p);
+ if (err)
+ return err;
+
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ if (!t)
+ return -ENOENT;
+ } else {
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
} else {
- ip6rd.prefix = t->ip6rd.prefix;
- ip6rd.relay_prefix = t->ip6rd.relay_prefix;
- ip6rd.prefixlen = t->ip6rd.prefixlen;
- ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd,
- sizeof(ip6rd)))
- goto done;
-#endif
+ if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr))
+ return -EINVAL;
+ t = netdev_priv(dev);
}
- err = 0;
- break;
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (!ipip6_valid_ip_proto(p.iph.protocol))
- goto done;
- if (p.iph.version != 4 ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- }
+ ipip6_tunnel_update(t, p, t->fwmark);
+ }
- ipip6_tunnel_update(t, &p, t->fwmark);
- }
+ return 0;
+}
- if (t) {
- err = 0;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
+static int
+ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (!t)
+ return -ENOENT;
+ if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev))
+ return -EPERM;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ return 0;
+}
+
+static int
+ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ return ipip6_tunnel_get(dev, p);
+ case SIOCADDTUNNEL:
+ return ipip6_tunnel_add(dev, p);
+ case SIOCCHGTUNNEL:
+ return ipip6_tunnel_change(dev, p);
case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == sitn->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- t = ipip6_tunnel_locate(net, &p, 0);
- if (!t)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(sitn->fb_tunnel_dev))
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
+ return ipip6_tunnel_del(dev, p);
+ default:
+ return -EINVAL;
+ }
+}
+static int
+ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
+{
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ case SIOCDELTUNNEL:
+ return ip_tunnel_siocdevprivate(dev, ifr, data, cmd);
case SIOCGETPRL:
- err = -EINVAL;
- if (dev == sitn->fb_tunnel_dev)
- goto done;
- err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
- break;
-
+ return ipip6_tunnel_get_prl(dev, data);
case SIOCADDPRL:
case SIOCDELPRL:
case SIOCCHGPRL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
- err = -EINVAL;
- if (dev == sitn->fb_tunnel_dev)
- goto done;
- err = -EFAULT;
- if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
- goto done;
-
- switch (cmd) {
- case SIOCDELPRL:
- err = ipip6_tunnel_del_prl(t, &prl);
- break;
- case SIOCADDPRL:
- case SIOCCHGPRL:
- err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
- break;
- }
- dst_cache_reset(&t->dst_cache);
- netdev_state_change(dev);
- break;
-
+ return ipip6_tunnel_prl_ctl(dev, data, cmd);
#ifdef CONFIG_IPV6_SIT_6RD
+ case SIOCGET6RD:
+ return ipip6_tunnel_get6rd(dev, data);
case SIOCADD6RD:
case SIOCCHG6RD:
case SIOCDEL6RD:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data,
- sizeof(ip6rd)))
- goto done;
-
- if (cmd != SIOCDEL6RD) {
- err = ipip6_tunnel_update_6rd(t, &ip6rd);
- if (err < 0)
- goto done;
- } else
- ipip6_tunnel_clone_6rd(dev, sitn);
-
- err = 0;
- break;
+ return ipip6_tunnel_6rdctl(dev, data, cmd);
#endif
-
default:
- err = -EINVAL;
+ return -EINVAL;
}
-
-done:
- return err;
}
static const struct net_device_ops ipip6_netdev_ops = {
.ndo_init = ipip6_tunnel_init,
.ndo_uninit = ipip6_tunnel_uninit,
.ndo_start_xmit = sit_tunnel_xmit,
- .ndo_do_ioctl = ipip6_tunnel_ioctl,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipip6_tunnel_ctl,
};
static void ipip6_dev_free(struct net_device *dev)
@@ -1374,11 +1421,11 @@ static void ipip6_tunnel_setup(struct net_device *dev)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->netdev_ops = &ipip6_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
- dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
dev->min_mtu = IPV6_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU - t_hlen;
@@ -1410,7 +1457,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
dev->tstats = NULL;
return err;
}
-
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
return 0;
}
@@ -1426,7 +1473,6 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
iph->ihl = 5;
iph->ttl = 64;
- dev_hold(dev);
rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
}
@@ -1459,71 +1505,12 @@ static void ipip6_netlink_parms(struct nlattr *data[],
if (!data)
return;
- if (data[IFLA_IPTUN_LINK])
- parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
-
- if (data[IFLA_IPTUN_LOCAL])
- parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
-
- if (data[IFLA_IPTUN_REMOTE])
- parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
-
- if (data[IFLA_IPTUN_TTL]) {
- parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
- if (parms->iph.ttl)
- parms->iph.frag_off = htons(IP_DF);
- }
-
- if (data[IFLA_IPTUN_TOS])
- parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
-
- if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
- parms->iph.frag_off = htons(IP_DF);
-
- if (data[IFLA_IPTUN_FLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
-
- if (data[IFLA_IPTUN_PROTO])
- parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ ip_tunnel_netlink_parms(data, parms);
if (data[IFLA_IPTUN_FWMARK])
*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-/* This function returns true when ENCAP attributes are present in the nl msg */
-static bool ipip6_netlink_encap_parms(struct nlattr *data[],
- struct ip_tunnel_encap *ipencap)
-{
- bool ret = false;
-
- memset(ipencap, 0, sizeof(*ipencap));
-
- if (!data)
- return ret;
-
- if (data[IFLA_IPTUN_ENCAP_TYPE]) {
- ret = true;
- ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
- ret = true;
- ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_SPORT]) {
- ret = true;
- ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_DPORT]) {
- ret = true;
- ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
- }
-
- return ret;
-}
-
#ifdef CONFIG_IPV6_SIT_6RD
/* This function returns true when 6RD attributes are present in the nl msg */
static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
@@ -1575,7 +1562,7 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
nt = netdev_priv(dev);
- if (ipip6_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
err = ip_tunnel_encap_setup(nt, &ipencap);
if (err < 0)
return err;
@@ -1599,8 +1586,11 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
}
#ifdef CONFIG_IPV6_SIT_6RD
- if (ipip6_netlink_6rd_parms(data, &ip6rd))
+ if (ipip6_netlink_6rd_parms(data, &ip6rd)) {
err = ipip6_tunnel_update_6rd(nt, &ip6rd);
+ if (err < 0)
+ unregister_netdevice_queue(dev, NULL);
+ }
#endif
return err;
@@ -1624,7 +1614,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
if (dev == sitn->fb_tunnel_dev)
return -EINVAL;
- if (ipip6_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
@@ -1818,9 +1808,9 @@ static void __net_exit sit_destroy_tunnels(struct net *net,
if (dev->rtnl_link_ops == &sit_link_ops)
unregister_netdevice_queue(dev, head);
- for (prio = 1; prio < 4; prio++) {
+ for (prio = 0; prio < 4; prio++) {
int h;
- for (h = 0; h < IP6_SIT_HASH_SIZE; h++) {
+ for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) {
struct ip_tunnel *t;
t = rtnl_dereference(sitn->tunnels[prio][h]);
@@ -1878,7 +1868,6 @@ static int __net_init sit_init_net(struct net *net)
return 0;
err_reg_dev:
- ipip6_dev_free(sitn->fb_tunnel_dev);
free_netdev(sitn->fb_tunnel_dev);
err_alloc_dev:
return err;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 13235a012388..5014aa663452 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -20,7 +20,7 @@
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
-static siphash_key_t syncookie6_secret[2] __read_mostly;
+static siphash_aligned_key_t syncookie6_secret[2];
/* RFC 2460, Section 8.3:
* [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -136,12 +136,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
__u32 cookie = ntohl(th->ack_seq) - 1;
struct sock *ret = sk;
struct request_sock *req;
- int mss;
+ int full_space, mss;
struct dst_entry *dst;
__u8 rcv_wscale;
u32 tsoff = 0;
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) ||
+ !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk))
@@ -170,7 +171,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
+ req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops,
+ &tcp_request_sock_ipv6_ops, sk, skb);
if (!req)
goto out;
@@ -178,9 +180,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
treq = tcp_rsk(req);
treq->tfo_listener = false;
- if (IS_ENABLED(CONFIG_MPTCP))
- treq->is_mptcp = 0;
-
if (security_inet_conn_request(sk, skb, req))
goto out_free;
@@ -236,7 +235,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst))
@@ -244,7 +243,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
}
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
- tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index ec8fcfc60a27..94a0a294c6a1 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -17,24 +17,44 @@
#include <net/addrconf.h>
#include <net/inet_frag.h>
#include <net/netevent.h>
+#include <net/ip_fib.h>
#ifdef CONFIG_NETLABEL
#include <net/calipso.h>
#endif
+#include <linux/ioam6.h>
static int flowlabel_reflect_max = 0x7;
-static int auto_flowlabels_min;
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
+static u32 rt6_multipath_hash_fields_all_mask =
+ FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static u32 ioam6_id_max = IOAM6_DEFAULT_ID;
+static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE;
static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int ret;
net = container_of(table->data, struct net,
ipv6.sysctl.multipath_hash_policy);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+
+static int
+proc_rt6_multipath_hash_fields(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv6.sysctl.multipath_hash_fields);
+ ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
@@ -45,39 +65,38 @@ static struct ctl_table ipv6_table_template[] = {
{
.procname = "bindv6only",
.data = &init_net.ipv6.sysctl.bindv6only,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "anycast_src_echo_reply",
.data = &init_net.ipv6.sysctl.anycast_src_echo_reply,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "flowlabel_consistency",
.data = &init_net.ipv6.sysctl.flowlabel_consistency,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "auto_flowlabels",
.data = &init_net.ipv6.sysctl.auto_flowlabels,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &auto_flowlabels_min,
+ .proc_handler = proc_dou8vec_minmax,
.extra2 = &auto_flowlabels_max
},
{
.procname = "fwmark_reflect",
.data = &init_net.ipv6.sysctl.fwmark_reflect,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "idgen_retries",
@@ -96,16 +115,16 @@ static struct ctl_table ipv6_table_template[] = {
{
.procname = "flowlabel_state_ranges",
.data = &init_net.ipv6.sysctl.flowlabel_state_ranges,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv6.sysctl.ip_nonlocal_bind,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "flowlabel_reflect",
@@ -147,11 +166,20 @@ static struct ctl_table ipv6_table_template[] = {
{
.procname = "fib_multipath_hash_policy",
.data = &init_net.ipv6.sysctl.multipath_hash_policy,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_rt6_multipath_hash_policy,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
+ .procname = "fib_multipath_hash_fields",
+ .data = &init_net.ipv6.sysctl.multipath_hash_fields,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_rt6_multipath_hash_fields,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &rt6_multipath_hash_fields_all_mask,
},
{
.procname = "seg6_flowlabel",
@@ -160,6 +188,31 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "fib_notify_on_flag_change",
+ .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "ioam6_id",
+ .data = &init_net.ipv6.sysctl.ioam6_id,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = &ioam6_id_max,
+ },
+ {
+ .procname = "ioam6_id_wide",
+ .data = &init_net.ipv6.sysctl.ioam6_id_wide,
+ .maxlen = sizeof(u64),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra2 = &ioam6_id_wide_max,
+ },
{ }
};
@@ -203,29 +256,16 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
struct ctl_table *ipv6_table;
struct ctl_table *ipv6_route_table;
struct ctl_table *ipv6_icmp_table;
- int err;
+ int err, i;
err = -ENOMEM;
ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
GFP_KERNEL);
if (!ipv6_table)
goto out;
- ipv6_table[0].data = &net->ipv6.sysctl.bindv6only;
- ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply;
- ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency;
- ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels;
- ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
- ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
- ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
- ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
- ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
- ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect;
- ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt;
- ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
- ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
- ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
- ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
- ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++)
+ ipv6_table[i].data += (void *)net - (void *)&init_net;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index eaf09e6b7844..2a3f9296df1e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -72,7 +72,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
static const struct inet_connection_sock_af_ops ipv6_mapped;
const struct inet_connection_sock_af_ops ipv6_specific;
@@ -107,9 +107,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
if (dst && dst_hold_safe(dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
- sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
+ sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}
@@ -146,17 +146,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct inet_timewait_death_row *tcp_death_row;
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p, final;
+ struct net *net = sock_net(sk);
struct ipv6_txoptions *opt;
- struct flowi6 fl6;
struct dst_entry *dst;
+ struct flowi6 fl6;
int addr_type;
int err;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -230,14 +231,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -ENETUNREACH;
sin.sin_family = AF_INET;
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
- icsk->icsk_af_ops = &ipv6_mapped;
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped);
if (sk_is_mptcp(sk))
mptcpv6_handle_mapped(sk, true);
sk->sk_backlog_rcv = tcp_v4_do_rcv;
@@ -249,7 +251,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err) {
icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &ipv6_specific;
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific);
if (sk_is_mptcp(sk))
mptcpv6_handle_mapped(sk, false);
sk->sk_backlog_rcv = tcp_v6_do_rcv;
@@ -278,17 +281,35 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
}
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
if (!saddr) {
+ struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
+ struct in6_addr prev_v6_rcv_saddr;
+
+ if (icsk->icsk_bind2_hash) {
+ prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
+ sk, net, inet->inet_num);
+ prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ }
saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;
+
+ if (prev_addr_hashbucket) {
+ err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
+ if (err) {
+ sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
+ goto failure;
+ }
+ }
}
/* set the source address */
@@ -321,8 +342,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
inet->inet_dport));
- tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk),
- np->saddr.s6_addr32,
+ tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32);
}
@@ -348,11 +368,20 @@ failure:
static void tcp_v6_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
+ u32 mtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
- dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+ mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+
+ /* Drop requests trying to increase our current mss.
+ * Check done in __ip6_rt_update_pmtu() is too late.
+ */
+ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache)
+ return;
+
+ dst = inet6_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -376,7 +405,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bool fatal;
int err;
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
@@ -405,9 +434,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_CLOSE)
goto out;
- if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto out;
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
}
tp = tcp_sk(sk);
@@ -433,6 +465,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
if (type == ICMPV6_PKT_TOOBIG) {
+ u32 mtu = ntohl(info);
+
/* We are not interested in TCP_LISTEN and open_requests
* (SYN-ACKs send out by Linux are always <576bytes so
* they should go through unfragmented).
@@ -443,7 +477,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!ip6_sk_accept_pmtu(sk))
goto out;
- tp->mtu_info = ntohl(info);
+ if (mtu < IPV6_MIN_MTU)
+ goto out;
+
+ WRITE_ONCE(tp->mtu_info, mtu);
+
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
@@ -458,24 +496,35 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
+ ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th);
+
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
- sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+ sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
tcp_done(sk);
} else
sk->sk_err_soft = err;
goto out;
+ case TCP_LISTEN:
+ break;
+ default:
+ /* check if this ICMP message allows revert of backoff.
+ * (see RFC 6069)
+ */
+ if (!fastopen && type == ICMPV6_DEST_UNREACH &&
+ code == ICMPV6_NOROUTE)
+ tcp_ld_RTO_revert(sk, seq);
}
if (!sock_owned_by_user(sk) && np->recverr) {
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
} else
sk->sk_err_soft = err;
@@ -490,7 +539,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
@@ -498,13 +548,14 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
+ u8 tclass;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
@@ -514,12 +565,21 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
if (np->repflow && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
+ tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (np->tclass & INET_ECN_MASK) :
+ np->tclass;
+
+ if (!INET_ECN_is_capable(tclass) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tclass |= INET_ECN_ECT_0;
+
rcu_read_lock();
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
- sk->sk_priority);
+ err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt,
+ tclass, sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -532,7 +592,7 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
- kfree_skb(inet_rsk(req)->pktopts);
+ consume_skb(inet_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
@@ -556,22 +616,25 @@ static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
}
static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
int l3index = 0;
u8 prefixlen;
+ u8 flags;
if (optlen < sizeof(cmd))
return -EINVAL;
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin6->sin6_family != AF_INET6)
return -EINVAL;
+ flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
prefixlen = cmd.tcpm_prefixlen;
@@ -582,7 +645,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128;
}
- if (optname == TCP_MD5SIG_EXT &&
+ if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
struct net_device *dev;
@@ -603,9 +666,9 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
AF_INET, prefixlen,
- l3index);
+ l3index, flags);
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
- AF_INET6, prefixlen, l3index);
+ AF_INET6, prefixlen, l3index, flags);
}
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
@@ -613,12 +676,12 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
- AF_INET, prefixlen, l3index,
+ AF_INET, prefixlen, l3index, flags,
cmd.tcpm_key, cmd.tcpm_keylen,
GFP_KERNEL);
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
- AF_INET6, prefixlen, l3index,
+ AF_INET6, prefixlen, l3index, flags,
cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
@@ -729,57 +792,6 @@ clear_hash_noput:
#endif
-static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb,
- int dif, int sdif)
-{
-#ifdef CONFIG_TCP_MD5SIG
- const __u8 *hash_location = NULL;
- struct tcp_md5sig_key *hash_expected;
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
- int genhash, l3index;
- u8 newhash[16];
-
- /* sdif set, means packet ingressed via a device
- * in an L3 domain and dif is set to the l3mdev
- */
- l3index = sdif ? dif : 0;
-
- hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr, l3index);
- hash_location = tcp_parse_md5sig_option(th);
-
- /* We've parsed the options - do we have a hash? */
- if (!hash_expected && !hash_location)
- return false;
-
- if (hash_expected && !hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return true;
- }
-
- if (!hash_expected && hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return true;
- }
-
- /* check the signature */
- genhash = tcp_v6_md5_hash_skb(newhash,
- hash_expected,
- NULL, skb);
-
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
- genhash ? "failed" : "mismatch",
- &ip6h->saddr, ntohs(th->source),
- &ip6h->daddr, ntohs(th->dest), l3index);
- return true;
- }
-#endif
- return false;
-}
-
static void tcp_v6_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -807,9 +819,15 @@ static void tcp_v6_init_req(struct request_sock *req,
}
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
+ struct sk_buff *skb,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req)
{
+ tcp_v6_init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
+
return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
}
@@ -830,7 +848,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.req_md5_lookup = tcp_v6_md5_lookup,
.calc_md5_hash = tcp_v6_md5_hash_skb,
#endif
- .init_req = tcp_v6_init_req,
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v6_init_sequence,
#endif
@@ -843,7 +860,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, __be32 label, u32 priority)
+ u8 tclass, __be32 label, u32 priority, u32 txhash)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@@ -852,8 +869,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
unsigned int tot_len = sizeof(struct tcphdr);
+ __be32 mrst = 0, *topt;
struct dst_entry *dst;
- __be32 *topt;
__u32 mark = 0;
if (tsecr)
@@ -863,12 +880,20 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
tot_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
- buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
- GFP_ATOMIC);
+#ifdef CONFIG_MPTCP
+ if (rst && !key) {
+ mrst = mptcp_reset_option(skb);
+
+ if (mrst)
+ tot_len += sizeof(__be32);
+ }
+#endif
+
+ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (!buff)
return;
- skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
+ skb_reserve(buff, MAX_TCP_HEADER);
t1 = skb_push(buff, tot_len);
skb_reset_transport_header(buff);
@@ -893,6 +918,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
*topt++ = htonl(tsecr);
}
+ if (mrst)
+ *topt++ = mrst;
+
#ifdef CONFIG_TCP_MD5SIG
if (key) {
*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
@@ -909,7 +937,6 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowlabel = label;
buff->ip_summed = CHECKSUM_PARTIAL;
- buff->csum = 0;
__tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
@@ -924,31 +951,34 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
}
if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT) {
+ if (sk->sk_state == TCP_TIME_WAIT)
mark = inet_twsk(sk)->tw_mark;
- /* autoflowlabel relies on buff->hash */
- skb_set_hash(buff, inet_twsk(sk)->tw_txhash,
- PKT_HASH_TYPE_L4);
- } else {
+ else
mark = sk->sk_mark;
- }
- buff->tstamp = tcp_transmit_time(sk);
+ skb_set_delivery_time(buff, tcp_transmit_time(sk), true);
+ }
+ if (txhash) {
+ /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */
+ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4);
}
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
* Underlying function will use this to retrieve the network
* namespace
*/
- dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
+ if (sk && sk->sk_state != TCP_TIME_WAIT)
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/
+ else
+ dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
- priority);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
+ tclass & ~INET_ECN_MASK, priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -973,6 +1003,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
__be32 label = 0;
u32 priority = 0;
struct net *net;
+ u32 txhash = 0;
int oif = 0;
if (th->rst)
@@ -1008,11 +1039,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(net,
- &tcp_hashinfo, NULL, 0,
- &ipv6h->saddr,
- th->source, &ipv6h->daddr,
- ntohs(th->source), dif, sdif);
+ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
+ NULL, 0, &ipv6h->saddr, th->source,
+ &ipv6h->daddr, ntohs(th->source),
+ dif, sdif);
if (!sk1)
goto out;
@@ -1046,18 +1076,20 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (np->repflow)
label = ip6_flowlabel(ipv6h);
priority = sk->sk_priority;
+ txhash = sk->sk_hash;
}
if (sk->sk_state == TCP_TIME_WAIT) {
label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
priority = inet_twsk(sk)->tw_priority;
+ txhash = inet_twsk(sk)->tw_txhash;
}
} else {
if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
- label, priority);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1,
+ ipv6_get_dsfield(ipv6h), label, priority, txhash);
#ifdef CONFIG_TCP_MD5SIG
out:
@@ -1068,10 +1100,10 @@ out:
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
- __be32 label, u32 priority)
+ __be32 label, u32 priority, u32 txhash)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
- tclass, label, priority);
+ tclass, label, priority, txhash);
}
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -1083,7 +1115,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority);
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority,
+ tw->tw_txhash);
inet_twsk_put(tw);
}
@@ -1110,7 +1143,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
- 0, 0, sk->sk_priority);
+ ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority,
+ tcp_rsk(req)->txhash);
}
@@ -1148,6 +1182,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (!ipv6_unicast_destination(skb))
goto drop;
+ if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+ __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+ return 0;
+ }
+
return tcp_conn_request(&tcp6_request_sock_ops,
&tcp_request_sock_ipv6_ops, sk, skb);
@@ -1177,6 +1216,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct ipv6_txoptions *opt;
struct inet_sock *newinet;
+ bool found_dup_sk = false;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
@@ -1198,7 +1238,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);
- newinet = inet_sk(newsk);
newnp = tcp_inet6_sk(newsk);
newtp = tcp_sk(newsk);
@@ -1298,6 +1337,12 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,
@@ -1338,7 +1383,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
- AF_INET6, 128, l3index, key->key, key->keylen,
+ AF_INET6, 128, l3index, key->flags, key->key, key->keylen,
sk_gfp_mask(sk, GFP_ATOMIC));
}
#endif
@@ -1348,7 +1393,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
tcp_done(newsk);
goto out;
}
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
if (*own_req) {
tcp_move_syn(newtp, req);
@@ -1363,6 +1409,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
skb_set_owner_r(newnp->pktoptions, newsk);
}
}
+ } else {
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should only be executed in the
+ * syncookie case only
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
}
return newsk;
@@ -1376,6 +1431,8 @@ out:
return NULL;
}
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1384,10 +1441,12 @@ out:
* This is because we cannot sleep with the original spinlock
* held.
*/
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct sk_buff *opt_skb = NULL;
+ enum skb_drop_reason reason;
struct tcp_sock *tp;
/* Imagine: socket is IPv6. IPv4 packet arrives,
@@ -1422,16 +1481,21 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (np->rxopt.all)
opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
- if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
- dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
+ INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
+ dst, sk->sk_rx_dst_cookie) == NULL) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
- sk->sk_rx_dst = NULL;
}
}
@@ -1471,9 +1535,11 @@ reset:
discard:
if (opt_skb)
__kfree_skb(opt_skb);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
csum_err:
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1508,7 +1574,7 @@ ipv6_pktoptions:
}
}
- kfree_skb(opt_skb);
+ consume_skb(opt_skb);
return 0;
}
@@ -1538,7 +1604,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb_to_free;
+ enum skb_drop_reason drop_reason;
int sdif = inet6_sdif(skb);
int dif = inet6_iif(skb);
const struct tcphdr *th;
@@ -1548,6 +1614,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
int ret;
struct net *net = dev_net(skb->dev);
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1561,8 +1628,10 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
- if (unlikely(th->doff < sizeof(struct tcphdr)/4))
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
+ }
if (!pskb_may_pull(skb, th->doff*4))
goto discard_it;
@@ -1573,7 +1642,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
+ sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
@@ -1589,7 +1658,10 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) {
+ drop_reason = tcp_inbound_md5_hash(sk, skb,
+ &hdr->saddr, &hdr->daddr,
+ AF_INET6, dif, sdif);
+ if (drop_reason) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
@@ -1599,10 +1671,18 @@ process:
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
+ nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+ if (!nsk) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sk = nsk;
+ /* reuseport_migrate_sock() has already held one sk_refcnt
+ * before returning.
+ */
+ } else {
+ sock_hold(sk);
}
- sock_hold(sk);
refcounted = true;
nsk = NULL;
if (!tcp_filter(sk, skb)) {
@@ -1610,6 +1690,8 @@ process:
hdr = ipv6_hdr(skb);
tcp_v6_fill_cb(skb, hdr, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
+ } else {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
}
if (!nsk) {
reqsk_put(req);
@@ -1636,19 +1718,29 @@ process:
return 0;
}
}
- if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
+
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
}
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
+ }
- if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif))
+ drop_reason = tcp_inbound_md5_hash(sk, skb, &hdr->saddr, &hdr->daddr,
+ AF_INET6, dif, sdif);
+ if (drop_reason)
goto discard_and_relse;
- if (tcp_filter(sk, skb))
+ if (tcp_filter(sk, skb)) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto discard_and_relse;
+ }
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
tcp_v6_fill_cb(skb, hdr, th);
@@ -1666,23 +1758,19 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- skb_to_free = sk->sk_rx_skb_cache;
- sk->sk_rx_skb_cache = NULL;
ret = tcp_v6_do_rcv(sk, skb);
} else {
- if (tcp_add_backlog(sk, skb))
+ if (tcp_add_backlog(sk, skb, &drop_reason))
goto discard_and_relse;
- skb_to_free = NULL;
}
bh_unlock_sock(sk);
- if (skb_to_free)
- __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
sock_put(sk);
return ret ? -1 : 0;
no_tcp_socket:
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
@@ -1690,6 +1778,8 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
+ drop_reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
@@ -1698,7 +1788,8 @@ bad_packet:
}
discard_it:
- kfree_skb(skb);
+ SKB_DR_OR(drop_reason, NOT_SPECIFIED);
+ kfree_skb_reason(skb, drop_reason);
return 0;
discard_and_relse:
@@ -1709,6 +1800,7 @@ discard_and_relse:
do_time_wait:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
@@ -1725,7 +1817,7 @@ do_time_wait:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+ sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
@@ -1742,7 +1834,7 @@ do_time_wait:
}
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
tcp_v6_timewait_ack(sk, skb);
break;
@@ -1756,8 +1848,9 @@ do_time_wait:
goto discard_it;
}
-INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
+void tcp_v6_early_demux(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
const struct ipv6hdr *hdr;
const struct tcphdr *th;
struct sock *sk;
@@ -1775,7 +1868,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
- sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
@@ -1783,12 +1876,12 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
- struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
- dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst &&
- inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
@@ -1800,6 +1893,11 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_destructor = tcp_twsk_destructor,
};
+INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr);
+}
+
const struct inet_connection_sock_af_ops ipv6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
@@ -1813,10 +1911,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
.mtu_reduced = tcp_v6_mtu_reduced,
};
@@ -1843,10 +1937,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
@@ -1981,7 +2071,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp),
- tp->snd_cwnd,
+ tcp_snd_cwnd(tp),
state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
@@ -2078,6 +2168,7 @@ struct proto tcpv6_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
@@ -2087,11 +2178,18 @@ struct proto tcpv6_prot = {
.hash = inet6_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
+
.memory_allocated = &tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
.orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
@@ -2102,21 +2200,13 @@ struct proto tcpv6_prot = {
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
+ .h.hashinfo = NULL,
.no_autobind = true,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
-#endif
.diag_destroy = tcp_abort,
};
+EXPORT_SYMBOL_GPL(tcpv6_prot);
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol tcpv6_protocol = {
- .early_demux = tcp_v6_early_demux,
- .early_demux_handler = tcp_v6_early_demux,
+static const struct inet6_protocol tcpv6_protocol = {
.handler = tcp_v6_rcv,
.err_handler = tcp_v6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
@@ -2144,7 +2234,7 @@ static void __net_exit tcpv6_net_exit(struct net *net)
static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
{
- inet_twsk_purge(&tcp_hashinfo, AF_INET6);
+ tcp_twsk_purge(net_exit_list, AF_INET6);
}
static struct pernet_operations tcpv6_net_ops = {
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 1796856bc24f..39db5a226855 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -7,6 +7,7 @@
*/
#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/gro.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip6_checksum.h>
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index 21e7b95ddbfa..00e8d8b1c9a7 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -21,8 +21,14 @@
static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly;
static DEFINE_MUTEX(tunnel6_mutex);
+static inline int xfrm6_tunnel_mpls_supported(void)
+{
+ return IS_ENABLED(CONFIG_MPLS);
+}
+
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
{
struct xfrm6_tunnel __rcu **pprev;
@@ -32,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
mutex_lock(&tunnel6_mutex);
- for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
- (t = rcu_dereference_protected(*pprev,
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
lockdep_is_held(&tunnel6_mutex))) != NULL;
pprev = &t->next) {
if (t->priority > priority)
@@ -62,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
mutex_lock(&tunnel6_mutex);
- for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
- (t = rcu_dereference_protected(*pprev,
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
lockdep_is_held(&tunnel6_mutex))) != NULL;
pprev = &t->next) {
if (t == handler) {
@@ -73,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
}
}
+err:
mutex_unlock(&tunnel6_mutex);
synchronize_net();
@@ -86,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister);
handler != NULL; \
handler = rcu_dereference(handler->next)) \
+static int tunnelmpls6_rcv(struct sk_buff *skb)
+{
+ struct xfrm6_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
static int tunnel6_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
@@ -104,6 +155,33 @@ drop:
return 0;
}
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err)
+{
+ struct xfrm6_tunnel __rcu *head;
+ struct xfrm6_tunnel *handler;
+ int ret;
+
+ head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers;
+
+ for_each_tunnel_rcu(head, handler) {
+ if (handler->cb_handler) {
+ ret = handler->cb_handler(skb, err);
+ if (ret <= 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct xfrm_input_afinfo tunnel6_input_afinfo = {
+ .family = AF_INET6,
+ .is_ipip = true,
+ .callback = tunnel6_rcv_cb,
+};
+#endif
+
static int tunnel46_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
@@ -146,6 +224,18 @@ static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return -ENOENT;
}
+static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+
+ return -ENOENT;
+}
+
static const struct inet6_protocol tunnel6_protocol = {
.handler = tunnel6_rcv,
.err_handler = tunnel6_err,
@@ -158,6 +248,12 @@ static const struct inet6_protocol tunnel46_protocol = {
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+static const struct inet6_protocol tunnelmpls6_protocol = {
+ .handler = tunnelmpls6_rcv,
+ .err_handler = tunnelmpls6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
static int __init tunnel6_init(void)
{
if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) {
@@ -169,15 +265,39 @@ static int __init tunnel6_init(void)
inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
return -EAGAIN;
}
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+ if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) {
+ pr_err("%s: can't add input afinfo\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ if (xfrm6_tunnel_mpls_supported())
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS);
+ return -EAGAIN;
+ }
+#endif
return 0;
}
static void __exit tunnel6_fini(void)
{
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+ if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo))
+ pr_err("%s: can't remove input afinfo\n", __func__);
+#endif
if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP))
pr_err("%s: can't remove protocol\n", __func__);
if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6))
pr_err("%s: can't remove protocol\n", __func__);
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS))
+ pr_err("%s: can't remove protocol\n", __func__);
}
module_init(tunnel6_init);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5dc439a391fe..bc65e5b7195b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -17,6 +17,7 @@
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
*/
+#include <linux/bpf-cgroup.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -40,6 +41,7 @@
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/raw.h>
+#include <net/seg6.h>
#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
#include <net/ip6_tunnel.h>
@@ -54,6 +56,20 @@
#include <trace/events/skb.h>
#include "udp_impl.h"
+static void udpv6_destruct_sock(struct sock *sk)
+{
+ udp_destruct_common(sk);
+ inet6_sock_destruct(sk);
+}
+
+int udpv6_init_sock(struct sock *sk)
+{
+ skb_queue_head_init(&udp_sk(sk)->reader_queue);
+ sk->sk_destruct = udpv6_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ return 0;
+}
+
static u32 udp6_ehashfn(const struct net *net,
const struct in6_addr *laddr,
const u16 lport,
@@ -103,7 +119,7 @@ static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *daddr, unsigned short hnum,
int dif, int sdif)
{
- int score;
+ int bound_dev_if, score;
struct inet_sock *inet;
bool dev_match;
@@ -130,10 +146,12 @@ static int compute_score(struct sock *sk, struct net *net,
score++;
}
- dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif);
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif);
if (!dev_match)
return -1;
- score++;
+ if (bound_dev_if)
+ score++;
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
@@ -141,6 +159,24 @@ static int compute_score(struct sock *sk, struct net *net,
return score;
}
+static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+ struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum)
+{
+ struct sock *reuse_sk = NULL;
+ u32 hash;
+
+ if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
+ hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
+ reuse_sk = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ }
+ return reuse_sk;
+}
+
/* called with rcu_read_lock() */
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
@@ -150,7 +186,6 @@ static struct sock *udp6_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
int score, badness;
- u32 hash = 0;
result = NULL;
badness = -1;
@@ -158,23 +193,44 @@ static struct sock *udp6_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport &&
- sk->sk_state != TCP_ESTABLISHED) {
- hash = udp6_ehashfn(net, daddr, hnum,
- saddr, sport);
-
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result && !reuseport_has_conns(sk, false))
- return result;
- }
- result = sk;
+ result = lookup_reuseport(net, sk, skb,
+ saddr, sport, daddr, hnum);
+ /* Fall back to scoring if group has connections */
+ if (result && !reuseport_has_conns(sk))
+ return result;
+
+ result = result ? : sk;
badness = score;
}
}
return result;
}
+static inline struct sock *udp6_lookup_run_bpf(struct net *net,
+ struct udp_table *udptable,
+ struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ __be16 sport,
+ const struct in6_addr *daddr,
+ u16 hnum, const int dif)
+{
+ struct sock *sk, *reuse_sk;
+ bool no_reuseport;
+
+ if (udptable != &udp_table)
+ return NULL; /* only UDP is supported */
+
+ no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport,
+ daddr, hnum, dif, &sk);
+ if (no_reuseport || IS_ERR_OR_NULL(sk))
+ return sk;
+
+ reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
+ if (reuse_sk)
+ sk = reuse_sk;
+ return sk;
+}
+
/* rcu_read_lock() must be held */
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
@@ -185,25 +241,42 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
- struct sock *result;
+ struct sock *result, *sk;
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
+ /* Lookup connected or non-wildcard sockets */
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif, sdif,
hslot2, skb);
- if (!result) {
- hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
- slot2 = hash2 & udptable->mask;
+ if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+ goto done;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+ sk = udp6_lookup_run_bpf(net, udptable, skb,
+ saddr, sport, daddr, hnum, dif);
+ if (sk) {
+ result = sk;
+ goto done;
+ }
+ }
- hslot2 = &udptable->hash2[slot2];
+ /* Got non-wildcard socket or error on first lookup */
+ if (result)
+ goto done;
- result = udp6_lib_lookup2(net, saddr, sport,
- &in6addr_any, hnum, dif, sdif,
- hslot2, skb);
- }
+ /* Lookup wildcard sockets */
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+
+ result = udp6_lib_lookup2(net, saddr, sport,
+ &in6addr_any, hnum, dif, sdif,
+ hslot2, skb);
+done:
if (IS_ERR(result))
return NULL;
return result;
@@ -221,7 +294,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
inet6_sdif(skb), udptable, skb);
}
-struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -230,7 +303,6 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
&iph->daddr, dport, inet6_iif(skb),
inet6_sdif(skb), &udp_table, NULL);
}
-EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
@@ -265,7 +337,7 @@ static int udp6_skb_len(struct sk_buff *skb)
*/
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet_sock *inet = inet_sk(sk);
@@ -285,7 +357,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
try_again:
off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
+ skb = __skb_recv_udp(sk, flags, &off, &err);
if (!skb)
return err;
@@ -334,7 +406,7 @@ try_again:
if (!peeking)
SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (msg->msg_name) {
@@ -355,9 +427,8 @@ try_again:
}
*addr_len = sizeof(*sin6);
- if (cgroup_bpf_enabled)
- BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
- (struct sockaddr *)sin6);
+ BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin6);
}
if (udp_sk(sk)->gro_enabled)
@@ -449,12 +520,14 @@ static struct sock *__udp6_lib_err_encap(struct net *net,
const struct ipv6hdr *hdr, int offset,
struct udphdr *uh,
struct udp_table *udptable,
+ struct sock *sk,
struct sk_buff *skb,
struct inet6_skb_parm *opt,
u8 type, u8 code, __be32 info)
{
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
int network_offset, transport_offset;
- struct sock *sk;
+ struct udp_sock *up;
network_offset = skb_network_offset(skb);
transport_offset = skb_transport_offset(skb);
@@ -465,18 +538,28 @@ static struct sock *__udp6_lib_err_encap(struct net *net,
/* Transport header needs to point to the UDP header */
skb_set_transport_header(skb, offset);
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (lookup && lookup(sk, skb))
+ sk = NULL;
+
+ goto out;
+ }
+
sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
&hdr->saddr, uh->dest,
inet6_iif(skb), 0, udptable, skb);
if (sk) {
- int (*lookup)(struct sock *sk, struct sk_buff *skb);
- struct udp_sock *up = udp_sk(sk);
+ up = udp_sk(sk);
lookup = READ_ONCE(up->encap_err_lookup);
if (!lookup || lookup(sk, skb))
sk = NULL;
}
+out:
if (!sk) {
sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
offset, info));
@@ -495,7 +578,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct ipv6_pinfo *np;
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct in6_addr *saddr = &hdr->saddr;
- const struct in6_addr *daddr = &hdr->daddr;
+ const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr;
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
bool tunnel = false;
struct sock *sk;
@@ -505,16 +588,17 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
- if (!sk) {
+
+ if (!sk || udp_sk(sk)->encap_type) {
/* No socket for error: try tunnels before discarding */
- sk = ERR_PTR(-ENOENT);
if (static_branch_unlikely(&udpv6_encap_needed_key)) {
sk = __udp6_lib_err_encap(net, hdr, offset, uh,
- udptable, skb,
+ udptable, sk, skb,
opt, type, code, info);
if (!sk)
return 0;
- }
+ } else
+ sk = ERR_PTR(-ENOENT);
if (IS_ERR(sk)) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
@@ -546,8 +630,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
/* Tunnels don't have an application socket: don't pass errors back */
- if (tunnel)
+ if (tunnel) {
+ if (udp_sk(sk)->encap_err_rcv)
+ udp_sk(sk)->encap_err_rcv(sk, skb, offset);
goto out;
+ }
if (!np->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
@@ -557,7 +644,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
out:
return 0;
}
@@ -577,13 +664,20 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
+ enum skb_drop_reason drop_reason;
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
+ if (rc == -ENOMEM) {
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_RCVBUFERRORS, is_udplite);
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_MEMERRORS, is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return -1;
}
@@ -599,11 +693,14 @@ static __inline__ int udpv6_err(struct sk_buff *skb,
static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
@@ -630,9 +727,9 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
ret = encap_rcv(sk, skb);
if (ret <= 0) {
- __UDP_INC_STATS(sock_net(sk),
- UDP_MIB_INDATAGRAMS,
- is_udplite);
+ __UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
return -ret;
}
}
@@ -643,7 +740,7 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
*/
- if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
if (up->pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
@@ -662,8 +759,10 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto drop;
+ }
udp_csum_pull_header(skb);
@@ -672,11 +771,12 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
return __udpv6_queue_rcv_skb(sk, skb);
csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return -1;
}
@@ -693,6 +793,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
+ udp_post_segment_fix_csum(skb);
ret = udpv6_queue_rcv_one_skb(sk, skb);
if (ret > 0)
ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
@@ -716,7 +817,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
- !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
+ !udp_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif) ||
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -813,7 +914,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
if (udp_sk_rx_dst_set(sk, dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
- inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+ sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}
@@ -839,10 +940,12 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
struct sock *sk;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -879,21 +982,23 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
- if (unlikely(sk->sk_rx_dst != dst))
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
@@ -912,6 +1017,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
return udp6_unicast_rcv_skb(sk, skb, uh);
}
+ reason = SKB_DROP_REASON_NO_SOCKET;
+
if (!uh->check)
goto report_csum_error;
@@ -924,10 +1031,12 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
__UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
short_packet:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
proto == IPPROTO_UDPLITE ? "-Lite" : "",
saddr, ntohs(uh->source),
@@ -938,10 +1047,12 @@ short_packet:
report_csum_error:
udp6_csum_zero_error(skb);
csum_error:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_UDP_CSUM;
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
discard:
__UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
}
@@ -960,7 +1071,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (sk->sk_state == TCP_ESTABLISHED &&
- INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
+ inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
@@ -968,7 +1079,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
return NULL;
}
-INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
+void udp_v6_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
const struct udphdr *uh;
@@ -996,10 +1107,10 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_efree;
- dst = READ_ONCE(sk->sk_rx_dst);
+ dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
- dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst) {
/* set noref for now.
* any place which wants to hold dst has to call
@@ -1040,7 +1151,7 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
* bytes that are out of the bound specified by user in addr_len.
*/
if (uaddr->sa_family == AF_INET) {
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -EAFNOSUPPORT;
return udp_pre_connect(sk, uaddr, addr_len);
}
@@ -1056,6 +1167,9 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
* @sk: socket we are sending on
* @skb: sk_buff containing the filled-in UDP header
* (checksum field must be zeroed out)
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
*/
static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
const struct in6_addr *saddr,
@@ -1127,7 +1241,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
kfree_skb(skb);
return -EINVAL;
}
- if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
+ if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1187,23 +1301,17 @@ static int udp_v6_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb;
struct udp_sock *up = udp_sk(sk);
- struct flowi6 fl6;
int err = 0;
if (up->pending == AF_INET)
return udp_push_pending_frames(sk);
- /* ip6_finish_skb will release the cork, so make a copy of
- * fl6 here.
- */
- fl6 = inet_sk(sk)->cork.fl.u.ip6;
-
skb = ip6_finish_skb(sk);
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
-
+ err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
+ &inet_sk(sk)->cork.base);
out:
up->len = 0;
up->pending = 0;
@@ -1221,19 +1329,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipv6_txoptions *opt = NULL;
struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
- struct flowi6 fl6;
+ struct inet_cork_full cork;
+ struct flowi6 *fl6 = &cork.fl.u.ip6;
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
bool connected = false;
int ulen = len;
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
int err;
int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
ipcm6_init(&ipc6);
- ipc6.gso_size = up->gso_size;
+ ipc6.gso_size = READ_ONCE(up->gso_size);
ipc6.sockc.tsflags = sk->sk_tsflags;
ipc6.sockc.mark = sk->sk_mark;
@@ -1278,15 +1387,12 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
msg->msg_name = &sin;
msg->msg_namelen = sizeof(sin);
do_udp_sendmsg:
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -ENETUNREACH;
return udp_sendmsg(sk, msg, len);
}
}
- if (up->pending == AF_INET)
- return udp_sendmsg(sk, msg, len);
-
/* Rough check on arithmetic overflow,
better check is made in ip6_append_data().
*/
@@ -1295,6 +1401,8 @@ do_udp_sendmsg:
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
if (up->pending) {
+ if (up->pending == AF_INET)
+ return udp_sendmsg(sk, msg, len);
/*
* There are pending frames.
* The socket lock must be held while it's corked.
@@ -1312,19 +1420,19 @@ do_udp_sendmsg:
}
ulen += sizeof(struct udphdr);
- memset(&fl6, 0, sizeof(fl6));
+ memset(fl6, 0, sizeof(*fl6));
if (sin6) {
if (sin6->sin6_port == 0)
return -EINVAL;
- fl6.fl6_dport = sin6->sin6_port;
+ fl6->fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
if (np->sndflow) {
- fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
}
@@ -1341,25 +1449,24 @@ do_udp_sendmsg:
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
__ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
- fl6.flowi6_oif = sin6->sin6_scope_id;
+ fl6->flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- fl6.fl6_dport = inet->inet_dport;
+ fl6->fl6_dport = inet->inet_dport;
daddr = &sk->sk_v6_daddr;
- fl6.flowlabel = np->flow_label;
+ fl6->flowlabel = np->flow_label;
connected = true;
}
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = sk->sk_bound_dev_if;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
- fl6.flowi6_mark = ipc6.sockc.mark;
- fl6.flowi6_uid = sk->sk_uid;
+ fl6->flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
@@ -1369,14 +1476,14 @@ do_udp_sendmsg:
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
if (err > 0)
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
&ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
}
@@ -1393,15 +1500,17 @@ do_udp_sendmsg:
opt = ipv6_fixup_options(&opt_space, opt);
ipc6.opt = opt;
- fl6.flowi6_proto = sk->sk_protocol;
- fl6.daddr = *daddr;
- if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
- fl6.saddr = np->saddr;
- fl6.fl6_sport = inet->inet_sport;
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->flowi6_mark = ipc6.sockc.mark;
+ fl6->daddr = *daddr;
+ if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
+ fl6->saddr = np->saddr;
+ fl6->fl6_sport = inet->inet_sport;
- if (cgroup_bpf_enabled && !connected) {
+ if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
- (struct sockaddr *)sin6, &fl6.saddr);
+ (struct sockaddr *)sin6,
+ &fl6->saddr);
if (err)
goto out_no_dst;
if (sin6) {
@@ -1417,32 +1526,32 @@ do_udp_sendmsg:
err = -EINVAL;
goto out_no_dst;
}
- fl6.fl6_dport = sin6->sin6_port;
- fl6.daddr = sin6->sin6_addr;
+ fl6->fl6_dport = sin6->sin6_port;
+ fl6->daddr = sin6->sin6_addr;
}
}
- if (ipv6_addr_any(&fl6.daddr))
- fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6->daddr))
+ fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- final_p = fl6_update_dst(&fl6, opt, &final);
+ final_p = fl6_update_dst(fl6, opt, &final);
if (final_p)
connected = false;
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
- fl6.flowi6_oif = np->mcast_oif;
+ if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
+ fl6->flowi6_oif = np->mcast_oif;
connected = false;
- } else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ } else if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
- fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+ fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
- dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
+ dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
@@ -1450,7 +1559,7 @@ do_udp_sendmsg:
}
if (ipc6.hlimit < 0)
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -1458,17 +1567,17 @@ back_from_confirm:
/* Lockless fast path for the non-corking case */
if (!corkreq) {
- struct inet_cork_full cork;
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
- &fl6, (struct rt6_info *)dst,
+ (struct rt6_info *)dst,
msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6, &cork.base);
- goto out;
+ err = udp_v6_send_skb(skb, fl6, &cork.base);
+ /* ip6_make_skb steals dst reference */
+ goto out_no_dst;
}
lock_sock(sk);
@@ -1489,7 +1598,7 @@ do_append_data:
ipc6.dontfrag = np->dontfrag;
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
- &ipc6, &fl6, (struct rt6_info *)dst,
+ &ipc6, fl6, (struct rt6_info *)dst,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
@@ -1524,7 +1633,7 @@ out_no_dst:
do_confirm:
if (msg->msg_flags & MSG_PROBE)
- dst_confirm_neigh(dst, &fl6.daddr);
+ dst_confirm_neigh(dst, &fl6->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1535,6 +1644,9 @@ void udpv6_destroy_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
lock_sock(sk);
+
+ /* protects from races with udp_abort() */
+ sock_set_flag(sk, SOCK_DEAD);
udp_v6_flush_pending_frames(sk);
release_sock(sk);
@@ -1545,8 +1657,10 @@ void udpv6_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
- if (up->encap_enabled)
+ if (up->encap_enabled) {
static_branch_dec(&udpv6_encap_needed_key);
+ udp_encap_disable();
+ }
}
inet6_destroy_sock(sk);
@@ -1555,26 +1669,16 @@ void udpv6_destroy_sock(struct sock *sk)
/*
* Socket option code for UDP
*/
-int udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ return udp_lib_setsockopt(sk, level, optname,
+ optval, optlen,
udp_v6_push_pending_frames);
return ipv6_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
- udp_v6_push_pending_frames);
- return compat_ipv6_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
int udpv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1583,22 +1687,7 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname,
return ipv6_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_getsockopt(sk, level, optname, optval, optlen);
- return compat_ipv6_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol udpv6_protocol = {
- .early_demux = udp_v6_early_demux,
- .early_demux_handler = udp_v6_early_demux,
+static const struct inet6_protocol udpv6_protocol = {
.handler = udpv6_rcv,
.err_handler = udpv6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
@@ -1658,7 +1747,7 @@ struct proto udpv6_prot = {
.connect = ip6_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
- .init = udp_init_sock,
+ .init = udpv6_init_sock,
.destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt,
@@ -1669,16 +1758,19 @@ struct proto udpv6_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
+ .put_port = udp_lib_unhash,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
+#endif
+
.memory_allocated = &udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp6_sock),
.h.udp_table = &udp_table,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udpv6_setsockopt,
- .compat_getsockopt = compat_udpv6_getsockopt,
-#endif
.diag_destroy = udp_abort,
};
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 20e324b6f358..0590f566379d 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -12,22 +12,17 @@ int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
__be32, struct udp_table *);
+int udpv6_init_sock(struct sock *sk);
int udp_v6_get_port(struct sock *sk, unsigned short snum);
void udp_v6_rehash(struct sock *sk);
int udpv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
-int udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-#ifdef CONFIG_COMPAT
-int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-#endif
+int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
-int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len);
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len);
void udpv6_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 584157a07759..7720d04ed396 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -13,6 +13,7 @@
#include <net/udp.h>
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
+#include <net/gro.h>
static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
@@ -28,10 +29,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
int tnl_hlen;
int err;
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
if (skb->encapsulation && skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
segs = skb_udp_tunnel_segment(skb, features, true);
@@ -46,7 +43,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
- return __udp_gso_segment(skb, features);
+ return __udp_gso_segment(skb, features, true);
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
@@ -111,12 +112,22 @@ out:
return segs;
}
+static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct ipv6hdr *iph = skb_gro_network_header(skb);
+
+ return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ &iph->daddr, dport, inet6_iif(skb),
+ inet6_sdif(skb), &udp_table, NULL);
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
struct sk_buff *pp;
- struct sock *sk;
if (unlikely(!uh))
goto flush;
@@ -134,10 +145,11 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 1;
- rcu_read_lock();
- sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+
+ if (static_branch_unlikely(&udpv6_encap_needed_key))
+ sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest);
+
pp = udp_gro_receive(head, skb, uh, sk);
- rcu_read_unlock();
return pp;
flush:
@@ -150,7 +162,8 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (NAPI_GRO_CB(skb)->is_flist) {
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
uh->len = htons(skb->len - nhoff);
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index bf7a7acd39b1..67eaf3ca14ce 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -12,6 +12,13 @@
#include <linux/proc_fs.h>
#include "udp_impl.h"
+static int udplitev6_sk_init(struct sock *sk)
+{
+ udpv6_init_sock(sk);
+ udp_sk(sk)->pcflag = UDPLITE_BIT;
+ return 0;
+}
+
static int udplitev6_rcv(struct sk_buff *skb)
{
return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
@@ -38,7 +45,7 @@ struct proto udplitev6_prot = {
.connect = ip6_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
- .init = udplite_sk_init,
+ .init = udplitev6_sk_init,
.destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt,
@@ -48,14 +55,13 @@ struct proto udplitev6_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
+
.memory_allocated = &udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp6_sock),
.h.udp_table = &udplite_table,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udpv6_setsockopt,
- .compat_getsockopt = compat_udpv6_getsockopt,
-#endif
};
static struct inet_protosw udplite6_protosw = {
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index a52cb3fc6df5..04cbeefd8982 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -17,11 +17,6 @@
#include <net/ipv6.h>
#include <net/xfrm.h>
-int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- return xfrm6_extract_header(skb);
-}
-
int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
struct ip6_tnl *t)
{
@@ -35,9 +30,12 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
static int xfrm6_transport_finish2(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- if (xfrm_trans_queue(skb, ip6_rcv_finish))
- __kfree_skb(skb);
- return -1;
+ if (xfrm_trans_queue(skb, ip6_rcv_finish)) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return 0;
}
int xfrm6_transport_finish(struct sk_buff *skb, int async)
@@ -60,13 +58,106 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
if (xo && (xo->flags & XFRM_GRO)) {
skb_mac_header_rebuild(skb);
skb_reset_transport_header(skb);
- return -1;
+ return 0;
}
NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm6_transport_finish2);
- return -1;
+ return 0;
+}
+
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh;
+ struct ipv6hdr *ip6h;
+ int len;
+ int ip6hlen = sizeof(struct ipv6hdr);
+
+ __u8 *udpdata;
+ __be32 *udpdata32;
+ __u16 encap_type = up->encap_type;
+
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+
+ /* If this is a paged skb, make sure we pull up
+ * whatever data we need to look at. */
+ len = skb->len - sizeof(struct udphdr);
+ if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+ return 1;
+
+ /* Now we can get the pointers */
+ uh = udp_hdr(skb);
+ udpdata = (__u8 *)uh + sizeof(struct udphdr);
+ udpdata32 = (__be32 *)udpdata;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ goto drop;
+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+ /* ESP Packet without Non-ESP header */
+ len = sizeof(struct udphdr);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ goto drop;
+ } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+ udpdata32[0] == 0 && udpdata32[1] == 0) {
+
+ /* ESP Packet with Non-IKE marker */
+ len = sizeof(struct udphdr) + 2 * sizeof(u32);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ }
+
+ /* At this point we are sure that this is an ESPinUDP packet,
+ * so we need to remove 'len' bytes from the packet (the UDP
+ * header and optional ESP marker bytes) and then modify the
+ * protocol to ESP, and then call into the transform receiver.
+ */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ goto drop;
+
+ /* Now we can update and verify the packet length... */
+ ip6h = ipv6_hdr(skb);
+ ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len);
+ if (skb->len < ip6hlen + len) {
+ /* packet is too small!?! */
+ goto drop;
+ }
+
+ /* pull the data buffer up to the ESP header and set the
+ * transport header to point to ESP. Keep UDP on the stack
+ * for later.
+ */
+ __skb_pull(skb, len);
+ skb_reset_transport_header(skb);
+
+ /* process ESP */
+ return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
+
+drop:
+ kfree_skb(skb);
+ return 0;
}
int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t)
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index fbe51d40bd7e..ad07904642ca 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -16,31 +16,7 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
-int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
- u8 **prevhdr)
-{
- return ip6_find_1stfragopt(skb, prevhdr);
-}
-EXPORT_SYMBOL(xfrm6_find_1stfragopt);
-
-static int xfrm6_local_dontfrag(struct sk_buff *skb)
-{
- int proto;
- struct sock *sk = skb->sk;
-
- if (sk) {
- if (sk->sk_family != AF_INET6)
- return 0;
-
- proto = sk->sk_protocol;
- if (proto == IPPROTO_UDP || proto == IPPROTO_RAW)
- return inet6_sk(sk)->dontfrag;
- }
-
- return 0;
-}
-
-static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
+void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
{
struct flowi6 fl6;
struct sock *sk = skb->sk;
@@ -64,89 +40,29 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu)
ipv6_local_error(sk, EMSGSIZE, &fl6, mtu);
}
-static int xfrm6_tunnel_check_size(struct sk_buff *skb)
-{
- int mtu, ret = 0;
- struct dst_entry *dst = skb_dst(skb);
-
- if (skb->ignore_df)
- goto out;
-
- mtu = dst_mtu(dst);
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
-
- if ((!skb_is_gso(skb) && skb->len > mtu) ||
- (skb_is_gso(skb) &&
- !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) {
- skb->dev = dst->dev;
- skb->protocol = htons(ETH_P_IPV6);
-
- if (xfrm6_local_dontfrag(skb))
- xfrm6_local_rxpmtu(skb, mtu);
- else if (skb->sk)
- xfrm_local_error(skb, mtu);
- else
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ret = -EMSGSIZE;
- }
-out:
- return ret;
-}
-
-int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
-
- err = xfrm6_tunnel_check_size(skb);
- if (err)
- return err;
-
- XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr;
-
- return xfrm6_extract_header(skb);
-}
-
-int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb)
+static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
-
-#ifdef CONFIG_NETFILTER
- IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
-#endif
-
return xfrm_output(sk, skb);
}
-static int __xfrm6_output_state_finish(struct xfrm_state *x, struct sock *sk,
- struct sk_buff *skb)
-{
- const struct xfrm_state_afinfo *afinfo;
- int ret = -EAFNOSUPPORT;
-
- rcu_read_lock();
- afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family);
- if (likely(afinfo))
- ret = afinfo->output_finish(sk, skb);
- else
- kfree_skb(skb);
- rcu_read_unlock();
-
- return ret;
-}
-
-static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int xfrm6_noneed_fragment(struct sk_buff *skb)
{
- struct xfrm_state *x = skb_dst(skb)->xfrm;
-
- return __xfrm6_output_state_finish(x, sk, skb);
+ struct frag_hdr *fh;
+ u8 prevhdr = ipv6_hdr(skb)->nexthdr;
+
+ if (prevhdr != NEXTHDR_FRAGMENT)
+ return 0;
+ fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr));
+ if (fh->nexthdr == NEXTHDR_ESP || fh->nexthdr == NEXTHDR_AUTH)
+ return 1;
+ return 0;
}
static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct xfrm_state *x = dst->xfrm;
- int mtu;
+ unsigned int mtu;
bool toobig;
#ifdef CONFIG_NETFILTER
@@ -166,10 +82,13 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
toobig = skb->len > mtu && !skb_is_gso(skb);
- if (toobig && xfrm6_local_dontfrag(skb)) {
+ if (toobig && xfrm6_local_dontfrag(skb->sk)) {
xfrm6_local_rxpmtu(skb, mtu);
kfree_skb(skb);
return -EMSGSIZE;
+ } else if (toobig && xfrm6_noneed_fragment(skb)) {
+ skb->ignore_df = 1;
+ goto skip_frag;
} else if (!skb->ignore_df && toobig && skb->sk) {
xfrm_local_error(skb, mtu);
kfree_skb(skb);
@@ -181,7 +100,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
__xfrm6_output_finish);
skip_frag:
- return __xfrm6_output_state_finish(x, sk, skb);
+ return xfrm_output(sk, skb);
}
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index af7a4b8b1e9c..4a4b0e49ec92 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -33,8 +33,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
int err;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
- fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
+ fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(net, oif);
fl6.flowi6_mark = mark;
memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
if (saddr)
@@ -74,11 +73,11 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
struct rt6_info *rt = (struct rt6_info *)xdst->route;
xdst->u.dst.dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
xdst->u.rt6.rt6i_idev = in6_dev_get(dev);
if (!xdst->u.rt6.rt6i_idev) {
- dev_put(dev);
+ netdev_put(dev, &xdst->u.dst.dev_tracker);
return -ENODEV;
}
@@ -92,7 +91,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt6.rt6i_src = rt->rt6i_src;
INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
rt6_uncached_list_add(&xdst->u.rt6);
- atomic_inc(&dev_net(dev)->ipv6.rt6_stats->fib_rt_uncache);
return 0;
}
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index 34cb65c7d5a7..ea2f805d3b01 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -14,6 +14,7 @@
#include <linux/mutex.h>
#include <linux/skbuff.h>
#include <linux/icmpv6.h>
+#include <net/ip6_route.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/xfrm.h>
@@ -58,6 +59,53 @@ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
return 0;
}
+int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ int ret;
+ struct xfrm6_protocol *handler;
+ struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr);
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+
+ if (!head)
+ goto out;
+
+ if (!skb_dst(skb)) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .daddr = ip6h->daddr,
+ .saddr = ip6h->saddr,
+ .flowlabel = ip6_flowinfo(ip6h),
+ .flowi6_mark = skb->mark,
+ .flowi6_proto = ip6h->nexthdr,
+ };
+
+ dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6,
+ skb, flags);
+ if (dst->error)
+ goto drop;
+ skb_dst_set(skb, dst);
+ }
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+ return ret;
+
+out:
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm6_rcv_encap);
+
static int xfrm6_esp_rcv(struct sk_buff *skb)
{
int ret;
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 78daadecbdef..6610b2198fa9 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -13,37 +13,11 @@
*/
#include <net/xfrm.h>
-#include <linux/pfkeyv2.h>
-#include <linux/ipsec.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/export.h>
-#include <net/dsfield.h>
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-
-int xfrm6_extract_header(struct sk_buff *skb)
-{
- struct ipv6hdr *iph = ipv6_hdr(skb);
-
- XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
- XFRM_MODE_SKB_CB(skb)->id = 0;
- XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF);
- XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph);
- XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit;
- XFRM_MODE_SKB_CB(skb)->optlen = 0;
- memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl,
- sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
-
- return 0;
-}
static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.family = AF_INET6,
.proto = IPPROTO_IPV6,
.output = xfrm6_output,
- .output_finish = xfrm6_output_finish,
- .extract_input = xfrm6_extract_input,
- .extract_output = xfrm6_extract_output,
.transport_finish = xfrm6_transport_finish,
.local_error = xfrm6_local_error,
};
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index e11bdb0aaa15..1323f2f6928e 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -78,7 +78,7 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const
hlist_for_each_entry_rcu(x6spi,
&xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
- list_byaddr) {
+ list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) {
if (xfrm6_addr_equal(&x6spi->addr, saddr))
return x6spi;
}
@@ -270,13 +270,17 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
-static int xfrm6_tunnel_init_state(struct xfrm_state *x)
+static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
- if (x->props.mode != XFRM_MODE_TUNNEL)
+ if (x->props.mode != XFRM_MODE_TUNNEL) {
+ NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode");
return -EINVAL;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation");
return -EINVAL;
+ }
x->props.header_len = sizeof(struct ipv6hdr);
@@ -291,7 +295,6 @@ static void xfrm6_tunnel_destroy(struct xfrm_state *x)
}
static const struct xfrm_type xfrm6_tunnel_type = {
- .description = "IP6IP6",
.owner = THIS_MODULE,
.proto = IPPROTO_IPV6,
.init_state = xfrm6_tunnel_init_state,
@@ -303,13 +306,13 @@ static const struct xfrm_type xfrm6_tunnel_type = {
static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
static int __net_init xfrm6_tunnel_net_init(struct net *net)