aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv6
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv6')
-rw-r--r--net/ipv6/addrconf.c19
-rw-r--r--net/ipv6/addrconf_core.c6
-rw-r--r--net/ipv6/af_inet6.c46
-rw-r--r--net/ipv6/ah6.c4
-rw-r--r--net/ipv6/esp6.c23
-rw-r--r--net/ipv6/esp6_offload.c4
-rw-r--r--net/ipv6/fib6_rules.c12
-rw-r--r--net/ipv6/icmp.c7
-rw-r--r--net/ipv6/inet6_hashtables.c2
-rw-r--r--net/ipv6/ip6_fib.c214
-rw-r--r--net/ipv6/ip6_flowlabel.c27
-rw-r--r--net/ipv6/ip6_output.c340
-rw-r--r--net/ipv6/ipcomp6.c3
-rw-r--r--net/ipv6/mip6.c6
-rw-r--r--net/ipv6/ndisc.c11
-rw-r--r--net/ipv6/netfilter.c129
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c420
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c2
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c53
-rw-r--r--net/ipv6/proc.c4
-rw-r--r--net/ipv6/raw.c4
-rw-r--r--net/ipv6/reassembly.c52
-rw-r--r--net/ipv6/route.c1471
-rw-r--r--net/ipv6/sysctl_net_ipv6.c5
-rw-r--r--net/ipv6/tcp_ipv6.c31
-rw-r--r--net/ipv6/udp.c33
-rw-r--r--net/ipv6/xfrm6_state.c137
27 files changed, 1830 insertions, 1235 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 081bb517e40d..521e3203e83a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex)
+ /* prefix routes only use builtin fib6_nh */
+ if (rt->nh)
continue;
- if (no_gw && rt->fib6_nh.fib_nh_gw_family)
+
+ if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
+ continue;
+ if (no_gw && rt->fib6_nh->fib_nh_gw_family)
continue;
if ((rt->fib6_flags & flags) != flags)
continue;
@@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
if (ifa->ifa_scope == RT_SCOPE_LINK)
@@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
- struct fib6_info *rt = ifa->rt;
+ /* host routes only use builtin fib6_nh */
+ struct fib6_nh *nh = ifa->rt->fib6_nh;
int cpu;
rcu_read_lock();
ifa->rt->dst_nopolicy = val ? true : false;
- if (rt->rt6i_pcpu) {
+ if (nh->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
- rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
addrconf_set_nopolicy(*rtp, val);
}
}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5b1246635e02..783f3c1466da 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
return -EAFNOSUPPORT;
}
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
+{
+ return -EAFNOSUPPORT;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
.ipv6_route_input = eafnosupport_ipv6_route_input,
@@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.fib6_select_path = eafnosupport_fib6_select_path,
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
.fib6_nh_init = eafnosupport_fib6_nh_init,
+ .ip6_del_rt = eafnosupport_ip6_del_rt,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 5352708b7b2d..ef37e0574f54 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -208,7 +208,7 @@ lookup_protocol:
np->mc_loop = 1;
np->mc_all = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->repflow = net->ipv6.sysctl.flowlabel_reflect;
+ np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
@@ -564,6 +564,39 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
EXPORT_SYMBOL(inet6_ioctl);
+INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
+ size_t));
+int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return -EAGAIN;
+
+ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
+ sk, msg, size);
+}
+
+INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int, int *));
+int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ if (likely(!(flags & MSG_ERRQUEUE)))
+ sock_rps_record_flow(sk);
+
+ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
+ sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
@@ -580,8 +613,8 @@ const struct proto_ops inet6_stream_ops = {
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = inet_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
@@ -614,8 +647,8 @@ const struct proto_ops inet6_dgram_ops = {
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = inet_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = sk_set_peek_off,
@@ -922,6 +955,9 @@ static const struct ipv6_stub ipv6_stub_impl = {
.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
.fib6_nh_init = fib6_nh_init,
.fib6_nh_release = fib6_nh_release,
+ .fib6_update_sernum = fib6_update_sernum_stub,
+ .fib6_rt_update = fib6_rt_update,
+ .ip6_del_rt = ip6_del_rt,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 68b9e92e469e..25e1172fd1c3 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -793,9 +793,7 @@ static void __exit ah6_fini(void)
if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
-
+ xfrm_unregister_type(&ah6_type, AF_INET6);
}
module_init(ah6_init);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ae6a739c5f52..a3b403ba8f8f 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -41,8 +41,6 @@ struct esp_skb_cb {
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
-
/*
* Allocate an AEAD request structure with extra space for SG and IV.
*
@@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
- padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -687,21 +685,6 @@ out:
return ret;
}
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
-{
- struct crypto_aead *aead = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- unsigned int net_adj;
-
- if (x->props.mode != XFRM_MODE_TUNNEL)
- net_adj = sizeof(struct ipv6hdr);
- else
- net_adj = 0;
-
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
@@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = {
.flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp6_init_state,
.destructor = esp6_destroy,
- .get_mtu = esp6_get_mtu,
.input = esp6_input,
.output = esp6_output,
.hdr_offset = xfrm6_find_1stfragopt,
@@ -951,8 +933,7 @@ static void __exit esp6_fini(void)
{
if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&esp6_type, AF_INET6);
}
module_init(esp6_init);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index d0d8528b294a..e31626ffccd1 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -336,9 +336,7 @@ static int __init esp6_offload_init(void)
static void __exit esp6_offload_exit(void)
{
- if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+ xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6);
inet6_del_offload(&esp6_offload, IPPROTO_ESP);
}
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index bcfae13409b5..d22b6c140f23 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
}
- dst_hold(&net->ipv6.ip6_null_entry->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
return &net->ipv6.ip6_null_entry->dst;
}
@@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
again:
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
err = -EAGAIN;
rt = NULL;
goto out;
discard_pkt:
- dst_hold(&rt->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
out:
res->rt6 = rt;
return err;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 375b4b4f9bf5..62c997201970 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -75,9 +75,9 @@
*
* On SMP we have one ICMP socket per-cpu.
*/
-static inline struct sock *icmpv6_sk(struct net *net)
+static struct sock *icmpv6_sk(struct net *net)
{
- return *this_cpu_ptr(net->ipv6.icmp_sk);
+ return this_cpu_read(*net->ipv6.icmp_sk);
}
static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
@@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
memset(&fl6, 0, sizeof(fl6));
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
+ fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));
+
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b2a55f300318..cf60fae9533b 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net,
saddr, sport, &in6addr_any, hnum,
dif, sdif);
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 9180c8b6f764..49884f96232b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
struct fib6_info *f6i;
+ size_t sz = sizeof(*f6i);
- f6i = kzalloc(sizeof(*f6i), gfp_flags);
+ if (with_fib6_nh)
+ sz += sizeof(struct fib6_nh);
+
+ f6i = kzalloc(sz, gfp_flags);
if (!f6i)
return NULL;
- f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
- if (!f6i->rt6i_pcpu) {
- kfree(f6i);
- return NULL;
- }
-
+ /* fib6_siblings is a union with nh_list, so this initializes both */
INIT_LIST_HEAD(&f6i->fib6_siblings);
refcount_set(&f6i->fib6_ref, 1);
@@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
void fib6_info_destroy_rcu(struct rcu_head *head)
{
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
- struct rt6_exception_bucket *bucket;
WARN_ON(f6i->fib6_node);
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
- kfree(bucket);
-
- if (f6i->rt6i_pcpu) {
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
-
- free_percpu(f6i->rt6i_pcpu);
- }
-
- fib6_nh_release(&f6i->fib6_nh);
+ if (f6i->nh)
+ nexthop_put(f6i->nh);
+ else
+ fib6_nh_release(f6i->fib6_nh);
ip_fib_metrics_put(f6i->fib6_metrics);
-
kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
@@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error == -EAGAIN) {
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
+ if (!(flags | RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
}
return &rt->dst;
@@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
return call_fib6_notifier(nb, net, event_type, &info.info);
}
-static int call_fib6_entry_notifiers(struct net *net,
- enum fib_event_type event_type,
- struct fib6_info *rt,
- struct netlink_ext_ack *extack)
+int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ rt->fib6_table->fib_seq++;
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+int call_fib6_multipath_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
.info.extack = extack,
.rt = rt,
+ .nsiblings = nsiblings,
};
rt->fib6_table->fib_seq++;
@@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w)
struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
- res = rt6_dump_route(rt, w->args);
- if (res < 0) {
+ res = rt6_dump_route(rt, w->args, w->skip_in_node);
+ if (res >= 0) {
/* Frame is full, suspend walking */
w->leaf = rt;
+
+ /* We'll restart from this node, so if some routes were
+ * already dumped, skip them next time.
+ */
+ w->skip_in_node += res;
+
return 1;
}
+ w->skip_in_node = 0;
/* Multipath routes are dumped in one route with the
* RTA_MULTIPATH attribute. Jump 'rt' to point to the
@@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
if (cb->args[4] == 0) {
w->count = 0;
w->skip = 0;
+ w->skip_in_node = 0;
spin_lock_bh(&table->tb6_lock);
res = fib6_walk(net, w);
@@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
w->state = FWS_INIT;
w->node = w->root;
w->skip = w->count;
+ w->skip_in_node = 0;
} else
w->skip = 0;
@@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
+ .filter.dump_routes = true };
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct rt6_rtnl_dump_arg arg = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib6_walker *w;
@@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
struct rtmsg *rtm = nlmsg_data(nlh);
- arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED);
+ if (rtm->rtm_flags & RTM_F_PREFIX)
+ arg.filter.flags = RTM_F_PREFIX;
}
- /* fib entries are never clones */
- if (arg.filter.flags & RTM_F_CLONED)
- goto out;
-
w = (void *)cb->args[2];
if (!w) {
/* New dump:
@@ -895,16 +897,14 @@ insert_above:
return ln;
}
-static void fib6_drop_pcpu_from(struct fib6_info *f6i,
- const struct fib6_table *table)
+static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
+ const struct fib6_info *match,
+ const struct fib6_table *table)
{
int cpu;
- /* Make sure rt6_make_pcpu_route() wont add other percpu routes
- * while we are cleaning them here.
- */
- f6i->fib6_destroying = 1;
- mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+ if (!fib6_nh->rt6i_pcpu)
+ return;
/* release the reference to this fib entry from
* all of its cached pcpu routes
@@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
+
+ /* only dropping the 'from' reference if the cached route
+ * is using 'match'. The cached pcpu_rt->from only changes
+ * from a fib6_info to NULL (ip6_dst_destroy); it can never
+ * change from one fib6_info reference to another
+ */
+ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
struct fib6_info *from;
from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
@@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
}
}
+struct fib6_nh_pcpu_arg {
+ struct fib6_info *from;
+ const struct fib6_table *table;
+};
+
+static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_pcpu_arg *arg = _arg;
+
+ __fib6_drop_pcpu_from(nh, arg->from, arg->table);
+ return 0;
+}
+
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+ const struct fib6_table *table)
+{
+ /* Make sure rt6_make_pcpu_route() wont add other percpu routes
+ * while we are cleaning them here.
+ */
+ f6i->fib6_destroying = 1;
+ mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+
+ if (f6i->nh) {
+ struct fib6_nh_pcpu_arg arg = {
+ .from = f6i,
+ .table = table
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
+ &arg);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = f6i->fib6_nh;
+ __fib6_drop_pcpu_from(fib6_nh, f6i, table);
+ }
+}
+
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
struct fib6_table *table = rt->fib6_table;
- if (rt->rt6i_pcpu)
- fib6_drop_pcpu_from(rt, table);
+ fib6_drop_pcpu_from(rt, table);
+
+ if (rt->nh && !list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
@@ -1101,11 +1147,13 @@ next_iter:
add:
nlflags |= NLM_F_CREATE;
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_ADD,
+ rt, extack);
+ if (err)
+ return err;
+ }
rcu_assign_pointer(rt->fib6_next, iter);
fib6_info_hold(rt);
@@ -1130,11 +1178,13 @@ add:
return -ENOENT;
}
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_REPLACE,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_REPLACE,
+ rt, extack);
+ if (err)
+ return err;
+ }
fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
@@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
+/* allow ipv4 to update sernum via ipv6_stub */
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum_upto_root(net, f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
@@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
err = fib6_add_rt2node(fn, rt, info, extack);
if (!err) {
+ if (rt->nh)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
__fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
}
@@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
if (plen == fn->fn_bit)
return fn;
- prev = fn;
+ if (fn->fn_flags & RTN_RTINFO)
+ prev = fn;
next:
/*
@@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fib6_purge_rt(rt, fn, net);
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
+ if (!info->skip_notify_kernel)
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+
fib6_info_release(rt);
}
@@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.w.func = fib6_clean_node;
c.w.count = 0;
c.w.skip = 0;
+ c.w.skip_in_node = 0;
c.func = func;
c.sernum = sernum;
c.arg = arg;
@@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
+ struct fib6_nh *fib6_nh = rt->fib6_nh;
unsigned int flags = rt->fib6_flags;
const struct net_device *dev;
+ if (rt->nh)
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+
seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
@@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->fib6_nh.fib_nh_gw_family) {
+ if (fib6_nh->fib_nh_gw_family) {
flags |= RTF_GATEWAY;
- seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6);
+ seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
} else {
seq_puts(seq, "00000000000000000000000000000000");
}
- dev = rt->fib6_nh.fib_nh_dev;
+ dev = fib6_nh->fib_nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
flags, dev ? dev->name : "");
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 545e339b8c4f..ad284b1fd308 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/pid_namespace.h>
+#include <linux/jump_label_ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock);
static DEFINE_SPINLOCK(ip6_sk_fl_lock);
+DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
+EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
+
#define for_each_fl_rcu(hash, fl) \
for (fl = rcu_dereference_bh(fl_ht[(hash)]); \
fl != NULL; \
@@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
return fl;
}
+static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
+{
+ return fl->share == IPV6_FL_S_EXCL ||
+ fl->share == IPV6_FL_S_PROCESS ||
+ fl->share == IPV6_FL_S_USER;
+}
+
static void fl_free_rcu(struct rcu_head *head)
{
struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
@@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head)
static void fl_free(struct ip6_flowlabel *fl)
{
- if (fl)
- call_rcu(&fl->rcu, fl_free_rcu);
+ if (!fl)
+ return;
+
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);
+
+ call_rcu(&fl->rcu, fl_free_rcu);
}
static void fl_release(struct ip6_flowlabel *fl)
@@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
/* Socket flowlabel lists */
-struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
+struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
{
struct ipv6_fl_socklist *sfl;
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
rcu_read_unlock_bh();
return NULL;
}
-EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
void fl6_free_socklist(struct sock *sk)
{
@@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
}
fl->dst = freq->flr_dst;
atomic_set(&fl->users, 1);
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
switch (fl->share) {
case IPV6_FL_S_EXCL:
case IPV6_FL_S_ANY:
@@ -854,6 +872,7 @@ int ip6_flowlabel_init(void)
void ip6_flowlabel_cleanup(void)
{
+ static_key_deferred_flush(&ipv6_flowlabel_exclusive);
del_timer(&ip6_fl_gc_timer);
unregister_pernet_subsys(&ip6_flowlabel_net_ops);
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 21efcd02f337..8e49fd62eea9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
return -EINVAL;
}
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
-
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
@@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
return ip6_finish_output2(net, sk, skb);
}
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip6_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip6_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
@@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id,
+ struct ip6_fraglist_iter *iter)
+{
+ unsigned int first_len;
+ struct frag_hdr *fh;
+
+ /* BUILD HEADER */
+ *prevhdr = NEXTHDR_FRAGMENT;
+ iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+ if (!iter->tmp_hdr)
+ return -ENOMEM;
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->hlen = hlen;
+ iter->frag_id = frag_id;
+ iter->nexthdr = nexthdr;
+
+ __skb_pull(skb, hlen);
+ fh = __skb_push(skb, sizeof(struct frag_hdr));
+ __skb_push(skb, hlen);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
+
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(IP6_MF);
+ fh->identification = frag_id;
+
+ first_len = skb_pagelen(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_fraglist_init);
+
+void ip6_fraglist_prepare(struct sk_buff *skb,
+ struct ip6_fraglist_iter *iter)
+{
+ struct sk_buff *frag = iter->frag;
+ unsigned int hlen = iter->hlen;
+ struct frag_hdr *fh;
+
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ fh = __skb_push(frag, sizeof(struct frag_hdr));
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
+ iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
+ fh->nexthdr = iter->nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(iter->offset);
+ if (frag->next)
+ fh->frag_off |= htons(IP6_MF);
+ fh->identification = iter->frag_id;
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+ ip6_copy_metadata(frag, skb);
+}
+EXPORT_SYMBOL(ip6_fraglist_prepare);
+
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+ unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
+{
+ state->prevhdr = prevhdr;
+ state->nexthdr = nexthdr;
+ state->frag_id = frag_id;
+
+ state->hlen = hlen;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->hroom = hdr_room;
+ state->troom = needed_tailroom;
+
+ state->offset = 0;
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
+{
+ u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
+ struct sk_buff *frag;
+ struct frag_hdr *fh;
+ unsigned int len;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left)
+ len &= ~7;
+
+ /* Allocate buffer */
+ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
+ state->hroom + state->troom, GFP_ATOMIC);
+ if (!frag)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip6_copy_metadata(frag, skb);
+ skb_reserve(frag, state->hroom);
+ skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
+ skb_reset_network_header(frag);
+ fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
+ frag->transport_header = (frag->network_header + state->hlen +
+ sizeof(struct frag_hdr));
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+ if (skb->sk)
+ skb_set_owner_w(frag, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
+
+ fragnexthdr_offset = skb_network_header(frag);
+ fragnexthdr_offset += prevhdr - skb_network_header(skb);
+ *fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
+ /*
+ * Build fragment header.
+ */
+ fh->nexthdr = state->nexthdr;
+ fh->reserved = 0;
+ fh->identification = state->frag_id;
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
+ len));
+ state->left -= len;
+
+ fh->frag_off = htons(state->offset);
+ if (state->left > 0)
+ fh->frag_off |= htons(IP6_MF);
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+
+ state->ptr += len;
+ state->offset += len;
+
+ return frag;
+}
+EXPORT_SYMBOL(ip6_frag_next);
+
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
@@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
inet6_sk(skb->sk) : NULL;
- struct ipv6hdr *tmp_hdr;
- struct frag_hdr *fh;
- unsigned int mtu, hlen, left, len, nexthdr_offset;
- int hroom, troom;
+ struct ip6_frag_state state;
+ unsigned int mtu, hlen, nexthdr_offset;
+ int hroom, err = 0;
__be32 frag_id;
- int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
err = ip6_find_1stfragopt(skb, &prevhdr);
@@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
@@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb->truesize -= frag->truesize;
}
- err = 0;
- offset = 0;
- /* BUILD HEADER */
-
- *prevhdr = NEXTHDR_FRAGMENT;
- tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
- if (!tmp_hdr) {
- err = -ENOMEM;
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
goto fail;
- }
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
-
- __skb_pull(skb, hlen);
- fh = __skb_push(skb, sizeof(struct frag_hdr));
- __skb_push(skb, hlen);
- skb_reset_network_header(skb);
- memcpy(skb_network_header(skb), tmp_hdr, hlen);
-
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(IP6_MF);
- fh->identification = frag_id;
-
- first_len = skb_pagelen(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- ipv6_hdr(skb)->payload_len = htons(first_len -
- sizeof(struct ipv6hdr));
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- fh = __skb_push(frag, sizeof(struct frag_hdr));
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), tmp_hdr,
- hlen);
- offset += skb->len - hlen - sizeof(struct frag_hdr);
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(offset);
- if (frag->next)
- fh->frag_off |= htons(IP6_MF);
- fh->identification = frag_id;
- ipv6_hdr(frag)->payload_len =
- htons(frag->len -
- sizeof(struct ipv6hdr));
- ip6_copy_metadata(frag, skb);
- }
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb_mark_not_on_list(skb);
+ skb = ip6_fraglist_next(&iter);
}
- kfree(tmp_hdr);
+ kfree(iter.tmp_hdr);
if (err == 0) {
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return 0;
}
- kfree_skb_list(frag);
+ kfree_skb_list(iter.frag);
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGFAILS);
@@ -766,91 +891,26 @@ slow_path_clean:
}
slow_path:
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- troom = rt->dst.dev->needed_tailroom;
+ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
+ LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
+ &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- u8 *fragnexthdr_offset;
-
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
- /* Allocate buffer */
- frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
- hroom + troom, GFP_ATOMIC);
- if (!frag) {
- err = -ENOMEM;
+ while (state.left > 0) {
+ frag = ip6_frag_next(skb, &state);
+ if (IS_ERR(frag)) {
+ err = PTR_ERR(frag);
goto fail;
}
/*
- * Set up data on packet
- */
-
- ip6_copy_metadata(frag, skb);
- skb_reserve(frag, hroom);
- skb_put(frag, len + hlen + sizeof(struct frag_hdr));
- skb_reset_network_header(frag);
- fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
- frag->transport_header = (frag->network_header + hlen +
- sizeof(struct frag_hdr));
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
- if (skb->sk)
- skb_set_owner_w(frag, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
- skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
-
- fragnexthdr_offset = skb_network_header(frag);
- fragnexthdr_offset += prevhdr - skb_network_header(skb);
- *fragnexthdr_offset = NEXTHDR_FRAGMENT;
-
- /*
- * Build fragment header.
- */
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->identification = frag_id;
-
- /*
- * Copy a block of the IP datagram.
- */
- BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
- len));
- left -= len;
-
- fh->frag_off = htons(offset);
- if (left > 0)
- fh->frag_off |= htons(IP6_MF);
- ipv6_hdr(frag)->payload_len = htons(frag->len -
- sizeof(struct ipv6hdr));
-
- ptr += len;
- offset += len;
-
- /*
* Put this fragment into the sending queue.
*/
err = output(net, sk, frag);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 51fd33294c7c..3752bd3e92ce 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void)
{
if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipcomp6_type, AF_INET6);
}
module_init(ipcomp6_init);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 91801432878c..878fcec14949 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -499,10 +499,8 @@ static void __exit mip6_fini(void)
{
if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
pr_info("%s: can't remove rawv6 mh filter\n", __func__);
- if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type(rthdr)\n", __func__);
- if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type(destopt)\n", __func__);
+ xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
}
module_init(mip6_init);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 09dd2edfb868..083cc1c94cd3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb)
!in6_dev->cnf.accept_ra_rtr_pref)
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
-
+ /* routes added from RAs do not use nexthop objects */
rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
-
if (rt) {
- neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
- rt->fib6_nh.fib_nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
@@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
- rt->fib6_nh.fib_nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 1240ccd57f39..61819ed858b1 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -16,6 +16,9 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
{
@@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst,
}
EXPORT_SYMBOL_GPL(__nf_ip6_route);
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ struct nf_ct_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_ct_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ struct ip6_frag_state state;
+ u8 *prevhdr, nexthdr = 0;
+ unsigned int mtu, hlen;
+ int hroom, err = 0;
+ __be32 frag_id;
+
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto blackhole;
+ hlen = err;
+ nexthdr = *prevhdr;
+
+ mtu = skb->dev->mtu;
+ if (frag_max_size > mtu ||
+ frag_max_size < IPV6_MIN_MTU)
+ goto blackhole;
+
+ mtu = frag_max_size;
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto blackhole;
+ mtu -= hlen + sizeof(struct frag_hdr);
+
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ hroom = LL_RESERVED_SPACE(skb->dev);
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
+ struct sk_buff *frag2;
+
+ if (first_len - hlen > mtu ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+ goto blackhole;
+
+ if (skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag2) {
+ if (frag2->len > mtu ||
+ skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+ goto blackhole;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag2))
+ goto slow_path;
+ }
+
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
+ goto blackhole;
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down.
+ */
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip6_fraglist_next(&iter);
+ }
+
+ kfree(iter.tmp_hdr);
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+ LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+ &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip6_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
+
static const struct nf_ipv6_ops ipv6ops = {
#if IS_MODULE(CONFIG_IPV6)
.chk_addr = ipv6_chk_addr,
.route_me_harder = ip6_route_me_harder,
.dev_get_saddr = ipv6_dev_get_saddr,
.route = __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ .cookie_init_sequence = __cookie_v6_init_sequence,
+ .cookie_v6_check = __cookie_v6_check,
+#endif
#endif
.route_input = ip6_route_input,
.fragment = ip6_fragment,
.reroute = nf_ip6_reroute,
+#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ .br_defrag = nf_ct_frag6_gather,
+#endif
+#if IS_MODULE(CONFIG_IPV6)
+ .br_fragment = br_ip6_fragment,
+#endif
};
int __init ipv6_netfilter_init(void)
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 41325d517478..e77ea1ed5edd 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -3,272 +3,11 @@
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_route.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct ipv6hdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr)
-{
- struct ipv6hdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = net->ipv6.devconf_all->hop_limit;
- iph->nexthdr = IPPROTO_TCP;
- iph->saddr = *saddr;
- iph->daddr = *daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct ipv6hdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- struct dst_entry *dst;
- struct flowi6 fl6;
-
- nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_TCP;
- fl6.saddr = niph->saddr;
- fl6.daddr = niph->daddr;
- fl6.fl6_sport = nth->source;
- fl6.fl6_dport = nth->dest;
- security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
- dst = ip6_route_output(net, NULL, &fl6);
- if (dst->error) {
- dst_release(dst);
- goto free_nskb;
- }
- dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
- if (IS_ERR(dst))
- goto free_nskb;
-
- skb_dst_set(nskb, dst);
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip6_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
@@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(net, skb, th, &opts);
+ synproxy_send_client_synack_ipv6(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq))) {
consume_skb(skb);
return NF_STOLEN;
} else {
@@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv6_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- __be16 frag_off;
- u8 nexthdr;
- int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb))
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (thoff < 0 || nexthdr != IPPROTO_TCP)
- return NF_ACCEPT;
-
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv6_synproxy_ops[] = {
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg6_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref6 == 0) {
- err = nf_register_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv6_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref6++;
return err;
}
@@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref6--;
- if (snet->hook_ref6 == 0)
- nf_unregister_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
+ nf_synproxy_ipv6_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 3f7d4691c423..a22100b1cf2c 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -2,7 +2,7 @@
/*
* IPv6 raw table, a port of the IPv4 raw table to IPv6
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 84322ce81d70..398e1df41406 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -54,26 +54,21 @@ static struct inet_frags nf_frags;
static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
- .data = &init_net.nf_frag.frags.timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_frag6_low_thresh",
- .data = &init_net.nf_frag.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.nf_frag.frags.high_thresh
},
{
.procname = "nf_conntrack_frag6_high_thresh",
- .data = &init_net.nf_frag.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.nf_frag.frags.low_thresh
},
{ }
};
@@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
GFP_KERNEL);
if (table == NULL)
goto err_alloc;
-
- table[0].data = &net->nf_frag.frags.timeout;
- table[1].data = &net->nf_frag.frags.low_thresh;
- table[1].extra2 = &net->nf_frag.frags.high_thresh;
- table[2].data = &net->nf_frag.frags.high_thresh;
- table[2].extra1 = &net->nf_frag.frags.low_thresh;
- table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
}
+ table[0].data = &net->nf_frag.fqdir->timeout;
+ table[1].data = &net->nf_frag.fqdir->low_thresh;
+ table[1].extra2 = &net->nf_frag.fqdir->high_thresh;
+ table[2].data = &net->nf_frag.fqdir->high_thresh;
+ table[2].extra1 = &net->nf_frag.fqdir->low_thresh;
+ table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh;
+
hdr = register_net_sysctl(net, "net/netfilter", table);
if (hdr == NULL)
goto err_reg;
@@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, nf_frag.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
/* Creation primitives. */
@@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->nf_frag.frags, &key);
+ q = inet_frag_find(net->nf_frag.fqdir, &key);
if (!q)
return NULL;
@@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
fq->q.max_size = payload_len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net)
{
int res;
- net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->nf_frag.frags.f = &nf_frags;
-
- res = inet_frags_init_net(&net->nf_frag.frags);
+ res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net);
if (res < 0)
return res;
+
+ net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(net->nf_frag.fqdir);
return res;
}
+static void nf_ct_net_pre_exit(struct net *net)
+{
+ fqdir_pre_exit(net->nf_frag.fqdir);
+}
+
static void nf_ct_net_exit(struct net *net)
{
nf_ct_frags6_sysctl_unregister(net);
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(net->nf_frag.fqdir);
}
static struct pernet_operations nf_ct_net_ops = {
- .init = nf_ct_net_init,
- .exit = nf_ct_net_exit,
+ .init = nf_ct_net_init,
+ .pre_exit = nf_ct_net_pre_exit,
+ .exit = nf_ct_net_exit,
};
static const struct rhashtable_params nfct_rhash_params = {
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 4a8da679866e..bbff3e02e302 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot));
seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
- atomic_read(&net->ipv6.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv6.frags));
+ atomic_read(&net->ipv6.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv6.fqdir));
return 0;
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 70693bc7ad9d..8a6131991e38 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b2b2c0c38b87..ca05b16f1bb9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, ipv6.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
static struct frag_queue *
@@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
IPV6_ADDR_LINKLOCAL)))
key.iif = 0;
- q = inet_frag_find(&net->ipv6.frags, &key);
+ q = inet_frag_find(net->ipv6.fqdir, &key);
if (!q)
return NULL;
@@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
fq->ecn |= ecn;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
fragsize = -skb_network_offset(skb) + skb->len;
if (fragsize > fq->q.max_size)
@@ -250,7 +248,7 @@ err:
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
struct sk_buff *prev_tail, struct net_device *dev)
{
- struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+ struct net *net = fq->q.fqdir->net;
unsigned int nhoff;
void *reasm_data;
int payload_len;
@@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = {
static struct ctl_table ip6_frags_ns_ctl_table[] = {
{
.procname = "ip6frag_high_thresh",
- .data = &init_net.ipv6.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv6.frags.low_thresh
},
{
.procname = "ip6frag_low_thresh",
- .data = &init_net.ipv6.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv6.frags.high_thresh
},
{
.procname = "ip6frag_time",
- .data = &init_net.ipv6.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv6.frags.high_thresh;
- table[0].extra1 = &net->ipv6.frags.low_thresh;
- table[1].data = &net->ipv6.frags.low_thresh;
- table[1].extra2 = &net->ipv6.frags.high_thresh;
- table[2].data = &net->ipv6.frags.timeout;
}
+ table[0].data = &net->ipv6.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv6.fqdir->low_thresh;
+ table[1].data = &net->ipv6.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv6.fqdir->high_thresh;
+ table[2].data = &net->ipv6.fqdir->timeout;
hdr = register_net_sysctl(net, "net/ipv6", table);
if (!hdr)
@@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net)
{
int res;
- net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->ipv6.frags.f = &ip6_frags;
-
- res = inet_frags_init_net(&net->ipv6.frags);
+ res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
if (res < 0)
return res;
+ net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = ip6_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
return res;
}
+static void __net_exit ipv6_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv6.fqdir);
+}
+
static void __net_exit ipv6_frags_exit_net(struct net *net)
{
ip6_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
}
static struct pernet_operations ip6_frags_ops = {
- .init = ipv6_frags_init_net,
- .exit = ipv6_frags_exit_net,
+ .init = ipv6_frags_init_net,
+ .pre_exit = ipv6_frags_pre_exit_net,
+ .exit = ipv6_frags_exit_net,
};
static const struct rhashtable_params ip6_rhash_params = {
@@ -587,8 +585,8 @@ err_protocol:
void ipv6_frag_exit(void)
{
- inet_frags_fini(&ip6_frags);
ip6_frags_sysctl_unregister();
unregister_pernet_subsys(&ip6_frags_ops);
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ inet_frags_fini(&ip6_frags);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 97a843cf164c..4d2e6b31a8d6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
int strict);
-static size_t rt6_nlmsg_size(struct fib6_info *rt);
+static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
@@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
}
if (rt_dev == dev) {
- rt->dst.dev = loopback_dev;
+ rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
@@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
- if (!match->fib6_nsiblings || have_oif_match)
+ if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
goto out;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
- if (!fl6->mp_hash)
+ if (!fl6->mp_hash &&
+ (!match->nh || nexthop_is_multipath(match->nh)))
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
- if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
+ if (unlikely(match->nh)) {
+ nexthop_path_fib6_result(res, fl6->mp_hash);
+ return;
+ }
+
+ if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
fib6_siblings) {
- const struct fib6_nh *nh = &sibling->fib6_nh;
+ const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
@@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
out:
res->f6i = match;
- res->nh = &match->fib6_nh;
+ res->nh = match->fib6_nh;
}
/*
@@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
return false;
}
+struct fib6_nh_dm_arg {
+ struct net *net;
+ const struct in6_addr *saddr;
+ int oif;
+ int flags;
+ struct fib6_nh *nh;
+};
+
+static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_dm_arg *arg = _arg;
+
+ arg->nh = nh;
+ return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
+ arg->flags);
+}
+
+/* returns fib6_nh from nexthop or NULL */
+static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
+ struct fib6_result *res,
+ const struct in6_addr *saddr,
+ int oif, int flags)
+{
+ struct fib6_nh_dm_arg arg = {
+ .net = net,
+ .saddr = saddr,
+ .oif = oif,
+ .flags = flags,
+ };
+
+ if (nexthop_is_blackhole(nh))
+ return NULL;
+
+ if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
+ return arg.nh;
+
+ return NULL;
+}
+
static void rt6_device_match(struct net *net, struct fib6_result *res,
const struct in6_addr *saddr, int oif, int flags)
{
@@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
struct fib6_nh *nh;
if (!oif && ipv6_addr_any(saddr)) {
- nh = &f6i->fib6_nh;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
if (!(nh->fib_nh_flags & RTNH_F_DEAD))
goto out;
}
for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
- nh = &spf6i->fib6_nh;
- if (__rt6_device_match(net, nh, saddr, oif, flags)) {
+ bool matched = false;
+
+ if (unlikely(spf6i->nh)) {
+ nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
+ oif, flags);
+ if (nh)
+ matched = true;
+ } else {
+ nh = spf6i->fib6_nh;
+ if (__rt6_device_match(net, nh, saddr, oif, flags))
+ matched = true;
+ }
+ if (matched) {
res->f6i = spf6i;
goto out;
}
@@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
if (oif && flags & RT6_LOOKUP_F_IFACE) {
res->f6i = net->ipv6.fib6_null_entry;
- nh = &res->f6i->fib6_nh;
+ nh = res->f6i->fib6_nh;
goto out;
}
- nh = &f6i->fib6_nh;
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
+
if (nh->fib_nh_flags & RTNH_F_DEAD) {
res->f6i = net->ipv6.fib6_null_entry;
- nh = &res->f6i->fib6_nh;
+ nh = res->f6i->fib6_nh;
}
out:
res->nh = nh;
res->fib6_type = res->f6i->fib6_type;
res->fib6_flags = res->f6i->fib6_flags;
+ return;
+
+out_blackhole:
+ res->fib6_flags |= RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->nh = nh;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -691,6 +766,24 @@ out:
return rc;
}
+struct fib6_nh_frl_arg {
+ u32 flags;
+ int oif;
+ int strict;
+ int *mpri;
+ bool *do_rr;
+ struct fib6_nh *nh;
+};
+
+static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_frl_arg *arg = _arg;
+
+ arg->nh = nh;
+ return find_match(nh, arg->flags, arg->oif, arg->strict,
+ arg->mpri, arg->do_rr);
+}
+
static void __find_rr_leaf(struct fib6_info *f6i_start,
struct fib6_info *nomatch, u32 metric,
struct fib6_result *res, struct fib6_info **cont,
@@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
for (f6i = f6i_start;
f6i && f6i != nomatch;
f6i = rcu_dereference(f6i->fib6_next)) {
+ bool matched = false;
struct fib6_nh *nh;
if (cont && f6i->fib6_metric != metric) {
@@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
if (fib6_check_expired(f6i))
continue;
- nh = &f6i->fib6_nh;
- if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
+ if (unlikely(f6i->nh)) {
+ struct fib6_nh_frl_arg arg = {
+ .flags = f6i->fib6_flags,
+ .oif = oif,
+ .strict = strict,
+ .mpri = mpri,
+ .do_rr = do_rr
+ };
+
+ if (nexthop_is_blackhole(f6i->nh)) {
+ res->fib6_flags = RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->f6i = f6i;
+ res->nh = nexthop_fib6_nh(f6i->nh);
+ return;
+ }
+ if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
+ &arg)) {
+ matched = true;
+ nh = arg.nh;
+ }
+ } else {
+ nh = f6i->fib6_nh;
+ if (find_match(nh, f6i->fib6_flags, oif, strict,
+ mpri, do_rr))
+ matched = true;
+ }
+ if (matched) {
res->f6i = f6i;
res->nh = nh;
res->fib6_flags = f6i->fib6_flags;
@@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
out:
if (!res->f6i) {
res->f6i = net->ipv6.fib6_null_entry;
- res->nh = &res->f6i->fib6_nh;
+ res->nh = res->f6i->fib6_nh;
res->fib6_flags = res->f6i->fib6_flags;
res->fib6_type = res->f6i->fib6_type;
}
@@ -1114,6 +1234,8 @@ restart:
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
goto out;
+ } else if (res.fib6_flags & RTF_REJECT) {
+ goto do_create;
}
fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
@@ -1125,6 +1247,7 @@ restart:
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
} else {
+do_create:
rt = ip6_create_rt_rcu(&res);
}
@@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
- struct rt6_info *pcpu_rt, **p;
-
- p = this_cpu_ptr(res->f6i->rt6i_pcpu);
- pcpu_rt = *p;
+ struct rt6_info *pcpu_rt;
- if (pcpu_rt)
- ip6_hold_safe(NULL, &pcpu_rt);
+ pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
return pcpu_rt;
}
@@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
struct rt6_info *pcpu_rt, *prev, **p;
pcpu_rt = ip6_rt_pcpu_alloc(res);
- if (!pcpu_rt) {
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return net->ipv6.ip6_null_entry;
- }
+ if (!pcpu_rt)
+ return NULL;
- dst_hold(&pcpu_rt->dst);
- p = this_cpu_ptr(res->f6i->rt6i_pcpu);
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
@@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
+#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
+
+/* used when the flushed bit is not relevant, only access to the bucket
+ * (ie., all bucket users except rt6_insert_exception);
+ *
+ * called under rcu lock; sometimes called with rt6_exception_lock held
+ */
+static
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+
+ if (lock)
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+ else
+ bucket = rcu_dereference(nh->rt6i_exception_bucket);
+
+ /* remove bucket flushed bit if set */
+ if (bucket) {
+ unsigned long p = (unsigned long)bucket;
+
+ p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ }
+
+ return bucket;
+}
+
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
+{
+ unsigned long p = (unsigned long)bucket;
+
+ return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
+}
+
+/* called with rt6_exception_lock held */
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+ unsigned long p;
+
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+
+ p = (unsigned long)bucket;
+ p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+}
+
static int rt6_insert_exception(struct rt6_info *nrt,
const struct fib6_result *res)
{
struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
+ struct fib6_info *f6i = res->f6i;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *f6i = res->f6i;
+ struct fib6_nh *nh = res->nh;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
- if (f6i->exception_bucket_flushed) {
- err = -EINVAL;
- goto out;
- }
-
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
GFP_ATOMIC);
@@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt,
err = -ENOMEM;
goto out;
}
- rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+ } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
+ err = -EINVAL;
+ goto out;
}
#ifdef CONFIG_IPV6_SUBTREES
@@ -1539,7 +1707,7 @@ out:
return err;
}
-void rt6_flush_exceptions(struct fib6_info *rt)
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt)
int i;
spin_lock_bh(&rt6_exception_lock);
- /* Prevent rt6_insert_exception() to recreate the bucket list */
- rt->exception_bucket_flushed = 1;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
goto out;
+ /* Prevent rt6_insert_exception() to recreate the bucket list */
+ if (!from)
+ fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
+
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
- hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
- rt6_remove_exception(bucket, rt6_ex);
- WARN_ON_ONCE(bucket->depth);
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+ if (!from ||
+ rcu_access_pointer(rt6_ex->rt6i->from) == from)
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ WARN_ON_ONCE(!from && bucket->depth);
bucket++;
}
-
out:
spin_unlock_bh(&rt6_exception_lock);
}
+static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_info *f6i = arg;
+
+ fib6_nh_flush_exceptions(nh, f6i);
+
+ return 0;
+}
+
+void rt6_flush_exceptions(struct fib6_info *f6i)
+{
+ if (f6i->nh)
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
+ f6i);
+ else
+ fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
+}
+
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
@@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
src_key = saddr;
find_ex:
#endif
- bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
+ bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
@@ -1612,25 +1801,20 @@ find_ex:
}
/* Remove the passed in cached rt from the hash table that contains it */
-static int rt6_remove_exception_rt(struct rt6_info *rt)
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
{
+ const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *from;
int err;
- from = rcu_dereference(rt->from);
- if (!from ||
- !(rt->rt6i_flags & RTF_CACHE))
- return -EINVAL;
-
- if (!rcu_access_pointer(from->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return -ENOENT;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
return err;
}
-/* Find rt6_ex which contains the passed in rt cache and
- * refresh its stamp
- */
-static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+struct fib6_nh_excptn_arg {
+ struct rt6_info *rt;
+ int plen;
+};
+
+static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_excptn_arg *arg = _arg;
+ int err;
+
+ err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
+ if (err == 0)
+ return 1;
+
+ return 0;
+}
+
+static int rt6_remove_exception_rt(struct rt6_info *rt)
{
- struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
- struct rt6_exception *rt6_ex;
struct fib6_info *from;
- rcu_read_lock();
from = rcu_dereference(rt->from);
if (!from || !(rt->rt6i_flags & RTF_CACHE))
- goto unlock;
+ return -EINVAL;
- bucket = rcu_dereference(from->rt6i_exception_bucket);
+ if (from->nh) {
+ struct fib6_nh_excptn_arg arg = {
+ .rt = rt,
+ .plen = from->fib6_src.plen
+ };
+ int rc;
+
+ /* rc = 1 means an entry was found */
+ rc = nexthop_for_each_fib6_nh(from->nh,
+ rt6_nh_remove_exception_rt,
+ &arg);
+ return rc ? 0 : -ENOENT;
+ }
+ return fib6_nh_remove_exception(from->fib6_nh,
+ from->fib6_src.plen, rt);
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
+{
+ const struct in6_addr *src_key = NULL;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
- rt6_ex = __rt6_find_exception_rcu(&bucket,
- &rt->rt6i_dst.addr,
- src_key);
+ rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
if (rt6_ex)
rt6_ex->stamp = jiffies;
+}
+
+struct fib6_nh_match_arg {
+ const struct net_device *dev;
+ const struct in6_addr *gw;
+ struct fib6_nh *match;
+};
+
+/* determine if fib6_nh has given device and gateway */
+static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_match_arg *arg = _arg;
+
+ if (arg->dev != nh->fib_nh_dev ||
+ (arg->gw && !nh->fib_nh_gw_family) ||
+ (!arg->gw && nh->fib_nh_gw_family) ||
+ (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
+ return 0;
+
+ arg->match = nh;
+
+ /* found a match, break the loop */
+ return 1;
+}
+
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+ struct fib6_info *from;
+ struct fib6_nh *fib6_nh;
+
+ rcu_read_lock();
+
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ goto unlock;
+ if (from->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = rt->dst.dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
+
+ if (!arg.match)
+ return;
+ fib6_nh = arg.match;
+ } else {
+ fib6_nh = from->fib6_nh;
+ }
+ fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
rcu_read_unlock();
}
@@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
- struct fib6_info *rt, int mtu)
+ const struct fib6_nh *nh, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
return;
@@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
-static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
- struct in6_addr *gateway)
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
+ const struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
gc_args->more++;
}
-void rt6_age_exceptions(struct fib6_info *rt,
- struct fib6_gc_args *gc_args,
- unsigned long now)
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
rcu_read_lock_bh();
spin_lock(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
+struct fib6_nh_age_excptn_arg {
+ struct fib6_gc_args *gc_args;
+ unsigned long now;
+};
+
+static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_age_excptn_arg *arg = _arg;
+
+ fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
+ return 0;
+}
+
+void rt6_age_exceptions(struct fib6_info *f6i,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ if (f6i->nh) {
+ struct fib6_nh_age_excptn_arg arg = {
+ .gc_args = gc_args,
+ .now = now
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
+ &arg);
+ } else {
+ fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
+ }
+}
+
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, struct fib6_result *res, int strict)
@@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
const struct sk_buff *skb, int flags)
{
struct fib6_result res = {};
- struct rt6_info *rt;
+ struct rt6_info *rt = NULL;
int strict = 0;
+ WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
+ !rcu_read_lock_held());
+
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
if (net->ipv6.devconf_all->forwarding == 0)
@@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
fib6_table_lookup(net, table, oif, fl6, &res, strict);
- if (res.f6i == net->ipv6.fib6_null_entry) {
- rt = net->ipv6.ip6_null_entry;
- rcu_read_unlock();
- dst_hold(&rt->dst);
- return rt;
- }
+ if (res.f6i == net->ipv6.fib6_null_entry)
+ goto out;
fib6_select_path(net, &res, fl6, oif, false, skb, strict);
/*Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
- if (ip6_hold_safe(net, &rt))
- dst_use_noref(&rt->dst, jiffies);
-
- rcu_read_unlock();
- return rt;
+ goto out;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
@@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
- struct rt6_info *uncached_rt;
-
- uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
+ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
- rcu_read_unlock();
-
- if (uncached_rt) {
- /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
- * No need for another dst_hold()
+ if (rt) {
+ /* 1 refcnt is taken during ip6_rt_cache_alloc().
+ * As rt6_uncached_list_add() does not consume refcnt,
+ * this refcnt is always returned to the caller even
+ * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
- rt6_uncached_list_add(uncached_rt);
+ rt6_uncached_list_add(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
- } else {
- uncached_rt = net->ipv6.ip6_null_entry;
- dst_hold(&uncached_rt->dst);
- }
+ rcu_read_unlock();
- return uncached_rt;
+ return rt;
+ }
} else {
/* Get a percpu copy */
-
- struct rt6_info *pcpu_rt;
-
local_bh_disable();
- pcpu_rt = rt6_get_pcpu_route(&res);
+ rt = rt6_get_pcpu_route(&res);
- if (!pcpu_rt)
- pcpu_rt = rt6_make_pcpu_route(net, &res);
+ if (!rt)
+ rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
- rcu_read_unlock();
-
- return pcpu_rt;
}
+out:
+ if (!rt)
+ rt = net->ipv6.ip6_null_entry;
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ ip6_hold_safe(net, &rt);
+ rcu_read_unlock();
+
+ return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -2084,17 +2370,54 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (skb) {
+ struct flow_keys keys;
+
+ if (!flkeys) {
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ flkeys = &keys;
+ }
+
+ /* Inner can be v4 or v6 */
+ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+ } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = flkeys->tags.flow_label;
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ }
+ break;
}
mhash = flow_hash_from_keys(&hash_keys);
return mhash >> 1;
}
+/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
- int flags = RT6_LOOKUP_F_HAS_SADDR;
+ int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
.flowi6_iif = skb->dev->ifindex,
@@ -2116,8 +2439,8 @@ void ip6_route_input(struct sk_buff *skb)
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
skb_dst_drop(skb);
- skb_dst_set(skb,
- ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
+ skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
+ &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
@@ -2129,8 +2452,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net,
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
-struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
- struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6, int flags)
{
bool any_src;
@@ -2138,6 +2462,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
+ /* This function does not take refcnt on the dst */
dst = l3mdev_link_scope_lookup(net, fl6);
if (dst)
return dst;
@@ -2145,6 +2470,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
fl6->flowi6_iif = LOOPBACK_IFINDEX;
+ flags |= RT6_LOOKUP_F_DST_NOREF;
any_src = ipv6_addr_any(&fl6->saddr);
if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
(fl6->flowi6_oif && any_src))
@@ -2157,6 +2483,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
+EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
+
+struct dst_entry *ip6_route_output_flags(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
+{
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
+
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = (struct rt6_info *)dst;
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
+
+ return dst;
+}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@ -2381,10 +2729,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
rcu_read_unlock();
return;
}
- res.nh = &res.f6i->fib6_nh;
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst->dev,
+ .gw = &rt6->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev + gw. Should be impossible.
+ */
+ if (!arg.match) {
+ rcu_read_unlock();
+ return;
+ }
+
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
@@ -2491,6 +2860,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res,
return true;
}
+struct fib6_nh_rd_arg {
+ struct fib6_result *res;
+ struct flowi6 *fl6;
+ const struct in6_addr *gw;
+ struct rt6_info **ret;
+};
+
+static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_rd_arg *arg = _arg;
+
+ arg->res->nh = nh;
+ return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
+}
+
/* Handle redirects */
struct ip6rd_flowi {
struct flowi6 fl6;
@@ -2506,6 +2890,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
struct rt6_info *ret = NULL;
struct fib6_result res = {};
+ struct fib6_nh_rd_arg arg = {
+ .res = &res,
+ .fl6 = fl6,
+ .gw = &rdfl->gateway,
+ .ret = &ret
+ };
struct fib6_info *rt;
struct fib6_node *fn;
@@ -2530,14 +2920,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
restart:
for_each_fib6_node_rt_rcu(fn) {
res.f6i = rt;
- res.nh = &rt->fib6_nh;
-
if (fib6_check_expired(rt))
continue;
if (rt->fib6_flags & RTF_REJECT)
break;
- if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
- goto out;
+ if (unlikely(rt->nh)) {
+ if (nexthop_is_blackhole(rt->nh))
+ continue;
+ /* on match, res->nh is filled in and potentially ret */
+ if (nexthop_for_each_fib6_nh(rt->nh,
+ fib6_nh_redirect_match,
+ &arg))
+ goto out;
+ } else {
+ res.nh = rt->fib6_nh;
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
+ &ret))
+ goto out;
+ }
}
if (!rt)
@@ -2554,7 +2954,7 @@ restart:
}
res.f6i = rt;
- res.nh = &rt->fib6_nh;
+ res.nh = rt->fib6_nh;
out:
if (ret) {
ip6_hold_safe(net, &ret);
@@ -2781,10 +3181,9 @@ out:
return entries > rt_max_size;
}
-static struct rt6_info *ip6_nh_lookup_table(struct net *net,
- struct fib6_config *cfg,
- const struct in6_addr *gw_addr,
- u32 tbid, int flags)
+static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
+ const struct in6_addr *gw_addr, u32 tbid,
+ int flags, struct fib6_result *res)
{
struct flowi6 fl6 = {
.flowi6_oif = cfg->fc_ifindex,
@@ -2792,25 +3191,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
.saddr = cfg->fc_prefsrc,
};
struct fib6_table *table;
- struct rt6_info *rt;
+ int err;
table = fib6_get_table(net, tbid);
if (!table)
- return NULL;
+ return -EINVAL;
if (!ipv6_addr_any(&cfg->fc_prefsrc))
flags |= RT6_LOOKUP_F_HAS_SADDR;
flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
- rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
- /* if table lookup failed, fall back to full lookup */
- if (rt == net->ipv6.ip6_null_entry) {
- ip6_rt_put(rt);
- rt = NULL;
- }
+ err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
+ fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
- return rt;
+ return err;
}
static int ip6_route_check_nh_onlink(struct net *net,
@@ -2818,29 +3215,19 @@ static int ip6_route_check_nh_onlink(struct net *net,
const struct net_device *dev,
struct netlink_ext_ack *extack)
{
- u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
const struct in6_addr *gw_addr = &cfg->fc_gateway;
- u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
- struct fib6_info *from;
- struct rt6_info *grt;
+ struct fib6_result res = {};
int err;
- err = 0;
- grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
- if (grt) {
- rcu_read_lock();
- from = rcu_dereference(grt->from);
- if (!grt->dst.error &&
- /* ignore match if it is the default route */
- from && !ipv6_addr_any(&from->fib6_dst.addr) &&
- (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway or device mismatch");
- err = -EINVAL;
- }
- rcu_read_unlock();
-
- ip6_rt_put(grt);
+ err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
+ if (!err && !(res.fib6_flags & RTF_REJECT) &&
+ /* ignore match if it is the default route */
+ !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
+ (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop has invalid gateway or device mismatch");
+ err = -EINVAL;
}
return err;
@@ -2853,47 +3240,50 @@ static int ip6_route_check_nh(struct net *net,
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
struct net_device *dev = _dev ? *_dev : NULL;
- struct rt6_info *grt = NULL;
+ int flags = RT6_LOOKUP_F_IFACE;
+ struct fib6_result res = {};
int err = -EHOSTUNREACH;
if (cfg->fc_table) {
- int flags = RT6_LOOKUP_F_IFACE;
-
- grt = ip6_nh_lookup_table(net, cfg, gw_addr,
- cfg->fc_table, flags);
- if (grt) {
- if (grt->rt6i_flags & RTF_GATEWAY ||
- (dev && dev != grt->dst.dev)) {
- ip6_rt_put(grt);
- grt = NULL;
- }
- }
+ err = ip6_nh_lookup_table(net, cfg, gw_addr,
+ cfg->fc_table, flags, &res);
+ /* gw_addr can not require a gateway or resolve to a reject
+ * route. If a device is given, it must match the result.
+ */
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family ||
+ (dev && dev != res.nh->fib_nh_dev))
+ err = -EHOSTUNREACH;
}
- if (!grt)
- grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
+ if (err < 0) {
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ };
- if (!grt)
- goto out;
+ err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family)
+ err = -EHOSTUNREACH;
+ if (err)
+ return err;
+
+ fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
+ }
+
+ err = 0;
if (dev) {
- if (dev != grt->dst.dev) {
- ip6_rt_put(grt);
- goto out;
- }
+ if (dev != res.nh->fib_nh_dev)
+ err = -EHOSTUNREACH;
} else {
- *_dev = dev = grt->dst.dev;
- *idev = grt->rt6i_idev;
+ *_dev = dev = res.nh->fib_nh_dev;
dev_hold(dev);
- in6_dev_hold(grt->rt6i_idev);
+ *idev = in6_dev_get(dev);
}
- if (!(grt->rt6i_flags & RTF_GATEWAY))
- err = 0;
-
- ip6_rt_put(grt);
-
-out:
return err;
}
@@ -2934,11 +3324,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
goto out;
}
+ rcu_read_lock();
+
if (cfg->fc_flags & RTNH_F_ONLINK)
err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
else
err = ip6_route_check_nh(net, cfg, _dev, idev);
+ rcu_read_unlock();
+
if (err)
goto out;
}
@@ -3039,7 +3433,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
goto out;
}
}
- goto set_dev;
+ goto pcpu_alloc;
}
if (cfg->fc_flags & RTF_GATEWAY) {
@@ -3075,7 +3469,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
goto out;
-set_dev:
+
+pcpu_alloc:
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu) {
+ err = -ENOMEM;
+ goto out;
+ }
+
fib6_nh->fib_nh_dev = dev;
fib6_nh->fib_nh_oif = dev->ifindex;
err = 0;
@@ -3095,6 +3496,38 @@ out:
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
+ struct rt6_exception_bucket *bucket;
+
+ rcu_read_lock();
+
+ fib6_nh_flush_exceptions(fib6_nh, NULL);
+ bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
+ if (bucket) {
+ rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
+ kfree(bucket);
+ }
+
+ rcu_read_unlock();
+
+ if (fib6_nh->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ *ppcpu_rt = NULL;
+ }
+ }
+
+ free_percpu(fib6_nh->rt6i_pcpu);
+ }
+
fib_nh_common_release(&fib6_nh->nh_common);
}
@@ -3104,7 +3537,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
{
struct net *net = cfg->fc_nlinfo.nl_net;
struct fib6_info *rt = NULL;
+ struct nexthop *nh = NULL;
struct fib6_table *table;
+ struct fib6_nh *fib6_nh;
int err = -EINVAL;
int addr_type;
@@ -3140,6 +3575,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
}
#endif
+ if (cfg->fc_nh_id) {
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto out;
+ }
+ err = fib6_check_nexthop(nh, cfg, extack);
+ if (err)
+ goto out;
+ }
err = -ENOBUFS;
if (cfg->fc_nlinfo.nlh &&
@@ -3157,7 +3602,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
err = -ENOMEM;
- rt = fib6_info_alloc(gfp_flags);
+ rt = fib6_info_alloc(gfp_flags, !nh);
if (!rt)
goto out;
@@ -3197,19 +3642,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
- err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
- if (err)
- goto out;
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ goto out;
+ }
+ if (rt->fib6_src.plen) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto out;
+ }
+ rt->nh = nh;
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+ } else {
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+ if (err)
+ goto out;
- /* We cannot add true routes via loopback here,
- * they would result in kernel looping; promote them to reject routes
- */
- addr_type = ipv6_addr_type(&cfg->fc_dst);
- if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
- rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+ fib6_nh = rt->fib6_nh;
+
+ /* We cannot add true routes via loopback here, they would
+ * result in kernel looping; promote them to reject routes
+ */
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+ addr_type))
+ rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
+ }
if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
- struct net_device *dev = fib6_info_nh_dev(rt);
+ struct net_device *dev = fib6_nh->fib_nh_dev;
if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
NL_SET_ERR_MSG(extack, "Invalid source address");
@@ -3301,6 +3762,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
info->skip_notify = 1;
}
+ info->skip_notify_kernel = 1;
+ call_fib6_multipath_entry_notifiers(net,
+ FIB_EVENT_ENTRY_DEL,
+ rt,
+ rt->fib6_nsiblings,
+ NULL);
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings,
fib6_siblings) {
@@ -3323,7 +3790,7 @@ out_put:
return err;
}
-static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
int rc = -ESRCH;
@@ -3339,10 +3806,49 @@ out:
return rc;
}
+static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
+ struct fib6_nh *nh)
+{
+ struct fib6_result res = {
+ .f6i = rt,
+ .nh = nh,
+ };
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
+ if (rt_cache)
+ return __ip6_del_cached_rt(rt_cache, cfg);
+
+ return 0;
+}
+
+struct fib6_nh_del_cached_rt_arg {
+ struct fib6_config *cfg;
+ struct fib6_info *f6i;
+};
+
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_del_cached_rt_arg *arg = _arg;
+ int rc;
+
+ rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
+ return rc != -ESRCH ? rc : 0;
+}
+
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
+{
+ struct fib6_nh_del_cached_rt_arg arg = {
+ .cfg = cfg,
+ .f6i = f6i
+ };
+
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
+}
+
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rt6_info *rt_cache;
struct fib6_table *table;
struct fib6_info *rt;
struct fib6_node *fn;
@@ -3365,26 +3871,45 @@ static int ip6_route_del(struct fib6_config *cfg,
for_each_fib6_node_rt_rcu(fn) {
struct fib6_nh *nh;
+ if (rt->nh && cfg->fc_nh_id &&
+ rt->nh->id != cfg->fc_nh_id)
+ continue;
+
if (cfg->fc_flags & RTF_CACHE) {
- struct fib6_result res = {
- .f6i = rt,
- };
- int rc;
-
- rt_cache = rt6_find_cached_rt(&res,
- &cfg->fc_dst,
- &cfg->fc_src);
- if (rt_cache) {
- rc = ip6_del_cached_rt(rt_cache, cfg);
- if (rc != -ESRCH) {
- rcu_read_unlock();
- return rc;
- }
+ int rc = 0;
+
+ if (rt->nh) {
+ rc = ip6_del_cached_rt_nh(cfg, rt);
+ } else if (cfg->fc_nh_id) {
+ continue;
+ } else {
+ nh = rt->fib6_nh;
+ rc = ip6_del_cached_rt(cfg, rt, nh);
+ }
+ if (rc != -ESRCH) {
+ rcu_read_unlock();
+ return rc;
}
continue;
}
- nh = &rt->fib6_nh;
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
+ continue;
+ if (cfg->fc_protocol &&
+ cfg->fc_protocol != rt->fib6_protocol)
+ continue;
+
+ if (rt->nh) {
+ if (!fib6_info_hold_safe(rt))
+ continue;
+ rcu_read_unlock();
+
+ return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ }
+ if (cfg->fc_nh_id)
+ continue;
+
+ nh = rt->fib6_nh;
if (cfg->fc_ifindex &&
(!nh->fib_nh_dev ||
nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
@@ -3392,10 +3917,6 @@ static int ip6_route_del(struct fib6_config *cfg,
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
- continue;
- if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
- continue;
if (!fib6_info_hold_safe(rt))
continue;
rcu_read_unlock();
@@ -3506,7 +4027,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (!res.f6i)
goto out;
- res.nh = &res.f6i->fib6_nh;
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst->dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev. Should be impossible
+ */
+ if (!arg.match)
+ goto out;
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
@@ -3558,12 +4097,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
+ /* these routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
continue;
if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
- !rt->fib6_nh.fib_nh_gw_family)
+ !rt->fib6_nh->fib_nh_gw_family)
continue;
- if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
+ if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
continue;
if (!fib6_info_hold_safe(rt))
continue;
@@ -3621,8 +4163,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- struct fib6_nh *nh = &rt->fib6_nh;
+ struct fib6_nh *nh;
+
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ nh = rt->fib6_nh;
if (dev == nh->fib_nh_dev &&
((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
ipv6_addr_equal(&nh->fib_nh_gw6, addr))
@@ -3873,7 +4420,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
- if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
+ if (!rt->nh &&
+ ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
rt != net->ipv6.fib6_null_entry &&
ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
spin_lock_bh(&rt6_exception_lock);
@@ -3901,18 +4449,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
+ struct fib6_nh *nh;
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ return 0;
+
+ nh = rt->fib6_nh;
if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
- rt->fib6_nh.fib_nh_gw_family &&
- ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
+ nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
return -1;
- }
/* Further clean up cached routes in exception table.
* This is needed because cached route may have a different
* gateway than its 'parent' in the case of an ip redirect.
*/
- rt6_exceptions_clean_tohost(rt, gateway);
+ fib6_nh_exceptions_clean_tohost(nh, gateway);
return 0;
}
@@ -3950,11 +4502,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
return NULL;
}
+/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
- if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
- (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
- ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
+ if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
+ (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+ ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
return true;
return false;
@@ -3966,11 +4519,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
int total = 0;
if (!rt6_is_dead(rt))
- total += rt->fib6_nh.fib_nh_weight;
+ total += rt->fib6_nh->fib_nh_weight;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
- total += iter->fib6_nh.fib_nh_weight;
+ total += iter->fib6_nh->fib_nh_weight;
}
return total;
@@ -3981,11 +4534,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
int upper_bound = -1;
if (!rt6_is_dead(rt)) {
- *weight += rt->fib6_nh.fib_nh_weight;
+ *weight += rt->fib6_nh->fib_nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
total) - 1;
}
- atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
+ atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@@ -4028,9 +4581,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
const struct arg_netdev_event *arg = p_arg;
struct net *net = dev_net(arg->dev);
- if (rt != net->ipv6.fib6_null_entry &&
- rt->fib6_nh.fib_nh_dev == arg->dev) {
- rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
+ if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
+ rt->fib6_nh->fib_nh_dev == arg->dev) {
+ rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
fib6_update_sernum_upto_root(net, rt);
rt6_multipath_rebalance(rt);
}
@@ -4053,15 +4606,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
+/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
const struct net_device *dev)
{
struct fib6_info *iter;
- if (rt->fib6_nh.fib_nh_dev == dev)
+ if (rt->fib6_nh->fib_nh_dev == dev)
return true;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == dev)
+ if (iter->fib6_nh->fib_nh_dev == dev)
return true;
return false;
@@ -4082,12 +4636,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
struct fib6_info *iter;
unsigned int dead = 0;
- if (rt->fib6_nh.fib_nh_dev == down_dev ||
- rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh->fib_nh_dev == down_dev ||
+ rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == down_dev ||
- iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
+ if (iter->fib6_nh->fib_nh_dev == down_dev ||
+ iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
return dead;
@@ -4099,11 +4653,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
{
struct fib6_info *iter;
- if (rt->fib6_nh.fib_nh_dev == dev)
- rt->fib6_nh.fib_nh_flags |= nh_flags;
+ if (rt->fib6_nh->fib_nh_dev == dev)
+ rt->fib6_nh->fib_nh_flags |= nh_flags;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.fib_nh_dev == dev)
- iter->fib6_nh.fib_nh_flags |= nh_flags;
+ if (iter->fib6_nh->fib_nh_dev == dev)
+ iter->fib6_nh->fib_nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
@@ -4113,17 +4667,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
const struct net_device *dev = arg->dev;
struct net *net = dev_net(dev);
- if (rt == net->ipv6.fib6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry || rt->nh)
return 0;
switch (arg->event) {
case NETDEV_UNREGISTER:
- return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
if (!rt->fib6_nsiblings)
- return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
@@ -4139,10 +4693,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
}
return -2;
case NETDEV_CHANGE:
- if (rt->fib6_nh.fib_nh_dev != dev ||
+ if (rt->fib6_nh->fib_nh_dev != dev ||
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
- rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
+ rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break;
}
@@ -4176,9 +4730,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event)
struct rt6_mtu_change_arg {
struct net_device *dev;
unsigned int mtu;
+ struct fib6_info *f6i;
};
-static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
+{
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
+ struct fib6_info *f6i = arg->f6i;
+
+ /* For administrative MTU increase, there is no way to discover
+ * IPv6 PMTU increase, so PMTU increase should be updated here.
+ * Since RFC 1981 doesn't include administrative MTU increase
+ * update PMTU increase is a MUST. (i.e. jumbo frame)
+ */
+ if (nh->fib_nh_dev == arg->dev) {
+ struct inet6_dev *idev = __in6_dev_get(arg->dev);
+ u32 mtu = f6i->fib6_pmtu;
+
+ if (mtu >= arg->mtu ||
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+ fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
+
+ spin_lock_bh(&rt6_exception_lock);
+ rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
+ spin_unlock_bh(&rt6_exception_lock);
+ }
+
+ return 0;
+}
+
+static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@@ -4193,24 +4774,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
if (!idev)
return 0;
- /* For administrative MTU increase, there is no way to discover
- IPv6 PMTU increase, so PMTU increase should be updated here.
- Since RFC 1981 doesn't include administrative MTU increase
- update PMTU increase is a MUST. (i.e. jumbo frame)
- */
- if (rt->fib6_nh.fib_nh_dev == arg->dev &&
- !fib6_metric_locked(rt, RTAX_MTU)) {
- u32 mtu = rt->fib6_pmtu;
-
- if (mtu >= arg->mtu ||
- (mtu < arg->mtu && mtu == idev->cnf.mtu6))
- fib6_metric_set(rt, RTAX_MTU, arg->mtu);
+ if (fib6_metric_locked(f6i, RTAX_MTU))
+ return 0;
- spin_lock_bh(&rt6_exception_lock);
- rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
- spin_unlock_bh(&rt6_exception_lock);
+ arg->f6i = f6i;
+ if (f6i->nh) {
+ /* fib6_nh_mtu_change only returns 0, so this is safe */
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
+ arg);
}
- return 0;
+
+ return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
@@ -4224,6 +4798,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
}
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
[RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
@@ -4241,6 +4816,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4287,6 +4863,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
+ if (tb[RTA_NH_ID]) {
+ if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
+ tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ goto errout;
+ }
+ cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+ }
+
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
@@ -4430,6 +5016,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
+ enum fib_event_type event_type;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
struct fib6_info *rt;
@@ -4489,7 +5076,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup;
}
- rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
+ rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
rt, &r_cfg);
@@ -4501,12 +5088,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rtnh = rtnh_next(rtnh, &remaining);
}
+ if (list_empty(&rt6_nh_list)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
/* for add and replace send one notification with all nexthops.
* Skip the notification in fib6_add_rt2node and send one with
* the full route when done
*/
info->skip_notify = 1;
+ /* For add and replace, send one notification with all nexthops. For
+ * append, send one notification with all appended nexthops.
+ */
+ info->skip_notify_kernel = 1;
+
err_nh = NULL;
list_for_each_entry(nh, &rt6_nh_list, next) {
err = __ip6_ins_rt(nh->fib6_info, info, extack);
@@ -4543,6 +5141,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nhn++;
}
+ event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD;
+ err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type,
+ rt_notif, nhn - 1, extack);
+ if (err) {
+ /* Delete all the siblings that were just added */
+ err_nh = NULL;
+ goto add_errout;
+ }
+
/* success ... tell user about new route */
ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
goto cleanup;
@@ -4621,6 +5228,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ if (cfg.fc_nh_id &&
+ !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
+
if (cfg.fc_mp)
return ip6_route_multipath_del(&cfg, extack);
else {
@@ -4648,17 +5261,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
-static size_t rt6_nlmsg_size(struct fib6_info *rt)
+/* add the overhead of this fib6_nh to nexthop_len */
+static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
- int nexthop_len = 0;
+ int *nexthop_len = arg;
- if (rt->fib6_nsiblings) {
- nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
- + NLA_ALIGN(sizeof(struct rtnexthop))
- + nla_total_size(16) /* RTA_GATEWAY */
- + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
+ *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16); /* RTA_GATEWAY */
- nexthop_len *= rt->fib6_nsiblings;
+ if (nh->fib_nh_lws) {
+ /* RTA_ENCAP_TYPE */
+ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+ /* RTA_ENCAP */
+ *nexthop_len += nla_total_size(2);
+ }
+
+ return 0;
+}
+
+static size_t rt6_nlmsg_size(struct fib6_info *f6i)
+{
+ int nexthop_len;
+
+ if (f6i->nh) {
+ nexthop_len = nla_total_size(4); /* RTA_NH_ID */
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
+ &nexthop_len);
+ } else {
+ struct fib6_nh *nh = f6i->fib6_nh;
+
+ nexthop_len = 0;
+ if (f6i->fib6_nsiblings) {
+ nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16) /* RTA_GATEWAY */
+ + lwtunnel_get_encap_size(nh->fib_nh_lws);
+
+ nexthop_len *= f6i->fib6_nsiblings;
+ }
+ nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
}
return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4674,10 +5316,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
- + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
+ nexthop_len;
}
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+ unsigned char *flags)
+{
+ if (nexthop_is_multipath(nh)) {
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (nexthop_mpath_fill_node(skb, nh))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, mp);
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = nexthop_fib6_nh(nh);
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+ flags, false) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
@@ -4687,6 +5357,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct rt6_info *rt6 = (struct rt6_info *)dst;
struct rt6key *rt6_dst, *rt6_src;
u32 *pmetrics, table, rt6_flags;
+ unsigned char nh_flags = 0;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
long expires = 0;
@@ -4794,22 +5465,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (!mp)
goto nla_put_failure;
- if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
- rt->fib6_nh.fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
+ rt->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings) {
- if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
- sibling->fib6_nh.fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
+ sibling->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, mp);
- } else {
- unsigned char nh_flags = 0;
+ } else if (rt->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+ goto nla_put_failure;
- if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
+ if (nexthop_is_blackhole(rt->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+
+ if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+ goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
+ } else {
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
&nh_flags, false) < 0)
goto nla_put_failure;
@@ -4836,10 +5516,28 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ if (nh->fib_nh_dev == dev)
+ return 1;
+
+ return 0;
+}
+
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
const struct net_device *dev)
{
- if (f6i->fib6_nh.fib_nh_dev == dev)
+ if (f6i->nh) {
+ struct net_device *_dev = (struct net_device *)dev;
+
+ return !!nexthop_for_each_fib6_nh(f6i->nh,
+ fib6_info_nh_uses_dev,
+ _dev);
+ }
+
+ if (f6i->fib6_nh->fib_nh_dev == dev)
return true;
if (f6i->fib6_nsiblings) {
@@ -4847,7 +5545,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
list_for_each_entry_safe(sibling, next_sibling,
&f6i->fib6_siblings, fib6_siblings) {
- if (sibling->fib6_nh.fib_nh_dev == dev)
+ if (sibling->fib6_nh->fib_nh_dev == dev)
return true;
}
}
@@ -4855,33 +5553,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
return false;
}
-int rt6_dump_route(struct fib6_info *rt, void *p_arg)
+struct fib6_nh_exception_dump_walker {
+ struct rt6_rtnl_dump_arg *dump;
+ struct fib6_info *rt;
+ unsigned int flags;
+ unsigned int skip;
+ unsigned int count;
+};
+
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_nh_exception_dump_walker *w = arg;
+ struct rt6_rtnl_dump_arg *dump = w->dump;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i, err;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+ if (!bucket)
+ return 0;
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (w->skip) {
+ w->skip--;
+ continue;
+ }
+
+ /* Expiration of entries doesn't bump sernum, insertion
+ * does. Removal is triggered by insertion, so we can
+ * rely on the fact that if entries change between two
+ * partial dumps, this node is scanned again completely,
+ * see rt6_insert_exception() and fib6_dump_table().
+ *
+ * Count expired entries we go through as handled
+ * entries that we'll skip next time, in case of partial
+ * node dump. Otherwise, if entries expire meanwhile,
+ * we'll skip the wrong amount.
+ */
+ if (rt6_check_expired(rt6_ex->rt6i)) {
+ w->count++;
+ continue;
+ }
+
+ err = rt6_fill_node(dump->net, dump->skb, w->rt,
+ &rt6_ex->rt6i->dst, NULL, NULL, 0,
+ RTM_NEWROUTE,
+ NETLINK_CB(dump->cb->skb).portid,
+ dump->cb->nlh->nlmsg_seq, w->flags);
+ if (err)
+ return err;
+
+ w->count++;
+ }
+ bucket++;
+ }
+
+ return 0;
+}
+
+/* Return -1 if done with node, number of handled routes on partial dump */
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
struct fib_dump_filter *filter = &arg->filter;
unsigned int flags = NLM_F_MULTI;
struct net *net = arg->net;
+ int count = 0;
if (rt == net->ipv6.fib6_null_entry)
- return 0;
+ return -1;
if ((filter->flags & RTM_F_PREFIX) &&
!(rt->fib6_flags & RTF_PREFIX_RT)) {
/* success since this is not a prefix route */
- return 1;
+ return -1;
}
- if (filter->filter_set) {
- if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
- (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
- (filter->protocol && rt->fib6_protocol != filter->protocol)) {
- return 1;
- }
+ if (filter->filter_set &&
+ ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+ (filter->protocol && rt->fib6_protocol != filter->protocol))) {
+ return -1;
+ }
+
+ if (filter->filter_set ||
+ !filter->dump_routes || !filter->dump_exceptions) {
flags |= NLM_F_DUMP_FILTERED;
}
- return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
- RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
- arg->cb->nlh->nlmsg_seq, flags);
+ if (filter->dump_routes) {
+ if (skip) {
+ skip--;
+ } else {
+ if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
+ 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).portid,
+ arg->cb->nlh->nlmsg_seq, flags)) {
+ return 0;
+ }
+ count++;
+ }
+ }
+
+ if (filter->dump_exceptions) {
+ struct fib6_nh_exception_dump_walker w = { .dump = arg,
+ .rt = rt,
+ .flags = flags,
+ .skip = skip,
+ .count = 0 };
+ int err;
+
+ rcu_read_lock();
+ if (rt->nh) {
+ err = nexthop_for_each_fib6_nh(rt->nh,
+ rt6_nh_dump_exceptions,
+ &w);
+ } else {
+ err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
+ }
+ rcu_read_unlock();
+
+ if (err)
+ return count += w.count;
+ }
+
+ return -1;
}
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
@@ -5126,6 +5922,38 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+ struct nl_info *info)
+{
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ /* call_fib6_entry_notifiers will be removed when in-kernel notifier
+ * is implemented and supported for nexthop objects
+ */
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
+
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
static int ip6_route_dev_notify(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -5136,7 +5964,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
if (event == NETDEV_REGISTER) {
- net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
+ net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5330,11 +6158,11 @@ static int __net_init ip6_route_net_init(struct net *net)
if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
goto out_ip6_dst_ops;
- net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
- sizeof(*net->ipv6.fib6_null_entry),
- GFP_KERNEL);
+ net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
if (!net->ipv6.fib6_null_entry)
goto out_ip6_dst_entries;
+ memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+ sizeof(*net->ipv6.fib6_null_entry));
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
@@ -5344,6 +6172,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_has_custom_rules = false;
@@ -5355,6 +6184,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -5364,6 +6194,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#endif
net->ipv6.sysctl.flush_delay = 0;
@@ -5471,7 +6302,7 @@ void __init ip6_route_init_special_entries(void)
/* Registering of the loopback is done before this portion of code,
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
- init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
+ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e15cd37024fd..dc4c91e0bfb8 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -23,6 +23,7 @@
static int zero;
static int one = 1;
+static int flowlabel_reflect_max = 0x7;
static int auto_flowlabels_min;
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
@@ -113,7 +114,9 @@ static struct ctl_table ipv6_table_template[] = {
.data = &init_net.ipv6.sysctl.flowlabel_reflect,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &flowlabel_reflect_max,
},
{
.procname = "max_dst_opts_number",
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7a14ea37d2df..d56a9019a0fe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
fl6_sock_release(flowlabel);
}
@@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_oif = oif;
}
- if (sk)
- mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ mark = inet_twsk(sk)->tw_mark;
+ /* autoflowlabel relies on buff->hash */
+ skb_set_hash(buff, inet_twsk(sk)->tw_txhash,
+ PKT_HASH_TYPE_L4);
+ } else {
+ mark = sk->sk_mark;
+ }
+ buff->tstamp = tcp_transmit_time(sk);
+ }
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
@@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u32 seq = 0, ack_seq = 0;
struct tcp_md5sig_key *key = NULL;
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
unsigned char newhash[16];
int genhash;
struct sock *sk1 = NULL;
#endif
+ __be32 label = 0;
+ struct net *net;
int oif = 0;
if (th->rst)
@@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (!sk && !ipv6_unicast_destination(skb))
return;
+ net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
@@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
+ sk1 = inet6_lookup_listener(net,
&tcp_hashinfo, NULL, 0,
&ipv6h->saddr,
th->source, &ipv6h->daddr,
@@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk))
trace_tcp_send_reset(sk, skb);
+ if (sk->sk_state == TCP_TIME_WAIT)
+ label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ } else {
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
+ label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
+ label);
#ifdef CONFIG_TCP_MD5SIG
out:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 70b01bd95022..827fe7385078 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -54,16 +54,6 @@
#include <trace/events/skb.h>
#include "udp_impl.h"
-static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
-{
-#if defined(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
- return true;
-#endif
- return false;
-}
-
static u32 udp6_ehashfn(const struct net *net,
const struct in6_addr *laddr,
const u16 lport,
@@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk)
static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
int score;
struct inet_sock *inet;
@@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net,
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
- struct udp_hslot *hslot2, struct sk_buff *skb)
+ int dif, int sdif, struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness;
@@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ daddr, hnum, dif, sdif);
if (score > badness) {
if (sk->sk_reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
@@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
struct sock *result;
- bool exact_dif = udp6_lib_exact_dif_match(net, skb);
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif,
+ daddr, hnum, dif, sdif,
hslot2, skb);
if (!result) {
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
@@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
&in6addr_any, hnum, dif, sdif,
- exact_dif, hslot2,
- skb);
+ hslot2, skb);
}
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
@@ -838,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- ip6_compute_pseudo);
+ skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
ret = udpv6_queue_rcv_skb(sk, skb);
@@ -1332,7 +1319,7 @@ do_udp_sendmsg:
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -1384,7 +1371,7 @@ do_udp_sendmsg:
}
if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 5bdca3d5d6b7..78daadecbdef 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,137 +21,6 @@
#include <net/ipv6.h>
#include <net/addrconf.h>
-static void
-__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
- const struct flowi6 *fl6 = &fl->u.ip6;
-
- /* Initialize temporary selector matching only
- * to current session. */
- *(struct in6_addr *)&sel->daddr = fl6->daddr;
- *(struct in6_addr *)&sel->saddr = fl6->saddr;
- sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
- sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
- sel->sport_mask = htons(0xffff);
- sel->family = AF_INET6;
- sel->prefixlen_d = 128;
- sel->prefixlen_s = 128;
- sel->proto = fl6->flowi6_proto;
- sel->ifindex = fl6->flowi6_oif;
-}
-
-static void
-xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
- const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
- x->id = tmpl->id;
- if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
- memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
- memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
- if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
- memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
- x->props.mode = tmpl->mode;
- x->props.reqid = tmpl->reqid;
- x->props.family = AF_INET6;
-}
-
-/* distribution counting sort function for xfrm_state and xfrm_tmpl */
-static int
-__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
-{
- int count[XFRM_MAX_DEPTH] = { };
- int class[XFRM_MAX_DEPTH];
- int i;
-
- for (i = 0; i < n; i++) {
- int c;
- class[i] = c = cmp(src[i]);
- count[c]++;
- }
-
- for (i = 2; i < maxclass; i++)
- count[i] += count[i - 1];
-
- for (i = 0; i < n; i++) {
- dst[count[class[i] - 1]++] = src[i];
- src[i] = NULL;
- }
-
- return 0;
-}
-
-/*
- * Rule for xfrm_state:
- *
- * rule 1: select IPsec transport except AH
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec transport AH
- * rule 4: select IPsec tunnel
- * rule 5: others
- */
-static int __xfrm6_state_sort_cmp(void *p)
-{
- struct xfrm_state *v = p;
-
- switch (v->props.mode) {
- case XFRM_MODE_TRANSPORT:
- if (v->id.proto != IPPROTO_AH)
- return 1;
- else
- return 3;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case XFRM_MODE_ROUTEOPTIMIZATION:
- case XFRM_MODE_IN_TRIGGER:
- return 2;
-#endif
- case XFRM_MODE_TUNNEL:
- case XFRM_MODE_BEET:
- return 4;
- }
- return 5;
-}
-
-static int
-__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
-{
- return __xfrm6_sort((void **)dst, (void **)src, n,
- __xfrm6_state_sort_cmp, 6);
-}
-
-/*
- * Rule for xfrm_tmpl:
- *
- * rule 1: select IPsec transport
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec tunnel
- * rule 4: others
- */
-static int __xfrm6_tmpl_sort_cmp(void *p)
-{
- struct xfrm_tmpl *v = p;
- switch (v->mode) {
- case XFRM_MODE_TRANSPORT:
- return 1;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case XFRM_MODE_ROUTEOPTIMIZATION:
- case XFRM_MODE_IN_TRIGGER:
- return 2;
-#endif
- case XFRM_MODE_TUNNEL:
- case XFRM_MODE_BEET:
- return 3;
- }
- return 4;
-}
-
-static int
-__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
-{
- return __xfrm6_sort((void **)dst, (void **)src, n,
- __xfrm6_tmpl_sort_cmp, 5);
-}
-
int xfrm6_extract_header(struct sk_buff *skb)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -171,12 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb)
static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.family = AF_INET6,
.proto = IPPROTO_IPV6,
- .eth_proto = htons(ETH_P_IPV6),
- .owner = THIS_MODULE,
- .init_tempsel = __xfrm6_init_tempsel,
- .init_temprop = xfrm6_init_temprop,
- .tmpl_sort = __xfrm6_tmpl_sort,
- .state_sort = __xfrm6_state_sort,
.output = xfrm6_output,
.output_finish = xfrm6_output_finish,
.extract_input = xfrm6_extract_input,