aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig7
-rw-r--r--net/bridge/br_if.c1
-rw-r--r--net/bridge/br_mdb.c9
-rw-r--r--net/bridge/br_multicast.c34
-rw-r--r--net/bridge/br_private.h7
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/dev.c36
-rw-r--r--net/core/dst.c84
-rw-r--r--net/core/fib_rules.c24
-rw-r--r--net/core/filter.c65
-rw-r--r--net/core/lwtunnel.c235
-rw-r--r--net/core/net-sysfs.c14
-rw-r--r--net/core/rtnetlink.c42
-rw-r--r--net/core/timestamping.c6
-rw-r--r--net/ipv4/arp.c65
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/fib_semantics.c96
-rw-r--r--net/ipv4/icmp.c1
-rw-r--r--net/ipv4/inet_hashtables.c38
-rw-r--r--net/ipv4/inet_timewait_sock.c55
-rw-r--r--net/ipv4/ip_fragment.c26
-rw-r--r--net/ipv4/ip_input.c3
-rw-r--r--net/ipv4/ip_tunnel_core.c130
-rw-r--r--net/ipv4/ping.c3
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/route.c31
-rw-r--r--net/ipv4/tcp_bic.c2
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_cong.c6
-rw-r--r--net/ipv4/tcp_cubic.c4
-rw-r--r--net/ipv4/tcp_highspeed.c2
-rw-r--r--net/ipv4/tcp_htcp.c2
-rw-r--r--net/ipv4/tcp_hybla.c2
-rw-r--r--net/ipv4/tcp_illinois.c2
-rw-r--r--net/ipv4/tcp_input.c64
-rw-r--r--net/ipv4/tcp_ipv4.c3
-rw-r--r--net/ipv4/tcp_metrics.c2
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_output.c6
-rw-r--r--net/ipv4/tcp_scalable.c2
-rw-r--r--net/ipv4/tcp_timer.c1
-rw-r--r--net/ipv4/tcp_vegas.c6
-rw-r--r--net/ipv4/tcp_veno.c2
-rw-r--r--net/ipv6/Kconfig11
-rw-r--r--net/ipv6/addrconf.c223
-rw-r--r--net/ipv6/af_inet6.c7
-rw-r--r--net/ipv6/datagram.c8
-rw-r--r--net/ipv6/exthdrs.c2
-rw-r--r--net/ipv6/inet6_hashtables.c9
-rw-r--r--net/ipv6/ip6_fib.c2
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/raw.c3
-rw-r--r--net/ipv6/route.c34
-rw-r--r--net/ipv6/sysctl_net_ipv6.c8
-rw-r--r--net/ipv6/tcp_ipv6.c3
-rw-r--r--net/mpls/Kconfig8
-rw-r--r--net/mpls/Makefile1
-rw-r--r--net/mpls/af_mpls.c78
-rw-r--r--net/mpls/internal.h9
-rw-r--r--net/mpls/mpls_iptunnel.c233
-rw-r--r--net/netfilter/nft_meta.c4
-rw-r--r--net/netfilter/xt_TPROXY.c6
-rw-r--r--net/openvswitch/Kconfig12
-rw-r--r--net/openvswitch/Makefile1
-rw-r--r--net/openvswitch/actions.c12
-rw-r--r--net/openvswitch/datapath.c19
-rw-r--r--net/openvswitch/datapath.h5
-rw-r--r--net/openvswitch/dp_notify.c5
-rw-r--r--net/openvswitch/flow.c4
-rw-r--r--net/openvswitch/flow.h79
-rw-r--r--net/openvswitch/flow_netlink.c84
-rw-r--r--net/openvswitch/flow_netlink.h3
-rw-r--r--net/openvswitch/flow_table.c4
-rw-r--r--net/openvswitch/vport-geneve.c17
-rw-r--r--net/openvswitch/vport-gre.c16
-rw-r--r--net/openvswitch/vport-internal_dev.c38
-rw-r--r--net/openvswitch/vport-netdev.c289
-rw-r--r--net/openvswitch/vport-netdev.h13
-rw-r--r--net/openvswitch/vport-vxlan.c322
-rw-r--r--net/openvswitch/vport-vxlan.h11
-rw-r--r--net/openvswitch/vport.c34
-rw-r--r--net/openvswitch/vport.h21
-rw-r--r--net/sched/act_api.c44
-rw-r--r--net/sched/act_bpf.c2
-rw-r--r--net/sched/act_connmark.c3
-rw-r--r--net/sched/act_csum.c3
-rw-r--r--net/sched/act_gact.c44
-rw-r--r--net/sched/act_ipt.c2
-rw-r--r--net/sched/act_mirred.c58
-rw-r--r--net/sched/act_nat.c3
-rw-r--r--net/sched/act_pedit.c3
-rw-r--r--net/sched/act_simple.c3
-rw-r--r--net/sched/act_skbedit.c3
-rw-r--r--net/sched/act_vlan.c3
-rw-r--r--net/sched/cls_cgroup.c23
-rw-r--r--net/sched/sch_qfq.c1
-rw-r--r--net/sctp/protocol.c42
-rw-r--r--net/sctp/sm_statefuns.c2
-rw-r--r--net/switchdev/switchdev.c111
-rw-r--r--net/tipc/bcast.c31
-rw-r--r--net/tipc/bcast.h1
-rw-r--r--net/tipc/bearer.c26
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/core.h5
-rw-r--r--net/tipc/discover.c20
-rw-r--r--net/tipc/link.c1517
-rw-r--r--net/tipc/link.h74
-rw-r--r--net/tipc/msg.h53
-rw-r--r--net/tipc/name_distr.c6
-rw-r--r--net/tipc/node.c549
-rw-r--r--net/tipc/node.h93
-rw-r--r--net/tipc/socket.c71
-rw-r--r--net/xfrm/xfrm_user.c2
113 files changed, 3509 insertions, 2071 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 57a7c5af3175..7021c1bf44d6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -374,6 +374,13 @@ source "net/caif/Kconfig"
source "net/ceph/Kconfig"
source "net/nfc/Kconfig"
+config LWTUNNEL
+ bool "Network light weight tunnels"
+ ---help---
+ This feature provides an infrastructure to support light weight
+ tunnels like mpls. There is no netdevice associated with a light
+ weight tunnel endpoint. Tunnel encapsulation parameters are stored
+ with light weight tunnel state associated with fib routes.
endif # if NET
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index a538cb1199a3..45e4757c6fd2 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -281,6 +281,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
br_fdb_delete_by_port(br, NULL, 0, 1);
br_vlan_flush(br);
+ br_multicast_dev_del(br);
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 1198a3dbad95..6a591e6e2d04 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
memset(&e, 0, sizeof(e));
e.ifindex = port->dev->ifindex;
e.state = p->state;
+ e.vid = p->addr.vid;
if (p->addr.proto == htons(ETH_P_IP))
e.addr.u.ip4 = p->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
@@ -230,7 +231,7 @@ errout:
}
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type)
+ struct br_ip *group, int type, u8 state)
{
struct br_mdb_entry entry;
@@ -241,6 +242,8 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
#if IS_ENABLED(CONFIG_IPV6)
entry.addr.u.ip6 = group->u.ip6;
#endif
+ entry.state = state;
+ entry.vid = group->vid;
__br_mdb_notify(dev, &entry, type);
}
@@ -263,6 +266,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
return false;
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
return false;
+ if (entry->vid >= VLAN_VID_MASK)
+ return false;
return true;
}
@@ -374,6 +379,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
return -EINVAL;
memset(&ip, 0, sizeof(ip));
+ ip.vid = entry->vid;
ip.proto = entry->addr.proto;
if (ip.proto == htons(ETH_P_IP))
ip.u.ip4 = entry->addr.u.ip4;
@@ -421,6 +427,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
return -EINVAL;
memset(&ip, 0, sizeof(ip));
+ ip.vid = entry->vid;
ip.proto = entry->addr.proto;
if (ip.proto == htons(ETH_P_IP))
ip.u.ip4 = entry->addr.u.ip4;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 79db489cdade..ed5dc684a4ce 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -283,6 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br,
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
+ br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
+ p->state);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
if (!mp->ports && !mp->mglist &&
@@ -704,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br,
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
+ br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY);
found:
mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -924,6 +926,15 @@ void br_multicast_add_port(struct net_bridge_port *port)
void br_multicast_del_port(struct net_bridge_port *port)
{
+ struct net_bridge *br = port->br;
+ struct net_bridge_port_group *pg;
+ struct hlist_node *n;
+
+ /* Take care of the remaining groups, only perm ones should be left */
+ spin_lock_bh(&br->multicast_lock);
+ hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
+ br_multicast_del_pg(br, pg);
+ spin_unlock_bh(&br->multicast_lock);
del_timer_sync(&port->multicast_router_timer);
}
@@ -963,7 +974,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_lock(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- br_multicast_del_pg(br, pg);
+ if (pg->state == MDB_TEMPORARY)
+ br_multicast_del_pg(br, pg);
if (!hlist_unhashed(&port->rlist))
hlist_del_init_rcu(&port->rlist);
@@ -1462,8 +1474,9 @@ br_multicast_leave_group(struct net_bridge *br,
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
+ br_mdb_notify(br->dev, port, group, RTM_DELMDB,
+ p->state);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB);
if (!mp->ports && !mp->mglist &&
netif_running(br->dev))
@@ -1752,12 +1765,6 @@ void br_multicast_open(struct net_bridge *br)
void br_multicast_stop(struct net_bridge *br)
{
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- struct hlist_node *n;
- u32 ver;
- int i;
-
del_timer_sync(&br->multicast_router_timer);
del_timer_sync(&br->ip4_other_query.timer);
del_timer_sync(&br->ip4_own_query.timer);
@@ -1765,6 +1772,15 @@ void br_multicast_stop(struct net_bridge *br)
del_timer_sync(&br->ip6_other_query.timer);
del_timer_sync(&br->ip6_own_query.timer);
#endif
+}
+
+void br_multicast_dev_del(struct net_bridge *br)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct hlist_node *n;
+ u32 ver;
+ int i;
spin_lock_bh(&br->multicast_lock);
mdb = mlock_dereference(br->mdb, br);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8b21146b24a0..3ad1290528af 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -466,6 +466,7 @@ void br_multicast_disable_port(struct net_bridge_port *port);
void br_multicast_init(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
+void br_multicast_dev_del(struct net_bridge *br);
void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb);
void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
@@ -488,7 +489,7 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
void br_mdb_init(void);
void br_mdb_uninit(void);
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type);
+ struct br_ip *group, int type, u8 state);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -565,6 +566,10 @@ static inline void br_multicast_stop(struct net_bridge *br)
{
}
+static inline void br_multicast_dev_del(struct net_bridge *br)
+{
+}
+
static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb)
{
diff --git a/net/core/Makefile b/net/core/Makefile
index fec0856dd6c0..086b01fbe1bd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
diff --git a/net/core/dev.c b/net/core/dev.c
index a8e4dd430285..cb52cba30ae8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
+#ifdef CONFIG_NET_SWITCHDEV
+ /* Don't forward if offload device already forwarded */
+ if (skb->offload_fwd_mark &&
+ skb->offload_fwd_mark == dev->offload_fwd_mark) {
+ consume_skb(skb);
+ rc = NET_XMIT_SUCCESS;
+ goto out;
+ }
+#endif
+
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -3645,7 +3655,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
qdisc_skb_cb(skb)->pkt_len = skb->len;
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
- qdisc_bstats_update_cpu(cl->q, skb);
+ qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res)) {
case TC_ACT_OK:
@@ -3653,7 +3663,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
- qdisc_qstats_drop_cpu(cl->q);
+ qdisc_qstats_cpu_drop(cl->q);
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
kfree_skb(skb);
@@ -6075,6 +6085,26 @@ int dev_get_phys_port_name(struct net_device *dev,
EXPORT_SYMBOL(dev_get_phys_port_name);
/**
+ * dev_change_proto_down - update protocol port state information
+ * @dev: device
+ * @proto_down: new value
+ *
+ * This info can be used by switch drivers to set the phys state of the
+ * port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_proto_down)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
+
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
@@ -7639,7 +7669,7 @@ static int __init net_dev_init(void)
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
- dst_init();
+ dst_subsys_init();
rc = 0;
out:
return rc;
diff --git a/net/core/dst.c b/net/core/dst.c
index 002144bea935..76a617f6d60a 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
/*
* Theory of operations:
@@ -158,19 +159,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = {
[RTAX_MAX] = 0xdeadbeef,
};
-
-void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+void dst_init(struct dst_entry *dst, struct dst_ops *ops,
+ struct net_device *dev, int initial_ref, int initial_obsolete,
+ unsigned short flags)
{
- struct dst_entry *dst;
-
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
- if (ops->gc(ops))
- return NULL;
- }
- dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
- if (!dst)
- return NULL;
dst->child = NULL;
dst->dev = dev;
if (dev)
@@ -200,6 +192,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
+}
+EXPORT_SYMBOL(dst_init);
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
+{
+ struct dst_entry *dst;
+
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops))
+ return NULL;
+ }
+
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
+
+ dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+
return dst;
}
EXPORT_SYMBOL(dst_alloc);
@@ -248,7 +259,11 @@ again:
dst->ops->destroy(dst);
if (dst->dev)
dev_put(dst->dev);
- kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ if (dst->flags & DST_METADATA)
+ kfree(dst);
+ else
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst) {
@@ -329,6 +344,47 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+static struct dst_ops md_dst_ops = {
+ .family = AF_UNSPEC,
+};
+
+static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call output on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dst_md_discard(struct sk_buff *skb)
+{
+ WARN_ONCE(1, "Attempting to call input on metadata dst\n");
+ kfree_skb(skb);
+ return 0;
+}
+
+struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+{
+ struct metadata_dst *md_dst;
+ struct dst_entry *dst;
+
+ md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ if (!md_dst)
+ return ERR_PTR(-ENOMEM);
+
+ dst = &md_dst->dst;
+ dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+ DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+
+ dst->input = dst_md_discard;
+ dst->output = dst_md_discard_sk;
+
+ memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+ md_dst->opts_len = optslen;
+
+ return md_dst;
+}
+EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
@@ -393,7 +449,7 @@ static struct notifier_block dst_dev_notifier = {
.priority = -10, /* must be called after other network notifiers */
};
-void __init dst_init(void)
+void __init dst_subsys_init(void)
{
register_netdevice_notifier(&dst_dev_notifier);
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 9a12668f7d62..ae8306e7c56f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -16,6 +16,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
@@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
+ if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (tb[FRA_FWMASK])
rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ if (tb[FRA_TUN_ID])
+ rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+
rule->action = frh->action;
rule->flags = frh->flags;
rule->table = frh_get_table(frh, tb);
@@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (unresolved)
ops->unresolved_rules++;
+ if (rule->tun_id)
+ ip_tunnel_need_metadata();
+
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
@@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
(rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
continue;
+ if (tb[FRA_TUN_ID] &&
+ (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
goto errout;
}
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
+
list_del_rcu(&rule->list);
if (rule->action == FR_ACT_GOTO) {
@@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
- + nla_total_size(4); /* FRA_FWMASK */
+ + nla_total_size(4) /* FRA_FWMASK */
+ + nla_total_size(8); /* FRA_TUN_ID */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
((rule->mark_mask || rule->mark) &&
nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
(rule->target &&
- nla_put_u32(skb, FRA_GOTO, rule->target)))
+ nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+ (rule->tun_id &&
+ nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index be3098fb65e4..786722a9c6f2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -47,6 +47,7 @@
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
+#include <net/cls_cgroup.h>
/**
* sk_filter - run a packet through a socket filter
@@ -1424,6 +1425,64 @@ const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
+static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return task_get_classid((struct sk_buff *) (unsigned long) r1);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
+ .func = bpf_get_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ __be16 vlan_proto = (__force __be16) r2;
+
+ if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+ vlan_proto != htons(ETH_P_8021AD)))
+ vlan_proto = htons(ETH_P_8021Q);
+
+ return skb_vlan_push(skb, vlan_proto, vlan_tci);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+ .func = bpf_skb_vlan_push,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
+
+static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+
+ return skb_vlan_pop(skb);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+ .func = bpf_skb_vlan_pop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
+
+bool bpf_helper_changes_skb_data(void *func)
+{
+ if (func == bpf_skb_vlan_push)
+ return true;
+ if (func == bpf_skb_vlan_pop)
+ return true;
+ return false;
+}
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -1461,6 +1520,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_clone_redirect:
return &bpf_clone_redirect_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_skb_vlan_push:
+ return &bpf_skb_vlan_push_proto;
+ case BPF_FUNC_skb_vlan_pop:
+ return &bpf_skb_vlan_pop_proto;
default:
return sk_filter_func_proto(func_id);
}
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 000000000000..bb58826c708d
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,235 @@
+/*
+ * lwtunnel Infrastructure for light weight tunnels like mpls
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+#include <net/ip6_fib.h>
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+ struct lwtunnel_state *lws;
+
+ lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+ return lws;
+}
+EXPORT_SYMBOL(lwtunnel_state_alloc);
+
+const struct lwtunnel_encap_ops __rcu *
+ lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ return !cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+ unsigned int encap_type)
+{
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+ &lwtun_encaps[encap_type],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+ struct nlattr *encap, struct lwtunnel_state **lws)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE ||
+ encap_type > LWTUNNEL_ENCAP_MAX)
+ return ret;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ if (likely(ops && ops->build_state))
+ ret = ops->build_state(dev, encap, lws);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ struct nlattr *nest;
+ int ret = -EINVAL;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ nest = nla_nest_start(skb, RTA_ENCAP);
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->fill_encap))
+ ret = ops->fill_encap(skb, lwtstate);
+ rcu_read_unlock();
+
+ if (ret)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+ if (ret)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+
+ return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!lwtstate)
+ return 0;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->get_encap_size))
+ ret = nla_total_size(ops->get_encap_size(lwtstate));
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = 0;
+
+ if (!a && !b)
+ return 0;
+
+ if (!a || !b)
+ return 1;
+
+ if (a->type != b->type)
+ return 1;
+
+ if (a->type == LWTUNNEL_ENCAP_NONE ||
+ a->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[a->type]);
+ if (likely(ops && ops->cmp_encap))
+ ret = ops->cmp_encap(a, b);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_cmp_encap);
+
+int __lwtunnel_output(struct sock *sk, struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ const struct lwtunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (!lwtstate)
+ goto drop;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->output))
+ ret = ops->output(sk, skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree(skb);
+
+ return ret;
+}
+
+int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
+{
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct lwtunnel_state *lwtstate = NULL;
+
+ if (rt)
+ lwtstate = rt->rt6i_lwtstate;
+
+ return __lwtunnel_output(sk, skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_output6);
+
+int lwtunnel_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable *)skb_dst(skb);
+ struct lwtunnel_state *lwtstate = NULL;
+
+ if (rt)
+ lwtstate = rt->rt_lwtstate;
+
+ return __lwtunnel_output(sk, skb, lwtstate);
+}
+EXPORT_SYMBOL(lwtunnel_output);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 18b34d771ed4..194c1d03b2b3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
NETDEVICE_SHOW(group, fmt_dec);
static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+ return dev_change_proto_down(dev, (bool) proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
+ &dev_attr_proto_down.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dc004b1e1f85..788ceed39463 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -896,7 +896,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
- + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(1); /* IFLA_PROTO_DOWN */
+
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1082,7 +1084,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
(dev->ifalias &&
nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
- atomic_read(&dev->carrier_changes)))
+ atomic_read(&dev->carrier_changes)) ||
+ nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
goto nla_put_failure;
if (1) {
@@ -1319,6 +1322,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
+ [IFLA_PROTO_DOWN] = { .type = NLA_U8 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1861,6 +1865,14 @@ static int do_setlink(const struct sk_buff *skb,
}
err = 0;
+ if (tb[IFLA_PROTO_DOWN]) {
+ err = dev_change_proto_down(dev,
+ nla_get_u8(tb[IFLA_PROTO_DOWN]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
errout:
if (status & DO_SETLINK_MODIFIED) {
if (status & DO_SETLINK_NOTIFY)
@@ -1951,16 +1963,30 @@ static int rtnl_group_dellink(const struct net *net, int group)
return 0;
}
+int rtnl_delete_link(struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops;
+ LIST_HEAD(list_kill);
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_delete_link);
+
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
- const struct rtnl_link_ops *ops;
struct net_device *dev;
struct ifinfomsg *ifm;
char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
int err;
- LIST_HEAD(list_kill);
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err < 0)
@@ -1982,13 +2008,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!dev)
return -ENODEV;
- ops = dev->rtnl_link_ops;
- if (!ops || !ops->dellink)
- return -EOPNOTSUPP;
-
- ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
- return 0;
+ return rtnl_delete_link(dev);
}
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 43d3dd62fcc8..42689d5c468c 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
struct phy_device *phydev;
unsigned int type;
+ if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
+ return false;
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
+
__skb_push(skb, ETH_HLEN);
- type = classify(skb);
+ type = ptp_classify_raw(skb);
__skb_pull(skb, ETH_HLEN);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 933a92820d26..1d59e50ce8b7 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -291,6 +291,40 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
kfree_skb(skb);
}
+/* Create and send an arp packet. */
+static void arp_send_dst(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw, struct sk_buff *oskb)
+{
+ struct sk_buff *skb;
+
+ /* arp on this interface. */
+ if (dev->flags & IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ if (oskb)
+ skb_dst_copy(skb, oskb);
+
+ arp_xmit(skb);
+}
+
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
+ target_hw, NULL);
+}
+EXPORT_SYMBOL(arp_send);
+
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
@@ -346,8 +380,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
}
- arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_hw, dev->dev_addr, NULL);
+ arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL,
+ dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb);
}
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -597,32 +632,6 @@ void arp_xmit(struct sk_buff *skb)
EXPORT_SYMBOL(arp_xmit);
/*
- * Create and send an arp packet.
- */
-void arp_send(int type, int ptype, __be32 dest_ip,
- struct net_device *dev, __be32 src_ip,
- const unsigned char *dest_hw, const unsigned char *src_hw,
- const unsigned char *target_hw)
-{
- struct sk_buff *skb;
-
- /*
- * No arp on this interface.
- */
-
- if (dev->flags&IFF_NOARP)
- return;
-
- skb = arp_create(type, ptype, dest_ip, dev, src_ip,
- dest_hw, src_hw, target_hw);
- if (!skb)
- return;
-
- arp_xmit(skb);
-}
-EXPORT_SYMBOL(arp_send);
-
-/*
* Process an arp request.
*/
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6bbc54940eb4..6b98de0d7949 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,6 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ fl4.flowi4_tun_key.tun_id = 0;
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
@@ -313,6 +314,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.saddr = dst;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_tun_key.tun_id = 0;
no_addr = idev->ifa_list == NULL;
@@ -591,6 +593,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_FLOW] = { .type = NLA_U32 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +660,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_TABLE:
cfg->fc_table = nla_get_u32(attr);
break;
+ case RTA_ENCAP:
+ cfg->fc_encap = attr;
+ break;
+ case RTA_ENCAP_TYPE:
+ cfg->fc_encap_type = nla_get_u16(attr);
+ break;
}
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c7358ea4ae93..6754c64b2fe0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
#include "fib_lookup.h"
@@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
dev_put(nexthop_nh->nh_dev);
+ lwtunnel_state_put(nexthop_nh->nh_lwtstate);
free_nh_exceptions(nexthop_nh);
rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
+ lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
return -1;
onh++;
@@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
if (fi->fib_nhs) {
+ size_t nh_encapsize = 0;
/* Also handles the special case fib_nhs == 1 */
/* each nexthop is packed in an attribute */
@@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
+ /* grab encap info */
+ for_nexthops(fi) {
+ if (nh->nh_lwtstate) {
+ /* RTA_ENCAP_TYPE */
+ nh_encapsize += lwtunnel_get_encap_size(
+ nh->nh_lwtstate);
+ /* RTA_ENCAP */
+ nh_encapsize += nla_total_size(2);
+ }
+ } endfor_nexthops(fi);
+
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
+ payload += nla_total_size((fi->fib_nhs * nhsize) +
+ nh_encapsize);
+
}
return payload;
@@ -452,6 +469,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ int ret;
+
change_nexthops(fi) {
int attrlen;
@@ -475,18 +495,66 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (nexthop_nh->nh_tclassid)
fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ if (nla) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ struct nlattr *nla_entype;
+
+ nla_entype = nla_find(attrs, attrlen,
+ RTA_ENCAP_TYPE);
+ if (!nla_entype)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ ret = lwtunnel_build_state(dev, nla_get_u16(
+ nla_entype),
+ nla, &lwtstate);
+ if (ret)
+ goto errout;
+ lwtunnel_state_get(lwtstate);
+ nexthop_nh->nh_lwtstate = lwtstate;
+ }
}
rtnh = rtnh_next(rtnh, &remaining);
} endfor_nexthops(fi);
return 0;
+
+err_inval:
+ ret = -EINVAL;
+
+errout:
+ return ret;
}
#endif
+int fib_encap_match(struct net *net, u16 encap_type,
+ struct nlattr *encap,
+ int oif, const struct fib_nh *nh)
+{
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ int ret;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (oif)
+ dev = __dev_get_by_index(net, oif);
+ ret = lwtunnel_build_state(dev, encap_type,
+ encap, &lwtstate);
+ if (!ret)
+ return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+
+ return 0;
+}
+
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -496,6 +564,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
return 1;
if (cfg->fc_oif || cfg->fc_gw) {
+ if (cfg->fc_encap) {
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, cfg->fc_oif,
+ fi->fib_nh))
+ return 1;
+ }
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
(!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
return 0;
@@ -882,6 +956,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
} else {
struct fib_nh *nh = fi->fib_nh;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+
+ if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, &lwtstate);
+ if (err)
+ goto failure;
+
+ lwtunnel_state_get(lwtstate);
+ nh->nh_lwtstate = lwtstate;
+ }
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
@@ -1055,6 +1145,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
+ if (fi->fib_nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1090,6 +1182,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
+ if (nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate);
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f5203fba6236..c0556f1e4bf0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -496,6 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
}
/* Ugh! */
orefdst = skb_in->_skb_refdst; /* save old refdst */
+ skb_dst_set(skb_in, NULL);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
RT_TOS(tos), rt2->dst.dev);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 0cb9165421d4..89120196a949 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -343,7 +343,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -371,21 +370,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
@@ -403,13 +398,12 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
- int twrefcnt = 0;
WARN_ON(!sk_unhashed(sk));
@@ -420,23 +414,22 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
spin_lock(lock);
__sk_nulls_add_node_rcu(sk, list);
- if (tw) {
- WARN_ON(sk->sk_hash != tw->tw_hash);
- twrefcnt = inet_twsk_unhash(tw);
+ if (osk) {
+ WARN_ON(sk->sk_hash != osk->sk_hash);
+ sk_nulls_del_node_init_rcu(osk);
}
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+void __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
if (sk->sk_state != TCP_LISTEN)
- return __inet_hash_nolisten(sk, tw);
+ return __inet_hash_nolisten(sk, osk);
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -445,7 +438,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
- return 0;
}
EXPORT_SYMBOL(__inet_hash);
@@ -492,7 +484,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk);
- int twrefcnt = 1;
if (!snum) {
int i, remaining, low, high, port;
@@ -560,19 +551,14 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- twrefcnt += __inet_hash_nolisten(sk, tw);
+ __inet_hash_nolisten(sk, (struct sock *)tw);
}
if (tw)
- twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+ inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
- if (tw) {
- inet_twsk_deschedule(tw);
- while (twrefcnt) {
- twrefcnt--;
- inet_twsk_put(tw);
- }
- }
+ if (tw)
+ inet_twsk_deschedule_put(tw);
ret = 0;
goto out;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2ffbd16b79e0..ae22cc24fbe8 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -18,28 +18,6 @@
/**
- * inet_twsk_unhash - unhash a timewait socket from established hash
- * @tw: timewait socket
- *
- * unhash a timewait socket from established hash, if hashed.
- * ehash lock must be held by caller.
- * Returns 1 if caller should call inet_twsk_put() after lock release.
- */
-int inet_twsk_unhash(struct inet_timewait_sock *tw)
-{
- if (hlist_nulls_unhashed(&tw->tw_node))
- return 0;
-
- hlist_nulls_del_rcu(&tw->tw_node);
- sk_nulls_node_init(&tw->tw_node);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
-}
-
-/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
* @tw: timewait socket
* @hashinfo: hashinfo pointer
@@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
* bind hash lock must be held by caller.
* Returns 1 if caller should call inet_twsk_put() after lock release.
*/
-int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
- return 0;
+ return;
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
+ __sock_put((struct sock *)tw);
}
/* Must be called with locally disabled BHs. */
static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
- struct inet_bind_hashbucket *bhead;
- int refcnt;
- /* Unlink from established hashes. */
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+ struct inet_bind_hashbucket *bhead;
spin_lock(lock);
- refcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
spin_unlock(lock);
/* Disassociate with bind bucket. */
@@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
- refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+ inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
- atomic_sub(refcnt, &tw->tw_refcnt);
atomic_dec(&tw->tw_dr->tw_count);
inet_twsk_put(tw);
}
@@ -235,13 +205,17 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
* tcp_input.c to verify this.
*/
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning : consume reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
if (del_timer_sync(&tw->tw_timer))
inet_twsk_kill(tw);
+ inet_twsk_put(tw);
}
-EXPORT_SYMBOL(inet_twsk_deschedule);
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
{
@@ -311,9 +285,8 @@ restart:
rcu_read_unlock();
local_bh_disable();
- inet_twsk_deschedule(tw);
+ inet_twsk_deschedule_put(tw);
local_bh_enable();
- inet_twsk_put(tw);
goto restart_rcu;
}
/* If the nulls value we got at the end of this lookup is
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 31f71b15cfba..f44bccc42494 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -522,7 +522,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
- int sum_truesize;
u8 ecn;
ipq_kill(qp);
@@ -590,32 +589,19 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
add_frag_mem_limit(&qp->q, clone->truesize);
}
+ skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- sum_truesize = head->truesize;
- for (fp = head->next; fp;) {
- bool headstolen;
- int delta;
- struct sk_buff *next = fp->next;
-
- sum_truesize += fp->truesize;
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
-
- if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
- kfree_skb_partial(fp, headstolen);
- } else {
- if (!skb_shinfo(head)->frag_list)
- skb_shinfo(head)->frag_list = fp;
- head->data_len += fp->len;
- head->len += fp->len;
- head->truesize += fp->truesize;
- }
- fp = next;
+ head->truesize += fp->truesize;
}
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(&qp->q, head->truesize);
head->next = NULL;
head->dev = dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2db4c8773c1b..f4fc8a77aaa7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -146,6 +146,7 @@
#include <net/xfrm.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <net/dst_metadata.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (!skb_dst(skb)) {
+ if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6a51a71a6c67..630e6d5712e8 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -32,6 +32,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
+#include <linux/static_key.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -190,3 +191,132 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = {
+ [IP_TUN_ID] = { .type = NLA_U64 },
+ [IP_TUN_DST] = { .type = NLA_U32 },
+ [IP_TUN_SRC] = { .type = NLA_U32 },
+ [IP_TUN_TTL] = { .type = NLA_U8 },
+ [IP_TUN_TOS] = { .type = NLA_U8 },
+ [IP_TUN_SPORT] = { .type = NLA_U16 },
+ [IP_TUN_DPORT] = { .type = NLA_U16 },
+ [IP_TUN_FLAGS] = { .type = NLA_U16 },
+};
+
+static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+ struct lwtunnel_state **ts)
+{
+ struct ip_tunnel_info *tun_info;
+ struct lwtunnel_state *new_state;
+ struct nlattr *tb[IP_TUN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy);
+ if (err < 0)
+ return err;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ if (!new_state)
+ return -ENOMEM;
+
+ new_state->type = LWTUNNEL_ENCAP_IP;
+
+ tun_info = lwt_tun_info(new_state);
+
+ if (tb[IP_TUN_ID])
+ tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]);
+
+ if (tb[IP_TUN_DST])
+ tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]);
+
+ if (tb[IP_TUN_SRC])
+ tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]);
+
+ if (tb[IP_TUN_TTL])
+ tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]);
+
+ if (tb[IP_TUN_TOS])
+ tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]);
+
+ if (tb[IP_TUN_SPORT])
+ tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]);
+
+ if (tb[IP_TUN_DPORT])
+ tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]);
+
+ if (tb[IP_TUN_FLAGS])
+ tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]);
+
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->options = NULL;
+ tun_info->options_len = 0;
+
+ *ts = new_state;
+
+ return 0;
+}
+
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) ||
+ nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) ||
+ nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) ||
+ nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) ||
+ nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) ||
+ nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) ||
+ nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) ||
+ nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size(8) /* IP_TUN_ID */
+ + nla_total_size(4) /* IP_TUN_DST */
+ + nla_total_size(4) /* IP_TUN_SRC */
+ + nla_total_size(1) /* IP_TUN_TOS */
+ + nla_total_size(1) /* IP_TUN_TTL */
+ + nla_total_size(2) /* IP_TUN_SPORT */
+ + nla_total_size(2) /* IP_TUN_DPORT */
+ + nla_total_size(2); /* IP_TUN_FLAGS */
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
+ .build_state = ip_tun_build_state,
+ .fill_encap = ip_tun_fill_encap_info,
+ .get_encap_size = ip_tun_encap_nlsize,
+};
+
+static int __init ip_tunnel_core_init(void)
+{
+ lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+
+ return 0;
+}
+module_init(ip_tunnel_core_init);
+
+static void __exit ip_tunnel_core_exit(void)
+{
+ lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+}
+module_exit(ip_tunnel_core_exit);
+
+struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+ static_key_slow_inc(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+ static_key_slow_dec(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 05ff44b758df..e89094ab5ddb 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(isk->freebind || isk->transparent || has_addr ||
+ if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
+ isk->freebind || isk->transparent || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index da5d483e236a..3abd9d7a3adf 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -300,6 +300,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+ SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+ SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d0362a2de3d3..519ec232818d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -102,6 +103,7 @@
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
+#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
@@ -109,6 +111,7 @@
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -1355,6 +1358,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
list_del(&rt->rt_uncached);
spin_unlock_bh(&ul->lock);
}
+ lwtunnel_state_put(rt->rt_lwtstate);
}
void rt_flush_dev(struct net_device *dev)
@@ -1403,6 +1407,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
+ if (nh->nh_lwtstate) {
+ lwtunnel_state_get(nh->nh_lwtstate);
+ rt->rt_lwtstate = nh->nh_lwtstate;
+ } else {
+ rt->rt_lwtstate = NULL;
+ }
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr);
else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1488,6 +1498,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ rth->rt_lwtstate = NULL;
if (our) {
rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
@@ -1546,7 +1557,6 @@ static int __mkroute_input(struct sk_buff *skb,
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned int flags = 0;
bool do_cache;
u32 itag = 0;
@@ -1610,7 +1620,7 @@ static int __mkroute_input(struct sk_buff *skb,
}
rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
- rth->rt_flags = flags;
+ rth->rt_flags = 0;
rth->rt_type = res->type;
rth->rt_is_input = 1;
rth->rt_iif = 0;
@@ -1618,12 +1628,15 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ rth->rt_lwtstate = NULL;
RT_CACHE_STAT_INC(in_slow_tot);
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ if (lwtunnel_output_redirect(rth->rt_lwtstate))
+ rth->dst.output = lwtunnel_output;
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1662,6 +1675,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct ip_tunnel_info *tun_info;
struct flowi4 fl4;
unsigned int flags = 0;
u32 itag = 0;
@@ -1679,6 +1693,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
+ tun_info = skb_tunnel_info(skb, AF_INET);
+ if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+ fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+ else
+ fl4.flowi4_tun_key.tun_id = 0;
+ skb_dst_drop(skb);
+
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
@@ -1792,6 +1813,8 @@ local_input:
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
+ rth->rt_lwtstate = NULL;
+
RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
@@ -1981,7 +2004,7 @@ add:
rth->rt_gateway = 0;
rth->rt_uses_gateway = 0;
INIT_LIST_HEAD(&rth->rt_uncached);
-
+ rth->rt_lwtstate = NULL;
RT_CACHE_STAT_INC(out_slow_tot);
if (flags & RTCF_LOCAL)
@@ -2261,7 +2284,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
-
+ rt->rt_lwtstate = NULL;
dst_free(new);
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index c037644eafb7..fd1405d37c14 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 8c6fd3d5e40f..167b6a3e1b98 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -264,7 +264,7 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
u32 prior_snd_cwnd;
u32 incr;
- if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect)
+ if (tcp_in_slow_start(tp) && hystart_detect)
tcp_cdg_hystart_update(sk);
if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 84be008c945c..a2ed23c595cf 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -365,10 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
*/
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- u32 cwnd = tp->snd_cwnd + acked;
+ u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
- if (cwnd > tp->snd_ssthresh)
- cwnd = tp->snd_ssthresh + 1;
acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
@@ -413,7 +411,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 06d3d665a9fd..28011fb1f4a2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
@@ -439,7 +439,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+ if (hystart && tcp_in_slow_start(tp) &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 882c08aae2f5..db7842495a64 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 58469fff6c18..82f0d9ed60f5 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index f963b274f2b0..083831e359df 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rho_fractions = ca->rho_3ls - (ca->rho << 3);
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/*
* slow start
* INC = 2^RHO - 1
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index f71002e4db0b..2ab9bbb6faff 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In slow start */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 728f5b3d3c64..4e4d6bcd0ca9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -196,11 +197,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline bool tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
- return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -1037,7 +1040,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
* highest SACK block). Also calculate the lowest snd_nxt among the remaining
* retransmitted skbs to avoid some costly processing per ACKs.
*/
-static void tcp_mark_lost_retrans(struct sock *sk)
+static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -1078,7 +1081,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
if (after(received_upto, ack_seq)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
-
+ *flag |= FLAG_LOST_RETRANS;
tcp_skb_mark_lost_uncond_verify(tp, skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
} else {
@@ -1818,7 +1821,7 @@ advance_sp:
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk);
+ tcp_mark_lost_retrans(sk, &state->flag);
tcp_verify_left_out(tp);
out:
@@ -2474,15 +2477,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
return false;
}
-/* The cwnd reduction in CWR and Recovery use the PRR algorithm
- * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
* It computes the number of packets to send (sndcnt) based on packets newly
* delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
- * 2) If packets in flight is lower than ssthresh (such as due to excess
- * losses and/or application stalls), do not perform any further cwnd
- * reductions, but instead slow start up to ssthresh.
+ * 2) Otherwise PRR uses packet conservation to send as much as delivered.
+ * But when the retransmits are acked without further losses, PRR
+ * slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
{
@@ -2499,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
}
static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit)
+ int fast_rexmit, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2508,16 +2510,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
(tp->packets_out - tp->sacked_out);
tp->prr_delivered += newly_acked_sacked;
- if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else {
+ } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+ !(flag & FLAG_LOST_RETRANS)) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
+ } else {
+ sndcnt = min(delta, newly_acked_sacked);
}
-
sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
@@ -2578,7 +2582,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
} else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2588,6 +2592,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2607,6 +2612,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2675,7 +2681,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!tcp_in_cwnd_reduction(sk)) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tcp_init_cwnd_reduction(sk);
@@ -2735,7 +2741,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked)
+ const int prior_unsacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2751,7 +2757,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* mark more packets lost or retransmit more.
*/
if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
return true;
}
@@ -2838,7 +2844,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2851,9 +2857,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ !(flag & FLAG_LOST_RETRANS))
return;
- /* Fall through to processing in Open state. */
+ /* Change state if cwnd is undone or retransmits are lost */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2888,7 +2895,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
tcp_xmit_retransmit_queue(sk);
}
@@ -3562,10 +3569,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
&sack_state);
acked -= tp->packets_out;
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
@@ -3574,6 +3577,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3947,7 +3954,6 @@ void tcp_reset(struct sock *sk)
static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct dst_entry *dst;
inet_csk_schedule_ack(sk);
@@ -3959,9 +3965,7 @@ static void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- dst = __sk_dst_get(sk);
- if (!dst || !dst_metric(dst, RTAX_QUICKACK))
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d7d4c2b79cf2..486ba96ae91a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1683,8 +1683,7 @@ do_time_wait:
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
goto process;
}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index a51d63a43e33..b3d64f61d922 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -461,7 +461,7 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
tp->snd_cwnd);
}
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ } else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4bc00cb79e60..6d8795b066ac 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->fin ||
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_RST;
}
@@ -198,8 +197,7 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
}
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1c218df2c85..71057849593a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -163,7 +163,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
- const struct dst_entry *dst = __sk_dst_get(sk);
if (sysctl_tcp_slow_start_after_idle &&
(!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
@@ -174,9 +173,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
- (!dst || !dst_metric(dst, RTAX_QUICKACK)))
- icsk->icsk_ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
/* Account for an ACK we sent. */
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 333bcb2415ff..bf5ea9e9bbc1 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5b752f58a900..7149ebc820c7 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -649,4 +649,3 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6cea1d5e20d..13951c4087d4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
- if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (diff > gamma && tcp_in_slow_start(tp)) {
/* Going too fast. Time to slow down
* and switch to congestion avoidance.
*/
@@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
- } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ } else if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
@@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
vegas->minRTT = 0x7fffffff;
}
/* Use normal slow start */
- else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ else if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 112151eeee45..0d094b995cd9 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 438a73aa777c..643f61339e7b 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -5,16 +5,15 @@
# IPv6 as module will cause a CRASH if you try to unload it
menuconfig IPV6
tristate "The IPv6 protocol"
- default m
+ default y
---help---
- This is complemental support for the IP version 6.
- You will still be able to do traditional IPv4 networking as well.
+ Support for IP version 6 (IPv6).
For general information about IPv6, see
<https://en.wikipedia.org/wiki/IPv6>.
- For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
- For specific information about IPv6 under Linux, read the HOWTO at
- <http://www.bieringer.de/linux/IPv6/>.
+ For specific information about IPv6 under Linux, see
+ Documentation/networking/ipv6.txt and read the HOWTO at
+ <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
To compile this protocol support as a module, choose M here: the
module will be called ipv6.
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 21c2c818df3b..eb0c6a3a8a00 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_ra_mtu = 1,
.stable_secret = {
.initialized = false,
- }
+ },
+ .use_oif_addrs_only = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.stable_secret = {
.initialized = false,
},
+ .use_oif_addrs_only = 0,
};
/* Check if a valid qdisc is available */
@@ -1358,15 +1360,96 @@ out:
return ret;
}
+static int __ipv6_dev_get_saddr(struct net *net,
+ struct ipv6_saddr_dst *dst,
+ struct inet6_dev *idev,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+ int i;
+
+ /*
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense, unless it is also flagged as optimistic.
+ * - Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
+ */
+ if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+ (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+ continue;
+
+ score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+ if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+ score->addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+ idev->dev->name);
+ continue;
+ }
+
+ score->rule = -1;
+ bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+ for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+ int minihiscore, miniscore;
+
+ minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
+ miniscore = ipv6_get_saddr_eval(net, score, dst, i);
+
+ if (minihiscore > miniscore) {
+ if (i == IPV6_SADDR_RULE_SCOPE &&
+ score->scopedist > 0) {
+ /*
+ * special case:
+ * each remaining entry
+ * has too small (not enough)
+ * scope, because ifa entries
+ * are sorted by their scope
+ * values.
+ */
+ goto out;
+ }
+ break;
+ } else if (minihiscore < miniscore) {
+ if (hiscore->ifa)
+ in6_ifa_put(hiscore->ifa);
+
+ in6_ifa_hold(score->ifa);
+
+ swap(hiscore, score);
+ hiscore_idx = 1 - hiscore_idx;
+
+ /* restore our iterator */
+ score->ifa = hiscore->ifa;
+
+ break;
+ }
+ }
+ }
+out:
+ read_unlock_bh(&idev->lock);
+ return hiscore_idx;
+}
+
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
{
- struct ipv6_saddr_score scores[2],
- *score = &scores[0], *hiscore = &scores[1];
+ struct ipv6_saddr_score scores[2], *hiscore;
struct ipv6_saddr_dst dst;
+ struct inet6_dev *idev;
struct net_device *dev;
int dst_type;
+ bool use_oif_addr = false;
+ int hiscore_idx = 0;
dst_type = __ipv6_addr_type(daddr);
dst.addr = daddr;
@@ -1375,105 +1458,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
dst.prefs = prefs;
- hiscore->rule = -1;
- hiscore->ifa = NULL;
+ scores[hiscore_idx].rule = -1;
+ scores[hiscore_idx].ifa = NULL;
rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- struct inet6_dev *idev;
-
- /* Candidate Source Address (section 4)
- * - multicast and link-local destination address,
- * the set of candidate source address MUST only
- * include addresses assigned to interfaces
- * belonging to the same link as the outgoing
- * interface.
- * (- For site-local destination addresses, the
- * set of candidate source addresses MUST only
- * include addresses assigned to interfaces
- * belonging to the same site as the outgoing
- * interface.)
- */
- if (((dst_type & IPV6_ADDR_MULTICAST) ||
- dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
- dst.ifindex && dev->ifindex != dst.ifindex)
- continue;
-
- idev = __in6_dev_get(dev);
- if (!idev)
- continue;
-
- read_lock_bh(&idev->lock);
- list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
- int i;
-
- /*
- * - Tentative Address (RFC2462 section 5.4)
- * - A tentative address is not considered
- * "assigned to an interface" in the traditional
- * sense, unless it is also flagged as optimistic.
- * - Candidate Source Address (section 4)
- * - In any case, anycast addresses, multicast
- * addresses, and the unspecified address MUST
- * NOT be included in a candidate set.
- */
- if ((score->ifa->flags & IFA_F_TENTATIVE) &&
- (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
- continue;
-
- score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+ /* Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ * - "It is RECOMMENDED that the candidate source addresses
+ * be the set of unicast addresses assigned to the
+ * interface that will be used to send to the destination
+ * (the 'outgoing' interface)." (RFC 6724)
+ */
+ if (dst_dev) {
+ idev = __in6_dev_get(dst_dev);
+ if ((dst_type & IPV6_ADDR_MULTICAST) ||
+ dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+ (idev && idev->cnf.use_oif_addrs_only)) {
+ use_oif_addr = true;
+ }
+ }
- if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
- score->addr_type & IPV6_ADDR_MULTICAST)) {
- net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
- dev->name);
+ if (use_oif_addr) {
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
+ } else {
+ for_each_netdev_rcu(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
continue;
- }
-
- score->rule = -1;
- bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
-
- for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
- int minihiscore, miniscore;
-
- minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
- miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
-
- if (minihiscore > miniscore) {
- if (i == IPV6_SADDR_RULE_SCOPE &&
- score->scopedist > 0) {
- /*
- * special case:
- * each remaining entry
- * has too small (not enough)
- * scope, because ifa entries
- * are sorted by their scope
- * values.
- */
- goto try_nextdev;
- }
- break;
- } else if (minihiscore < miniscore) {
- if (hiscore->ifa)
- in6_ifa_put(hiscore->ifa);
-
- in6_ifa_hold(score->ifa);
-
- swap(hiscore, score);
-
- /* restore our iterator */
- score->ifa = hiscore->ifa;
-
- break;
- }
- }
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
-try_nextdev:
- read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
+ hiscore = &scores[hiscore_idx];
if (!hiscore->ifa)
return -EADDRNOTAVAIL;
@@ -4586,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
/* we omit DEVCONF_STABLE_SECRET for now */
+ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
}
static inline size_t inet6_ifla6_size(void)
@@ -5585,6 +5614,14 @@ static struct addrconf_sysctl_table
.proc_handler = addrconf_sysctl_stable_secret,
},
{
+ .procname = "use_oif_addrs_only",
+ .data = &ipv6_devconf.use_oif_addrs_only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
/* sentinel */
}
},
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7de52b65173f..7bc92ea4ae8f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -342,7 +342,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!(inet->freebind || inet->transparent) &&
+ if (!net->ipv6.sysctl.ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
!ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
@@ -679,8 +680,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct ipv6_pinfo *np = inet6_sk(sk);
if (np->rxopt.all) {
- if ((opt->hop && (np->rxopt.bits.hopopts ||
- np->rxopt.bits.ohopopts)) ||
+ if (((opt->flags & IP6SKB_HOPBYHOP) &&
+ (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
(ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
np->rxopt.bits.rxflow) ||
(opt->srcrt && (np->rxopt.bits.srcrt ||
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index b10a88986a98..2572a324b345 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -568,8 +568,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
}
/* HbH is allowed only once */
- if (np->rxopt.bits.hopopts && opt->hop) {
- u8 *ptr = nh + opt->hop;
+ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
}
@@ -630,8 +630,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
int hlim = ipv6_hdr(skb)->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
- if (np->rxopt.bits.ohopopts && opt->hop) {
- u8 *ptr = nh + opt->hop;
+ if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) {
+ u8 *ptr = nh + sizeof(struct ipv6hdr);
put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
}
if (np->rxopt.bits.odstopts && opt->dst0) {
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index a7bbbe45570b..ce203b0402be 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
return -1;
}
- opt->hop = sizeof(struct ipv6hdr);
+ opt->flags |= IP6SKB_HOPBYHOP;
if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
opt = IP6CB(skb);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b4fd96de97e6..6ac8dad0138a 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -207,7 +207,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -234,21 +233,17 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 55d19861ab20..d715f2e0c4e7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -32,6 +32,7 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
+#include <net/lwtunnel.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
static void rt6_release(struct rt6_info *rt)
{
if (atomic_dec_and_test(&rt->rt6i_ref)) {
+ lwtunnel_state_put(rt->rt6i_lwtstate);
rt6_free_pcpu(rt);
dst_free(&rt->dst);
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d5f7716662db..c5fc85286ef6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1023,6 +1023,8 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = dst->dev->ifindex;
return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ca4700cb26c4..fdbada1569a3 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* unspecified and mapped address have a v4 equivalent.
*/
v4addr = LOOPBACK4_IPV6;
- if (!(addr_type & IPV6_ADDR_MULTICAST)) {
+ if (!(addr_type & IPV6_ADDR_MULTICAST) &&
+ !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) {
err = -EADDRNOTAVAIL;
if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
dev, 0)) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6090969937f8..7f2214f8fde7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -58,6 +58,7 @@
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
#include <asm/uaccess.h>
@@ -1770,6 +1771,18 @@ int ip6_route_add(struct fib6_config *cfg)
rt->dst.output = ip6_output;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, &lwtstate);
+ if (err)
+ goto out;
+ lwtunnel_state_get(lwtstate);
+ rt->rt6i_lwtstate = lwtstate;
+ rt->dst.output = lwtunnel_output6;
+ }
+
ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->rt6i_dst.plen = cfg->fc_dst_len;
if (rt->rt6i_dst.plen == 128)
@@ -2595,6 +2608,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_PREF] = { .type = NLA_U8 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2689,6 +2704,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= RTF_PREF(pref);
}
+ if (tb[RTA_ENCAP])
+ cfg->fc_encap = tb[RTA_ENCAP];
+
+ if (tb[RTA_ENCAP_TYPE])
+ cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
err = 0;
errout:
return err;
@@ -2721,6 +2742,10 @@ beginning:
r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ r_cfg.fc_encap_type = nla_get_u16(nla);
}
err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
if (err) {
@@ -2783,7 +2808,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
return ip6_route_add(&cfg);
}
-static inline size_t rt6_nlmsg_size(void)
+static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
@@ -2797,7 +2822,8 @@ static inline size_t rt6_nlmsg_size(void)
+ RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
- + nla_total_size(1); /* RTA_PREF */
+ + nla_total_size(1) /* RTA_PREF */
+ + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
}
static int rt6_fill_node(struct net *net,
@@ -2945,6 +2971,8 @@ static int rt6_fill_node(struct net *net,
if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
goto nla_put_failure;
+ lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
+
nlmsg_end(skb, nlh);
return 0;
@@ -3071,7 +3099,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (!skb)
goto errout;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 4e705add4f18..db48aebd9c47 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -75,6 +75,13 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv6.sysctl.ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
@@ -117,6 +124,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
+ ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6748c4277aff..d540846a1a79 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1481,8 +1481,7 @@ do_time_wait:
ntohs(th->dest), tcp_v6_iif(skb));
if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
sk = sk2;
tcp_v6_restore_cb(skb);
goto process;
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index 17bde799c854..5c467ef97311 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -24,7 +24,13 @@ config NET_MPLS_GSO
config MPLS_ROUTING
tristate "MPLS: routing support"
- help
+ ---help---
Add support for forwarding of mpls packets.
+config MPLS_IPTUNNEL
+ tristate "MPLS: IP over MPLS tunnel support"
+ depends on LWTUNNEL && MPLS_ROUTING
+ ---help---
+ mpls ip tunnel support.
+
endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 65bbe68c72e6..9ca923625016 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -3,5 +3,6 @@
#
obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o
mpls_router-y := af_mpls.o
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 1f93a5978f2a..49f1b0e41bdf 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -15,6 +15,7 @@
#include <net/ip_fib.h>
#include <net/netevent.h>
#include <net/netns/generic.h>
+#include <net/ip6_route.h>
#include "internal.h"
#define LABEL_NOT_SPECIFIED (1<<20)
@@ -58,10 +59,11 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
return rcu_dereference_rtnl(dev->mpls_ptr);
}
-static bool mpls_output_possible(const struct net_device *dev)
+bool mpls_output_possible(const struct net_device *dev)
{
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
}
+EXPORT_SYMBOL_GPL(mpls_output_possible);
static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
{
@@ -69,13 +71,14 @@ static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
return rt->rt_labels * sizeof(struct mpls_shim_hdr);
}
-static unsigned int mpls_dev_mtu(const struct net_device *dev)
+unsigned int mpls_dev_mtu(const struct net_device *dev)
{
/* The amount of data the layer 2 frame can hold */
return dev->mtu;
}
+EXPORT_SYMBOL_GPL(mpls_dev_mtu);
-static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
@@ -85,6 +88,7 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
return true;
}
+EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
struct mpls_entry_decoded dec)
@@ -327,6 +331,70 @@ static unsigned find_free_label(struct net *net)
return LABEL_NOT_SPECIFIED;
}
+static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr)
+{
+ struct net_device *dev = NULL;
+ struct rtable *rt;
+ struct in_addr daddr;
+
+ memcpy(&daddr, addr, sizeof(struct in_addr));
+ rt = ip_route_output(net, daddr.s_addr, 0, 0, 0);
+ if (IS_ERR(rt))
+ goto errout;
+
+ dev = rt->dst.dev;
+ dev_hold(dev);
+
+ ip_rt_put(rt);
+
+errout:
+ return dev;
+}
+
+static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
+{
+ struct net_device *dev = NULL;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, addr, sizeof(struct in6_addr));
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error)
+ goto errout;
+
+ dev = dst->dev;
+ dev_hold(dev);
+
+errout:
+ dst_release(dst);
+
+ return dev;
+}
+
+static struct net_device *find_outdev(struct net *net,
+ struct mpls_route_config *cfg)
+{
+ struct net_device *dev = NULL;
+
+ if (!cfg->rc_ifindex) {
+ switch (cfg->rc_via_table) {
+ case NEIGH_ARP_TABLE:
+ dev = inet_fib_lookup_dev(net, cfg->rc_via);
+ break;
+ case NEIGH_ND_TABLE:
+ dev = inet6_fib_lookup_dev(net, cfg->rc_via);
+ break;
+ case NEIGH_LINK_TABLE:
+ break;
+ }
+ } else {
+ dev = dev_get_by_index(net, cfg->rc_ifindex);
+ }
+
+ return dev;
+}
+
static int mpls_route_add(struct mpls_route_config *cfg)
{
struct mpls_route __rcu **platform_label;
@@ -358,7 +426,7 @@ static int mpls_route_add(struct mpls_route_config *cfg)
goto errout;
err = -ENODEV;
- dev = dev_get_by_index(net, cfg->rc_ifindex);
+ dev = find_outdev(net, cfg);
if (!dev)
goto errout;
@@ -626,6 +694,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
return 0;
}
+EXPORT_SYMBOL_GPL(nla_put_labels);
int nla_get_labels(const struct nlattr *nla,
u32 max_labels, u32 *labels, u32 label[])
@@ -671,6 +740,7 @@ int nla_get_labels(const struct nlattr *nla,
*labels = nla_labels;
return 0;
}
+EXPORT_SYMBOL_GPL(nla_get_labels);
static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct mpls_route_config *cfg)
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 8cabeb5a1cb9..2681a4ba6c37 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -50,7 +50,12 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
return result;
}
-int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]);
-int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]);
+int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels,
+ const u32 label[]);
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
+ u32 label[]);
+bool mpls_output_possible(const struct net_device *dev);
+unsigned int mpls_dev_mtu(const struct net_device *dev);
+bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
#endif /* MPLS_INTERNAL_H */
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
new file mode 100644
index 000000000000..276f8c992218
--- /dev/null
+++ b/net/mpls/mpls_iptunnel.c
@@ -0,0 +1,233 @@
+/*
+ * mpls tunnels An implementation mpls tunnels using the light weight tunnel
+ * infrastructure
+ *
+ * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/mpls_iptunnel.h>
+#include <linux/mpls_iptunnel.h>
+#include "internal.h"
+
+static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
+ [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 },
+};
+
+static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
+{
+ /* The size of the layer 2.5 labels to be added for this route */
+ return en->labels * sizeof(struct mpls_shim_hdr);
+}
+
+int mpls_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+ struct mpls_shim_hdr *hdr;
+ struct net_device *out_dev;
+ unsigned int hh_len;
+ unsigned int new_header_size;
+ unsigned int mtu;
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = NULL;
+ struct rt6_info *rt6 = NULL;
+ struct lwtunnel_state *lwtstate = NULL;
+ int err = 0;
+ bool bos;
+ int i;
+ unsigned int ttl;
+
+ /* Obtain the ttl */
+ if (skb->protocol == htons(ETH_P_IP)) {
+ ttl = ip_hdr(skb)->ttl;
+ rt = (struct rtable *)dst;
+ lwtstate = rt->rt_lwtstate;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ ttl = ipv6_hdr(skb)->hop_limit;
+ rt6 = (struct rt6_info *)dst;
+ lwtstate = rt6->rt6i_lwtstate;
+ } else {
+ goto drop;
+ }
+
+ skb_orphan(skb);
+
+ /* Find the output device */
+ out_dev = dst->dev;
+ if (!mpls_output_possible(out_dev) ||
+ !lwtstate || skb_warn_if_lro(skb))
+ goto drop;
+
+ skb_forward_csum(skb);
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ /* Verify the destination can hold the packet */
+ new_header_size = mpls_encap_size(tun_encap_info);
+ mtu = mpls_dev_mtu(out_dev);
+ if (mpls_pkt_too_big(skb, mtu - new_header_size))
+ goto drop;
+
+ hh_len = LL_RESERVED_SPACE(out_dev);
+ if (!out_dev->header_ops)
+ hh_len = 0;
+
+ /* Ensure there is enough space for the headers in the skb */
+ if (skb_cow(skb, hh_len + new_header_size))
+ goto drop;
+
+ skb_push(skb, new_header_size);
+ skb_reset_network_header(skb);
+
+ skb->dev = out_dev;
+ skb->protocol = htons(ETH_P_MPLS_UC);
+
+ /* Push the new labels */
+ hdr = mpls_hdr(skb);
+ bos = true;
+ for (i = tun_encap_info->labels - 1; i >= 0; i--) {
+ hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
+ ttl, 0, bos);
+ bos = false;
+ }
+
+ if (rt)
+ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
+ skb);
+ else if (rt6)
+ err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
+ skb);
+ if (err)
+ net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+ __func__, err);
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+ struct lwtunnel_state **ts)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+ struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ int tun_encap_info_len;
+ int ret;
+
+ ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
+ mpls_iptunnel_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[MPLS_IPTUNNEL_DST])
+ return -EINVAL;
+
+ tun_encap_info_len = sizeof(*tun_encap_info);
+
+ newts = lwtunnel_state_alloc(tun_encap_info_len);
+ if (!newts)
+ return -ENOMEM;
+
+ newts->len = tun_encap_info_len;
+ tun_encap_info = mpls_lwtunnel_encap(newts);
+ ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+ &tun_encap_info->labels, tun_encap_info->label);
+ if (ret)
+ goto errout;
+ newts->type = LWTUNNEL_ENCAP_MPLS;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+
+errout:
+ kfree(newts);
+ *ts = NULL;
+
+ return ret;
+}
+
+static int mpls_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
+ tun_encap_info->label))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct mpls_iptunnel_encap *tun_encap_info;
+
+ tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+ return nla_total_size(tun_encap_info->labels * 4);
+}
+
+static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
+ struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
+ int l;
+
+ if (a_hdr->labels != b_hdr->labels)
+ return 1;
+
+ for (l = 0; l < MAX_NEW_LABELS; l++)
+ if (a_hdr->label[l] != b_hdr->label[l])
+ return 1;
+ return 0;
+}
+
+static const struct lwtunnel_encap_ops mpls_iptun_ops = {
+ .build_state = mpls_build_state,
+ .output = mpls_output,
+ .fill_encap = mpls_fill_encap_info,
+ .get_encap_size = mpls_encap_nlsize,
+ .cmp_encap = mpls_encap_cmp,
+};
+
+static int __init mpls_iptunnel_init(void)
+{
+ return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_init(mpls_iptunnel_init);
+
+static void __exit mpls_iptunnel_exit(void)
+{
+ lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_exit(mpls_iptunnel_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
+MODULE_LICENSE("GPL v2");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 52561e1c31e2..cb2f13ebb5a6 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -166,11 +166,13 @@ void nft_meta_get_eval(const struct nft_expr *expr,
goto err;
*dest = out->group;
break;
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
if (skb->sk == NULL || !sk_fullsock(skb->sk))
goto err;
*dest = skb->sk->sk_classid;
break;
+#endif
default:
WARN_ON(1);
goto err;
@@ -246,7 +248,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_CPU:
case NFT_META_IIFGROUP:
case NFT_META_OIFGROUP:
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
+#endif
len = sizeof(u32);
break;
case NFT_META_IIFNAME:
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index cca96cec1b68..d0c96c5ae29a 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -272,8 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
hp->source, lport ? lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -437,8 +436,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
tgi->lport ? tgi->lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 15840401a2ce..1119f46b80b4 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -44,18 +44,6 @@ config OPENVSWITCH_GRE
If unsure, say Y.
-config OPENVSWITCH_VXLAN
- tristate "Open vSwitch VXLAN tunneling support"
- depends on OPENVSWITCH
- depends on VXLAN
- default OPENVSWITCH
- ---help---
- If you say Y here, then the Open vSwitch will be able create vxlan vport.
-
- Say N to exclude this support and reduce the binary size.
-
- If unsure, say Y.
-
config OPENVSWITCH_GENEVE
tristate "Open vSwitch Geneve tunneling support"
depends on OPENVSWITCH
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 91b9478413ef..38e0e149c55e 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -16,5 +16,4 @@ openvswitch-y := \
vport-netdev.o
obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
-obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 8a8c0b8b4f63..cf04c2f8b32a 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -611,7 +611,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
const struct nlattr *actions, int actions_len)
{
- struct ovs_tunnel_info info;
+ struct ip_tunnel_info info;
struct dp_upcall_info upcall;
const struct nlattr *a;
int rem;
@@ -733,7 +733,15 @@ static int execute_set_action(struct sk_buff *skb,
{
/* Only tunnel set execution is supported without a mask. */
if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
- OVS_CB(skb)->egress_tun_info = nla_data(a);
+ struct ovs_tunnel_info *tun = nla_data(a);
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *)tun->tun_dst);
+ skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);
+
+ /* FIXME: Remove when all vports have been converted */
+ OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info;
+
return 0;
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ff8c4a4c1609..ffe984f5b95c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -176,7 +176,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
- return vport->ops->get_name(vport);
+ return ovs_vport_name(vport);
}
static int get_dpifindex(const struct datapath *dp)
@@ -188,7 +188,7 @@ static int get_dpifindex(const struct datapath *dp)
local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
- ifindex = netdev_vport_priv(local)->dev->ifindex;
+ ifindex = local->dev->ifindex;
else
ifindex = 0;
@@ -1018,7 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
ovs_unlock();
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
ovs_flow_free(new_flow, false);
}
@@ -1030,7 +1030,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
err_kfree_flow:
ovs_flow_free(new_flow, false);
error:
@@ -1157,7 +1157,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
if (reply)
ovs_notify(&dp_flow_genl_family, reply, info);
if (old_acts)
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
return 0;
@@ -1165,7 +1165,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
error:
return error;
}
@@ -1800,7 +1800,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
nla_put_string(skb, OVS_VPORT_ATTR_NAME,
- vport->ops->get_name(vport)))
+ ovs_vport_name(vport)))
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
@@ -2219,13 +2219,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
struct vport *vport;
hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (dev_net(netdev_vport->dev) == dnet)
+ if (dev_net(vport->dev) == dnet)
list_add(&vport->detach_list, head);
}
}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index cd691e935e08..6b28c5cedb23 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -25,6 +25,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/u64_stats_sync.h>
+#include <net/ip_tunnels.h>
#include "flow.h"
#include "flow_table.h"
@@ -98,7 +99,7 @@ struct datapath {
* when a packet is received by OVS.
*/
struct ovs_skb_cb {
- struct ovs_tunnel_info *egress_tun_info;
+ struct ip_tunnel_info *egress_tun_info;
struct vport *input_vport;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -114,7 +115,7 @@ struct ovs_skb_cb {
* @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
*/
struct dp_upcall_info {
- const struct ovs_tunnel_info *egress_tun_info;
+ const struct ip_tunnel_info *egress_tun_info;
const struct nlattr *userdata;
const struct nlattr *actions;
int actions_len;
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 2c631fe76be1..a7a80a6b77b0 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work)
struct hlist_node *n;
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
+ if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH))
dp_detach_port_notify(vport);
}
}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index bc7b0aba994a..8db22ef73626 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
return key_extract(skb, key);
}
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
/* Extract metadata from packet. */
if (tun_info) {
- memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
+ memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
if (tun_info->options) {
BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a076e445ccc2..b62cdb3e3589 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -32,31 +32,11 @@
#include <linux/time.h>
#include <linux/flex_array.h>
#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+#include <net/dst_metadata.h>
struct sk_buff;
-/* Used to memset ovs_key_ipv4_tunnel padding. */
-#define OVS_TUNNEL_KEY_SIZE \
- (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \
- FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst))
-
-struct ovs_key_ipv4_tunnel {
- __be64 tun_id;
- __be32 ipv4_src;
- __be32 ipv4_dst;
- __be16 tun_flags;
- u8 ipv4_tos;
- u8 ipv4_ttl;
- __be16 tp_src;
- __be16 tp_dst;
-} __packed __aligned(4); /* Minimize padding. */
-
-struct ovs_tunnel_info {
- struct ovs_key_ipv4_tunnel tunnel;
- const void *options;
- u8 options_len;
-};
-
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -66,54 +46,9 @@ struct ovs_tunnel_info {
#define TUN_METADATA_OPTS(flow_key, opt_len) \
((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
-static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- __be32 saddr, __be32 daddr,
- u8 tos, u8 ttl,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- tun_info->tunnel.tun_id = tun_id;
- tun_info->tunnel.ipv4_src = saddr;
- tun_info->tunnel.ipv4_dst = daddr;
- tun_info->tunnel.ipv4_tos = tos;
- tun_info->tunnel.ipv4_ttl = ttl;
- tun_info->tunnel.tun_flags = tun_flags;
-
- /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
- * the upper tunnel are used.
- * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
- */
- tun_info->tunnel.tp_src = tp_src;
- tun_info->tunnel.tp_dst = tp_dst;
-
- /* Clear struct padding. */
- if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE)
- memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE,
- 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
-
- tun_info->options = opts;
- tun_info->options_len = opts_len;
-}
-
-static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- const struct iphdr *iph,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
- iph->tos, iph->ttl,
- tp_src, tp_dst,
- tun_id, tun_flags,
- opts, opts_len);
-}
+struct ovs_tunnel_info {
+ struct metadata_dst *tun_dst;
+};
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
@@ -122,7 +57,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
struct sw_flow_key {
u8 tun_opts[255];
u8 tun_opts_len;
- struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
+ struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
u32 skb_mark; /* SKB mark. */
@@ -273,7 +208,7 @@ void ovs_flow_stats_clear(struct sw_flow *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb,
struct sw_flow_key *key);
/* Extract key from packet coming from userspace. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 624e41c4267f..a6eb77ab1a64 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -47,9 +47,9 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/mpls.h>
+#include <net/vxlan.h>
#include "flow_netlink.h"
-#include "vport-vxlan.h"
struct ovs_len_tbl {
int len;
@@ -475,7 +475,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
{
struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
unsigned long opt_key_offset;
- struct ovs_vxlan_opts opts;
+ struct vxlan_metadata opts;
int err;
BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
@@ -626,7 +626,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
static int vxlan_opt_to_nlattr(struct sk_buff *skb,
const void *tun_opts, int swkey_tun_opts_len)
{
- const struct ovs_vxlan_opts *opts = tun_opts;
+ const struct vxlan_metadata *opts = tun_opts;
struct nlattr *nla;
nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
@@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb,
}
static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
+ const struct ip_tunnel_key *output,
const void *tun_opts, int swkey_tun_opts_len)
{
if (output->tun_flags & TUNNEL_KEY &&
@@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
}
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
+ const struct ip_tunnel_key *output,
const void *tun_opts, int swkey_tun_opts_len)
{
struct nlattr *nla;
@@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
}
int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
- const struct ovs_tunnel_info *egress_tun_info)
+ const struct ip_tunnel_info *egress_tun_info)
{
- return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel,
+ return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
egress_tun_info->options,
egress_tun_info->options_len);
}
@@ -1548,11 +1548,48 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log)
return sfa;
}
+static void ovs_nla_free_set_action(const struct nlattr *a)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ struct ovs_tunnel_info *ovs_tun;
+
+ switch (nla_type(ovs_key)) {
+ case OVS_KEY_ATTR_TUNNEL_INFO:
+ ovs_tun = nla_data(ovs_key);
+ dst_release((struct dst_entry *)ovs_tun->tun_dst);
+ break;
+ }
+}
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ const struct nlattr *a;
+ int rem;
+
+ if (!sf_acts)
+ return;
+
+ nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_SET:
+ ovs_nla_free_set_action(a);
+ break;
+ }
+ }
+
+ kfree(sf_acts);
+}
+
+static void __ovs_nla_free_flow_actions(struct rcu_head *head)
+{
+ ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu));
+}
+
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
* The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts)
{
- kfree_rcu(sf_acts, rcu);
+ call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions);
}
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
@@ -1746,7 +1783,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
{
struct sw_flow_match match;
struct sw_flow_key key;
- struct ovs_tunnel_info *tun_info;
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *tun_info;
+ struct ovs_tunnel_info *ovs_tun;
struct nlattr *a;
int err = 0, start, opts_type;
@@ -1771,13 +1810,23 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (start < 0)
return start;
+ tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
+ if (!tun_dst)
+ return -ENOMEM;
+
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
- sizeof(*tun_info) + key.tun_opts_len, log);
- if (IS_ERR(a))
+ sizeof(*ovs_tun), log);
+ if (IS_ERR(a)) {
+ dst_release((struct dst_entry *)tun_dst);
return PTR_ERR(a);
+ }
+
+ ovs_tun = nla_data(a);
+ ovs_tun->tun_dst = tun_dst;
- tun_info = nla_data(a);
- tun_info->tunnel = key.tun_key;
+ tun_info = &tun_dst->u.tun_info;
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->key = key.tun_key;
tun_info->options_len = key.tun_opts_len;
if (tun_info->options_len) {
@@ -2177,7 +2226,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
key->eth.tci, log);
if (err)
- kfree(*sfa);
+ ovs_nla_free_flow_actions(*sfa);
return err;
}
@@ -2227,13 +2276,14 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
switch (key_type) {
case OVS_KEY_ATTR_TUNNEL_INFO: {
- struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+ struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key);
+ struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info;
start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
if (!start)
return -EMSGSIZE;
- err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+ err = ipv4_tun_to_nlattr(skb, &tun_info->key,
tun_info->options_len ?
tun_info->options : NULL,
tun_info->options_len);
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 5c3d75bff310..acd074408f0a 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key,
const struct nlattr *mask, bool log);
int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
- const struct ovs_tunnel_info *);
+ const struct ip_tunnel_info *);
bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
@@ -69,5 +69,6 @@ int ovs_nla_put_actions(const struct nlattr *attr,
int len, struct sk_buff *skb);
void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
#endif /* flow_netlink.h */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 65523948fb95..3a9d1dde76ed 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -18,6 +18,7 @@
#include "flow.h"
#include "datapath.h"
+#include "flow_netlink.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
@@ -143,7 +144,8 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
- kfree((struct sw_flow_actions __force *)flow->sf_acts);
+ if (flow->sf_acts)
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
for_each_node(node)
if (flow->stats[node])
kmem_cache_free(flow_stats_cache,
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 208c576bd1b6..1da3a14d1010 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
struct vport *vport = gs->rcv_data;
struct genevehdr *geneveh = geneve_hdr(skb);
int opts_len;
- struct ovs_tunnel_info tun_info;
+ struct ip_tunnel_info tun_info;
__be64 key;
__be16 flags;
@@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
key = vni_to_tunnel_id(geneveh->vni);
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags,
- geneveh->options, opts_len);
+ ip_tunnel_info_init(&tun_info, ip_hdr(skb),
+ udp_hdr(skb)->source, udp_hdr(skb)->dest,
+ key, flags, geneveh->options, opts_len);
ovs_vport_receive(vport, skb, &tun_info);
}
@@ -165,8 +164,8 @@ error:
static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct ovs_tunnel_info *tun_info;
+ const struct ip_tunnel_key *tun_key;
+ struct ip_tunnel_info *tun_info;
struct net *net = ovs_dp_get_net(vport->dp);
struct geneve_port *geneve_port = geneve_vport(vport);
__be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
@@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
goto error;
}
- tun_key = &tun_info->tunnel;
+ tun_key = &tun_info->key;
rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport)
}
static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct ip_tunnel_info *egress_tun_info)
{
struct geneve_port *geneve_port = geneve_vport(vport);
struct net *net = ovs_dp_get_net(vport->dp);
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index f17ac9642f4e..b87656c66aaf 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -67,9 +67,9 @@ static struct sk_buff *__build_header(struct sk_buff *skb,
int tunnel_hlen)
{
struct tnl_ptk_info tpi;
- const struct ovs_key_ipv4_tunnel *tun_key;
+ const struct ip_tunnel_key *tun_key;
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+ tun_key = &OVS_CB(skb)->egress_tun_info->key;
skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
if (IS_ERR(skb))
@@ -97,7 +97,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
static int gre_rcv(struct sk_buff *skb,
const struct tnl_ptk_info *tpi)
{
- struct ovs_tunnel_info tun_info;
+ struct ip_tunnel_info tun_info;
struct ovs_net *ovs_net;
struct vport *vport;
__be64 key;
@@ -108,8 +108,8 @@ static int gre_rcv(struct sk_buff *skb,
return PACKET_REJECT;
key = key_to_tunnel_id(tpi->key, tpi->seq);
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
- filter_tnl_flags(tpi->flags), NULL, 0);
+ ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
+ filter_tnl_flags(tpi->flags), NULL, 0);
ovs_vport_receive(vport, skb, &tun_info);
return PACKET_RCVD;
@@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info,
static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
{
struct net *net = ovs_dp_get_net(vport->dp);
- const struct ovs_key_ipv4_tunnel *tun_key;
+ const struct ip_tunnel_key *tun_key;
struct flowi4 fl;
struct rtable *rt;
int min_headroom;
@@ -147,7 +147,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
goto err_free_skb;
}
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+ tun_key = &OVS_CB(skb)->egress_tun_info->key;
rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -277,7 +277,7 @@ static void gre_tnl_destroy(struct vport *vport)
}
static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct ip_tunnel_info *egress_tun_info)
{
return ovs_tunnel_get_egress_info(egress_tun_info,
ovs_dp_get_net(vport->dp),
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 6a55f7105505..c058bbf876c3 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -156,49 +156,44 @@ static void do_setup(struct net_device *netdev)
static struct vport *internal_dev_create(const struct vport_parms *parms)
{
struct vport *vport;
- struct netdev_vport *netdev_vport;
struct internal_dev *internal_dev;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_internal_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto error;
}
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
- parms->name, NET_NAME_UNKNOWN,
- do_setup);
- if (!netdev_vport->dev) {
+ vport->dev = alloc_netdev(sizeof(struct internal_dev),
+ parms->name, NET_NAME_UNKNOWN, do_setup);
+ if (!vport->dev) {
err = -ENOMEM;
goto error_free_vport;
}
- dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp));
- internal_dev = internal_dev_priv(netdev_vport->dev);
+ dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
+ internal_dev = internal_dev_priv(vport->dev);
internal_dev->vport = vport;
/* Restrict bridge port to current netns. */
if (vport->port_no == OVSP_LOCAL)
- netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL;
+ vport->dev->features |= NETIF_F_NETNS_LOCAL;
rtnl_lock();
- err = register_netdevice(netdev_vport->dev);
+ err = register_netdevice(vport->dev);
if (err)
goto error_free_netdev;
- dev_set_promiscuity(netdev_vport->dev, 1);
+ dev_set_promiscuity(vport->dev, 1);
rtnl_unlock();
- netif_start_queue(netdev_vport->dev);
+ netif_start_queue(vport->dev);
return vport;
error_free_netdev:
rtnl_unlock();
- free_netdev(netdev_vport->dev);
+ free_netdev(vport->dev);
error_free_vport:
ovs_vport_free(vport);
error:
@@ -207,21 +202,19 @@ error:
static void internal_dev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
- netif_stop_queue(netdev_vport->dev);
+ netif_stop_queue(vport->dev);
rtnl_lock();
- dev_set_promiscuity(netdev_vport->dev, -1);
+ dev_set_promiscuity(vport->dev, -1);
/* unregister_netdevice() waits for an RCU grace period. */
- unregister_netdevice(netdev_vport->dev);
+ unregister_netdevice(vport->dev);
rtnl_unlock();
}
static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
{
- struct net_device *netdev = netdev_vport_priv(vport)->dev;
+ struct net_device *netdev = vport->dev;
int len;
if (unlikely(!(netdev->flags & IFF_UP))) {
@@ -249,7 +242,6 @@ static struct vport_ops ovs_internal_vport_ops = {
.type = OVS_VPORT_TYPE_INTERNAL,
.create = internal_dev_create,
.destroy = internal_dev_destroy,
- .get_name = ovs_netdev_get_name,
.send = internal_dev_recv,
};
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 33e6d6e2908f..68d0582fc001 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -27,9 +27,13 @@
#include <linux/skbuff.h>
#include <linux/openvswitch.h>
-#include <net/llc.h>
+#include <net/udp.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
+#include <net/vxlan.h>
#include "datapath.h"
+#include "vport.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -83,104 +87,93 @@ static struct net_device *get_dpdev(const struct datapath *dp)
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
BUG_ON(!local);
- return netdev_vport_priv(local)->dev;
+ return local->dev;
}
-static struct vport *netdev_create(const struct vport_parms *parms)
+static struct vport *netdev_link(struct vport *vport, const char *name)
{
- struct vport *vport;
- struct netdev_vport *netdev_vport;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_netdev_vport_ops, parms);
- if (IS_ERR(vport)) {
- err = PTR_ERR(vport);
- goto error;
- }
-
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
- if (!netdev_vport->dev) {
+ vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
+ if (!vport->dev) {
err = -ENODEV;
goto error_free_vport;
}
- if (netdev_vport->dev->flags & IFF_LOOPBACK ||
- netdev_vport->dev->type != ARPHRD_ETHER ||
- ovs_is_internal_dev(netdev_vport->dev)) {
+ if (vport->dev->flags & IFF_LOOPBACK ||
+ vport->dev->type != ARPHRD_ETHER ||
+ ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
}
rtnl_lock();
- err = netdev_master_upper_dev_link(netdev_vport->dev,
+ err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp));
if (err)
goto error_unlock;
- err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
+ err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
if (err)
goto error_master_upper_dev_unlink;
- dev_disable_lro(netdev_vport->dev);
- dev_set_promiscuity(netdev_vport->dev, 1);
- netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+ dev_disable_lro(vport->dev);
+ dev_set_promiscuity(vport->dev, 1);
+ vport->dev->priv_flags |= IFF_OVS_DATAPATH;
rtnl_unlock();
return vport;
error_master_upper_dev_unlink:
- netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+ netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
error_unlock:
rtnl_unlock();
error_put:
- dev_put(netdev_vport->dev);
+ dev_put(vport->dev);
error_free_vport:
ovs_vport_free(vport);
-error:
return ERR_PTR(err);
}
+static struct vport *netdev_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+
+ vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ return netdev_link(vport, parms->name);
+}
+
static void free_port_rcu(struct rcu_head *rcu)
{
- struct netdev_vport *netdev_vport = container_of(rcu,
- struct netdev_vport, rcu);
+ struct vport *vport = container_of(rcu, struct vport, rcu);
- dev_put(netdev_vport->dev);
- ovs_vport_free(vport_from_priv(netdev_vport));
+ if (vport->dev)
+ dev_put(vport->dev);
+ ovs_vport_free(vport);
}
void ovs_netdev_detach_dev(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
ASSERT_RTNL();
- netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
- netdev_rx_handler_unregister(netdev_vport->dev);
- netdev_upper_dev_unlink(netdev_vport->dev,
- netdev_master_upper_dev_get(netdev_vport->dev));
- dev_set_promiscuity(netdev_vport->dev, -1);
+ vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
+ netdev_rx_handler_unregister(vport->dev);
+ netdev_upper_dev_unlink(vport->dev,
+ netdev_master_upper_dev_get(vport->dev));
+ dev_set_promiscuity(vport->dev, -1);
}
static void netdev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
rtnl_lock();
- if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
rtnl_unlock();
- call_rcu(&netdev_vport->rcu, free_port_rcu);
-}
-
-const char *ovs_netdev_get_name(const struct vport *vport)
-{
- const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- return netdev_vport->dev->name;
+ call_rcu(&vport->rcu, free_port_rcu);
}
static unsigned int packet_length(const struct sk_buff *skb)
@@ -195,18 +188,17 @@ static unsigned int packet_length(const struct sk_buff *skb)
static int netdev_send(struct vport *vport, struct sk_buff *skb)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- int mtu = netdev_vport->dev->mtu;
+ int mtu = vport->dev->mtu;
int len;
if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
- netdev_vport->dev->name,
+ vport->dev->name,
packet_length(skb), mtu);
goto drop;
}
- skb->dev = netdev_vport->dev;
+ skb->dev = vport->dev;
len = skb->len;
dev_queue_xmit(skb);
@@ -231,16 +223,205 @@ static struct vport_ops ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.create = netdev_create,
.destroy = netdev_destroy,
- .get_name = ovs_netdev_get_name,
.send = netdev_send,
};
+/* Compat code for old userspace. */
+#if IS_ENABLED(CONFIG_VXLAN)
+static struct vport_ops ovs_vxlan_netdev_vport_ops;
+
+static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+ __be16 dst_port = vxlan->cfg.dst_port;
+
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+ return -EMSGSIZE;
+
+ if (vxlan->flags & VXLAN_F_GBP) {
+ struct nlattr *exts;
+
+ exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
+ if (!exts)
+ return -EMSGSIZE;
+
+ if (vxlan->flags & VXLAN_F_GBP &&
+ nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, exts);
+ }
+
+ return 0;
+}
+
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = {
+ [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
+};
+
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr,
+ struct vxlan_config *conf)
+{
+ struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1];
+ int err;
+
+ if (nla_len(attr) < sizeof(struct nlattr))
+ return -EINVAL;
+
+ err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
+ if (err < 0)
+ return err;
+
+ if (exts[OVS_VXLAN_EXT_GBP])
+ conf->flags |= VXLAN_F_GBP;
+
+ return 0;
+}
+
+static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
+{
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct nlattr *options = parms->options;
+ struct net_device *dev;
+ struct vport *vport;
+ struct nlattr *a;
+ int err;
+ struct vxlan_config conf = {
+ .no_share = true,
+ .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA,
+ };
+
+ if (!options) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+ if (a && nla_len(a) == sizeof(u16)) {
+ conf.dst_port = htons(nla_get_u16(a));
+ } else {
+ /* Require destination port from userspace. */
+ err = -EINVAL;
+ goto error;
+ }
+
+ vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
+ if (a) {
+ err = vxlan_configure_exts(vport, a, &conf);
+ if (err) {
+ ovs_vport_free(vport);
+ goto error;
+ }
+ }
+
+ rtnl_lock();
+ dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_CAST(dev);
+ }
+
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
+ return vport;
+error:
+ return ERR_PTR(err);
+}
+
+static struct vport *vxlan_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+
+ vport = vxlan_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ return netdev_link(vport, parms->name);
+}
+
+static void vxlan_destroy(struct vport *vport)
+{
+ rtnl_lock();
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
+
+ /* Early release so we can unregister the device */
+ dev_put(vport->dev);
+ rtnl_delete_link(vport->dev);
+ vport->dev = NULL;
+ rtnl_unlock();
+
+ call_rcu(&vport->rcu, free_port_rcu);
+}
+
+static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ip_tunnel_info *egress_tun_info)
+{
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+ struct net *net = ovs_dp_get_net(vport->dp);
+ __be16 dst_port = vxlan_dev_dst_port(vxlan);
+ __be16 src_port;
+ int port_min;
+ int port_max;
+
+ inet_get_local_port_range(net, &port_min, &port_max);
+ src_port = udp_flow_src_port(net, skb, 0, 0, true);
+
+ return ovs_tunnel_get_egress_info(egress_tun_info, net,
+ OVS_CB(skb)->egress_tun_info,
+ IPPROTO_UDP, skb->mark,
+ src_port, dst_port);
+}
+
+static struct vport_ops ovs_vxlan_netdev_vport_ops = {
+ .type = OVS_VPORT_TYPE_VXLAN,
+ .create = vxlan_create,
+ .destroy = vxlan_destroy,
+ .get_options = vxlan_get_options,
+ .send = netdev_send,
+ .get_egress_tun_info = vxlan_get_egress_tun_info,
+};
+
+static int vxlan_compat_init(void)
+{
+ return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops);
+}
+
+static void vxlan_compat_exit(void)
+{
+ ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops);
+}
+#else
+static int vxlan_compat_init(void)
+{
+ return 0;
+}
+
+static void vxlan_compat_exit(void)
+{
+}
+#endif
+
int __init ovs_netdev_init(void)
{
- return ovs_vport_ops_register(&ovs_netdev_vport_ops);
+ int err;
+
+ err = ovs_vport_ops_register(&ovs_netdev_vport_ops);
+ if (err)
+ return err;
+ err = vxlan_compat_init();
+ if (err)
+ vxlan_compat_exit();
+ return err;
}
void ovs_netdev_exit(void)
{
ovs_vport_ops_unregister(&ovs_netdev_vport_ops);
+ vxlan_compat_exit();
}
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index 6f7038e79c52..684fb88723a4 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -26,19 +26,6 @@
struct vport *ovs_netdev_get_vport(struct net_device *dev);
-struct netdev_vport {
- struct rcu_head rcu;
-
- struct net_device *dev;
-};
-
-static inline struct netdev_vport *
-netdev_vport_priv(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-const char *ovs_netdev_get_name(const struct vport *);
void ovs_netdev_detach_dev(struct vport *);
int __init ovs_netdev_init(void);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
deleted file mode 100644
index 6d39766e7828..000000000000
--- a/net/openvswitch/vport-vxlan.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Copyright (c) 2014 Nicira, Inc.
- * Copyright (c) 2013 Cisco Systems, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/net.h>
-#include <linux/rculist.h>
-#include <linux/udp.h>
-#include <linux/module.h>
-
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/udp.h>
-#include <net/ip_tunnels.h>
-#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-#include <net/vxlan.h>
-
-#include "datapath.h"
-#include "vport.h"
-#include "vport-vxlan.h"
-
-/**
- * struct vxlan_port - Keeps track of open UDP ports
- * @vs: vxlan_sock created for the port.
- * @name: vport name.
- */
-struct vxlan_port {
- struct vxlan_sock *vs;
- char name[IFNAMSIZ];
- u32 exts; /* VXLAN_F_* in <net/vxlan.h> */
-};
-
-static struct vport_ops ovs_vxlan_vport_ops;
-
-static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
- struct vxlan_metadata *md)
-{
- struct ovs_tunnel_info tun_info;
- struct vxlan_port *vxlan_port;
- struct vport *vport = vs->data;
- struct iphdr *iph;
- struct ovs_vxlan_opts opts = {
- .gbp = md->gbp,
- };
- __be64 key;
- __be16 flags;
-
- flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
- vxlan_port = vxlan_vport(vport);
- if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
- flags |= TUNNEL_VXLAN_OPT;
-
- /* Save outer tunnel values */
- iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(md->vni) >> 8);
- ovs_flow_tun_info_init(&tun_info, iph,
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags, &opts, sizeof(opts));
-
- ovs_vport_receive(vport, skb, &tun_info);
-}
-
-static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
-
- if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
- return -EMSGSIZE;
-
- if (vxlan_port->exts) {
- struct nlattr *exts;
-
- exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
- if (!exts)
- return -EMSGSIZE;
-
- if (vxlan_port->exts & VXLAN_F_GBP &&
- nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
- return -EMSGSIZE;
-
- nla_nest_end(skb, exts);
- }
-
- return 0;
-}
-
-static void vxlan_tnl_destroy(struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
-
- vxlan_sock_release(vxlan_port->vs);
-
- ovs_vport_deferred_free(vport);
-}
-
-static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
- [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
-};
-
-static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
-{
- struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
- struct vxlan_port *vxlan_port;
- int err;
-
- if (nla_len(attr) < sizeof(struct nlattr))
- return -EINVAL;
-
- err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
- if (err < 0)
- return err;
-
- vxlan_port = vxlan_vport(vport);
-
- if (exts[OVS_VXLAN_EXT_GBP])
- vxlan_port->exts |= VXLAN_F_GBP;
-
- return 0;
-}
-
-static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
-{
- struct net *net = ovs_dp_get_net(parms->dp);
- struct nlattr *options = parms->options;
- struct vxlan_port *vxlan_port;
- struct vxlan_sock *vs;
- struct vport *vport;
- struct nlattr *a;
- u16 dst_port;
- int err;
-
- if (!options) {
- err = -EINVAL;
- goto error;
- }
- a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
- if (a && nla_len(a) == sizeof(u16)) {
- dst_port = nla_get_u16(a);
- } else {
- /* Require destination port from userspace. */
- err = -EINVAL;
- goto error;
- }
-
- vport = ovs_vport_alloc(sizeof(struct vxlan_port),
- &ovs_vxlan_vport_ops, parms);
- if (IS_ERR(vport))
- return vport;
-
- vxlan_port = vxlan_vport(vport);
- strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
-
- a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
- if (a) {
- err = vxlan_configure_exts(vport, a);
- if (err) {
- ovs_vport_free(vport);
- goto error;
- }
- }
-
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true,
- vxlan_port->exts);
- if (IS_ERR(vs)) {
- ovs_vport_free(vport);
- return (void *)vs;
- }
- vxlan_port->vs = vs;
-
- return vport;
-
-error:
- return ERR_PTR(err);
-}
-
-static int vxlan_ext_gbp(struct sk_buff *skb)
-{
- const struct ovs_tunnel_info *tun_info;
- const struct ovs_vxlan_opts *opts;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- opts = tun_info->options;
-
- if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
- tun_info->options_len >= sizeof(*opts))
- return opts->gbp;
- else
- return 0;
-}
-
-static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- struct sock *sk = vxlan_port->vs->sock->sk;
- __be16 dst_port = inet_sk(sk)->inet_sport;
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct vxlan_metadata md = {0};
- struct rtable *rt;
- struct flowi4 fl;
- __be16 src_port;
- __be16 df;
- int err;
- u32 vxflags;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
- rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
- htons(IP_DF) : 0;
-
- skb->ignore_df = 1;
-
- src_port = udp_flow_src_port(net, skb, 0, 0, true);
- md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
- md.gbp = vxlan_ext_gbp(skb);
- vxflags = vxlan_port->exts |
- (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
-
- err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
- src_port, dst_port,
- &md, false, vxflags);
- if (err < 0)
- ip_rt_put(rt);
- return err;
-error:
- kfree_skb(skb);
- return err;
-}
-
-static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
- __be16 src_port;
- int port_min;
- int port_max;
-
- inet_get_local_port_range(net, &port_min, &port_max);
- src_port = udp_flow_src_port(net, skb, 0, 0, true);
-
- return ovs_tunnel_get_egress_info(egress_tun_info, net,
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_UDP, skb->mark,
- src_port, dst_port);
-}
-
-static const char *vxlan_get_name(const struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- return vxlan_port->name;
-}
-
-static struct vport_ops ovs_vxlan_vport_ops = {
- .type = OVS_VPORT_TYPE_VXLAN,
- .create = vxlan_tnl_create,
- .destroy = vxlan_tnl_destroy,
- .get_name = vxlan_get_name,
- .get_options = vxlan_get_options,
- .send = vxlan_tnl_send,
- .get_egress_tun_info = vxlan_get_egress_tun_info,
- .owner = THIS_MODULE,
-};
-
-static int __init ovs_vxlan_tnl_init(void)
-{
- return ovs_vport_ops_register(&ovs_vxlan_vport_ops);
-}
-
-static void __exit ovs_vxlan_tnl_exit(void)
-{
- ovs_vport_ops_unregister(&ovs_vxlan_vport_ops);
-}
-
-module_init(ovs_vxlan_tnl_init);
-module_exit(ovs_vxlan_tnl_exit);
-
-MODULE_DESCRIPTION("OVS: VXLAN switching port");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("vport-type-4");
diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h
deleted file mode 100644
index 4b08233e73d5..000000000000
--- a/net/openvswitch/vport-vxlan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef VPORT_VXLAN_H
-#define VPORT_VXLAN_H 1
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-struct ovs_vxlan_opts {
- __u32 gbp;
-};
-
-#endif
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 067a3fff1d2c..d14f59403c5e 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -113,7 +113,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node)
- if (!strcmp(name, vport->ops->get_name(vport)) &&
+ if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -226,7 +226,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms)
}
bucket = hash_bucket(ovs_dp_get_net(vport->dp),
- vport->ops->get_name(vport));
+ ovs_vport_name(vport));
hlist_add_head_rcu(&vport->hash_node, bucket);
return vport;
}
@@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
* skb->data should point to the Ethernet header.
*/
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
- const struct ovs_tunnel_info *tun_info)
+ const struct ip_tunnel_info *tun_info)
{
struct pcpu_sw_netstats *stats;
struct sw_flow_key key;
@@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport)
}
EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
struct net *net,
- const struct ovs_tunnel_info *tun_info,
+ const struct ip_tunnel_info *tun_info,
u8 ipproto,
u32 skb_mark,
__be16 tp_src,
__be16 tp_dst)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
+ const struct ip_tunnel_key *tun_key;
struct rtable *rt;
struct flowi4 fl;
if (unlikely(!tun_info))
return -EINVAL;
- tun_key = &tun_info->tunnel;
+ tun_key = &tun_info->key;
/* Route lookup to get srouce IP address.
* The process may need to be changed if the corresponding process
@@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
/* Generate egress_tun_info based on tun_info,
* saddr, tp_src and tp_dst
*/
- __ovs_flow_tun_info_init(egress_tun_info,
- fl.saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos,
- tun_key->ipv4_ttl,
- tp_src, tp_dst,
- tun_key->tun_id,
- tun_key->tun_flags,
- tun_info->options,
- tun_info->options_len);
+ __ip_tunnel_info_init(egress_tun_info,
+ fl.saddr, tun_key->ipv4_dst,
+ tun_key->ipv4_tos,
+ tun_key->ipv4_ttl,
+ tp_src, tp_dst,
+ tun_key->tun_id,
+ tun_key->tun_flags,
+ tun_info->options,
+ tun_info->options_len);
return 0;
}
EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info)
+ struct ip_tunnel_info *info)
{
/* get_egress_tun_info() is only implemented on tunnel ports. */
if (unlikely(!vport->ops->get_egress_tun_info))
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index bc85331a6c60..1a689c28b5a6 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -27,6 +27,7 @@
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/u64_stats_sync.h>
+#include <net/route.h>
#include "datapath.h"
@@ -58,15 +59,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
int ovs_vport_send(struct vport *, struct sk_buff *);
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info,
struct net *net,
- const struct ovs_tunnel_info *tun_info,
+ const struct ip_tunnel_info *tun_info,
u8 ipproto,
u32 skb_mark,
__be16 tp_src,
__be16 tp_dst);
int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info);
+ struct ip_tunnel_info *info);
/* The following definitions are for implementers of vport devices: */
@@ -106,7 +107,7 @@ struct vport_portids {
* @detach_list: list used for detaching vport in net-exit call.
*/
struct vport {
- struct rcu_head rcu;
+ struct net_device *dev;
struct datapath *dp;
struct vport_portids __rcu *upcall_portids;
u16 port_no;
@@ -119,6 +120,7 @@ struct vport {
struct vport_err_stats err_stats;
struct list_head detach_list;
+ struct rcu_head rcu;
};
/**
@@ -176,7 +178,7 @@ struct vport_ops {
int (*send)(struct vport *, struct sk_buff *);
int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
- struct ovs_tunnel_info *);
+ struct ip_tunnel_info *);
struct module *owner;
struct list_head list;
@@ -226,7 +228,7 @@ static inline struct vport *vport_from_priv(void *priv)
}
void ovs_vport_receive(struct vport *, struct sk_buff *,
- const struct ovs_tunnel_info *);
+ const struct ip_tunnel_info *);
static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
@@ -235,11 +237,16 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
}
+static inline const char *ovs_vport_name(struct vport *vport)
+{
+ return vport->dev ? vport->dev->name : vport->ops->get_name(vport);
+}
+
int ovs_vport_ops_register(struct vport_ops *ops);
void ovs_vport_ops_unregister(struct vport_ops *ops);
static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
- const struct ovs_key_ipv4_tunnel *key,
+ const struct ip_tunnel_key *key,
u32 mark,
struct flowi4 *fl,
u8 protocol)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index af427a3dbcba..074a32f466f8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -27,6 +27,15 @@
#include <net/act_api.h>
#include <net/netlink.h>
+static void free_tcf(struct rcu_head *head)
+{
+ struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu);
+
+ free_percpu(p->cpu_bstats);
+ free_percpu(p->cpu_qstats);
+ kfree(p);
+}
+
void tcf_hash_destroy(struct tc_action *a)
{
struct tcf_common *p = a->priv;
@@ -41,7 +50,7 @@ void tcf_hash_destroy(struct tc_action *a)
* gen_estimator est_timer() might access p->tcfc_lock
* or bstats, wait a RCU grace period before freeing p
*/
- kfree_rcu(p, tcfc_rcu);
+ call_rcu(&p->tcfc_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_destroy);
@@ -230,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
if (est)
gen_kill_estimator(&pc->tcfc_bstats,
&pc->tcfc_rate_est);
- kfree_rcu(pc, tcfc_rcu);
+ call_rcu(&pc->tcfc_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
- int size, int bind)
+ int size, int bind, bool cpustats)
{
struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+ int err = -ENOMEM;
if (unlikely(!p))
return -ENOMEM;
@@ -246,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
if (bind)
p->tcfc_bindcnt = 1;
+ if (cpustats) {
+ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!p->cpu_bstats) {
+err1:
+ kfree(p);
+ return err;
+ }
+ p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!p->cpu_qstats) {
+err2:
+ free_percpu(p->cpu_bstats);
+ goto err1;
+ }
+ }
spin_lock_init(&p->tcfc_lock);
INIT_HLIST_NODE(&p->tcfc_head);
p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
p->tcfc_tm.install = jiffies;
p->tcfc_tm.lastuse = jiffies;
if (est) {
- int err = gen_new_estimator(&p->tcfc_bstats, NULL,
- &p->tcfc_rate_est,
- &p->tcfc_lock, est);
+ err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
+ &p->tcfc_rate_est,
+ &p->tcfc_lock, est);
if (err) {
- kfree(p);
- return err;
+ free_percpu(p->cpu_qstats);
+ goto err2;
}
}
@@ -615,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (err < 0)
goto errout;
- if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 ||
+ if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
&p->tcfc_rate_est) < 0 ||
- gnet_stats_copy_queue(&d, NULL,
+ gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfc_qstats,
p->tcfc_qstats.qlen) < 0)
goto errout;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1df78289e248..e9e923a8c747 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -281,7 +281,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, act, bind)) {
ret = tcf_hash_create(parm->index, est, act,
- sizeof(*prog), bind);
+ sizeof(*prog), bind, false);
if (ret < 0)
goto destroy_fp;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 295d14bd6c67..f2b540220ad0 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -108,7 +108,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 4cd5cf1aedf8..b07c535ba8e7 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_CSUM_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 7fffc2272701..5c1b05170736 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -28,14 +28,18 @@
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (prandom_u32() % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
static int gact_determ(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval)
+ u32 pack = atomic_inc_return(&gact->packets);
+
+ smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
+ if (pack % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
@@ -85,7 +89,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
#endif
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
+ bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -99,16 +104,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
gact = to_gact(a);
- spin_lock_bh(&gact->tcf_lock);
+ ASSERT_RTNL();
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_paction = p_parm->paction;
- gact->tcfg_pval = p_parm->pval;
+ gact->tcfg_pval = max_t(u16, 1, p_parm->pval);
+ /* Make sure tcfg_pval is written before tcfg_ptype
+ * coupled with smp_rmb() in gact_net_rand() & gact_determ()
+ */
+ smp_wmb();
gact->tcfg_ptype = p_parm->ptype;
}
#endif
- spin_unlock_bh(&gact->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
@@ -118,23 +126,21 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_gact *gact = a->priv;
- int action = TC_ACT_SHOT;
+ int action = READ_ONCE(gact->tcf_action);
- spin_lock(&gact->tcf_lock);
#ifdef CONFIG_GACT_PROB
- if (gact->tcfg_ptype)
- action = gact_rand[gact->tcfg_ptype](gact);
- else
- action = gact->tcf_action;
-#else
- action = gact->tcf_action;
+ {
+ u32 ptype = READ_ONCE(gact->tcfg_ptype);
+
+ if (ptype)
+ action = gact_rand[ptype](gact);
+ }
#endif
- gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
- gact->tcf_bstats.packets++;
+ bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
if (action == TC_ACT_SHOT)
- gact->tcf_qstats.drops++;
- gact->tcf_tm.lastuse = jiffies;
- spin_unlock(&gact->tcf_lock);
+ qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+
+ tcf_lastuse_update(&gact->tcf_tm);
return action;
}
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index cbc8dd7dd48a..99c9cc1c7af9 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
index = nla_get_u32(tb[TCA_IPT_INDEX]);
if (!tcf_hash_check(index, a, bind) ) {
- ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+ ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index a42a3b257226..19cd8904efa0 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -35,9 +35,11 @@ static LIST_HEAD(mirred_list);
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1);
+
list_del(&m->tcfm_list);
- if (m->tcfm_dev)
- dev_put(m->tcfm_dev);
+ if (dev)
+ dev_put(dev);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -93,7 +95,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, a, bind)) {
if (dev == NULL)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
+ bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -105,18 +108,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
}
m = to_mirred(a);
- spin_lock_bh(&m->tcf_lock);
+ ASSERT_RTNL();
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (dev != NULL) {
m->tcfm_ifindex = parm->ifindex;
if (ret != ACT_P_CREATED)
- dev_put(m->tcfm_dev);
+ dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
dev_hold(dev);
- m->tcfm_dev = dev;
+ rcu_assign_pointer(m->tcfm_dev, dev);
m->tcfm_ok_push = ok_push;
}
- spin_unlock_bh(&m->tcf_lock);
+
if (ret == ACT_P_CREATED) {
list_add(&m->tcfm_list, &mirred_list);
tcf_hash_insert(a);
@@ -131,20 +134,22 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_mirred *m = a->priv;
struct net_device *dev;
struct sk_buff *skb2;
+ int retval, err;
u32 at;
- int retval, err = 1;
- spin_lock(&m->tcf_lock);
- m->tcf_tm.lastuse = jiffies;
- bstats_update(&m->tcf_bstats, skb);
+ tcf_lastuse_update(&m->tcf_tm);
+
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
- dev = m->tcfm_dev;
- if (!dev) {
- printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
+ rcu_read_lock();
+ retval = READ_ONCE(m->tcf_action);
+ dev = rcu_dereference(m->tcfm_dev);
+ if (unlikely(!dev)) {
+ pr_notice_once("tc mirred: target device is gone\n");
goto out;
}
- if (!(dev->flags & IFF_UP)) {
+ if (unlikely(!(dev->flags & IFF_UP))) {
net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
dev->name);
goto out;
@@ -152,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
at = G_TC_AT(skb->tc_verd);
skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 == NULL)
+ if (!skb2)
goto out;
if (!(at & AT_EGRESS)) {
@@ -168,16 +173,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
skb2->dev = dev;
err = dev_queue_xmit(skb2);
-out:
if (err) {
- m->tcf_qstats.overlimits++;
+out:
+ qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
retval = TC_ACT_SHOT;
- else
- retval = m->tcf_action;
- } else
- retval = m->tcf_action;
- spin_unlock(&m->tcf_lock);
+ }
+ rcu_read_unlock();
return retval;
}
@@ -216,14 +218,16 @@ static int mirred_device_event(struct notifier_block *unused,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
+ ASSERT_RTNL();
if (event == NETDEV_UNREGISTER)
list_for_each_entry(m, &mirred_list, tcfm_list) {
- spin_lock_bh(&m->tcf_lock);
- if (m->tcfm_dev == dev) {
+ if (rcu_access_pointer(m->tcfm_dev) == dev) {
dev_put(dev);
- m->tcfm_dev = NULL;
+ /* Note : no rcu grace period necessary, as
+ * net_device are already rcu protected.
+ */
+ RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
- spin_unlock_bh(&m->tcf_lock);
}
return NOTIFY_DONE;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 270a030d5fd0..5be0b3c1c5b0 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_NAT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 17e6d6669c7f..ce8676ad892f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (!tcf_hash_check(parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
+ bind, false);
if (ret)
return ret;
p = to_pedit(a);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 6a8d9488613a..d6b708d6afdf 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
defdata = nla_data(tb[TCA_DEF_DATA]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index fcfeeaf838be..6751b5f8c046 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index d735ecf0b1a7..796785e0bf96 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
action = parm->v_action;
if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
+ bind, false);
if (ret)
return ret;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index ea611b216412..4c85bd3a750c 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
- u32 classid;
-
- classid = task_cls_state(current)->classid;
-
- /*
- * Due to the nature of the classifier it is required to ignore all
- * packets originating from softirq context as accessing `current'
- * would lead to false results.
- *
- * This test assumes that all callers of dev_queue_xmit() explicitely
- * disable bh. Knowing this, it is possible to detect softirq based
- * calls by looking at the number of nested bh disable calls because
- * softirqs always disables bh.
- */
- if (in_serving_softirq()) {
- /* If there is an sk_classid we'll use that. */
- if (!skb->sk)
- return -1;
- classid = skb->sk->sk_classid;
- }
+ u32 classid = task_get_classid(skb);
if (!classid)
return -1;
-
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
return -1;
res->classid = classid;
res->class = 0;
+
return tcf_exts_exec(skb, &head->exts, res);
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index b8d73bca683c..ffaeea63d473 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -186,7 +186,6 @@ struct qfq_sched {
u64 oldV, V; /* Precise virtual times. */
struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
- u32 num_active_agg; /* Num. of active aggregates */
u32 wsum; /* weight sum */
u32 iwsum; /* inverse weight sum */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 59e80356672b..4345790ad326 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -487,23 +487,35 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
*/
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ struct net_device *odev;
+
if (!laddr->valid)
continue;
- if ((laddr->state == SCTP_ADDR_SRC) &&
- (AF_INET == laddr->a.sa.sa_family)) {
- fl4->fl4_sport = laddr->a.v4.sin_port;
- flowi4_update_output(fl4,
- asoc->base.sk->sk_bound_dev_if,
- RT_CONN_FLAGS(asoc->base.sk),
- daddr->v4.sin_addr.s_addr,
- laddr->a.v4.sin_addr.s_addr);
-
- rt = ip_route_output_key(sock_net(sk), fl4);
- if (!IS_ERR(rt)) {
- dst = &rt->dst;
- goto out_unlock;
- }
- }
+ if (laddr->state != SCTP_ADDR_SRC ||
+ AF_INET != laddr->a.sa.sa_family)
+ continue;
+
+ fl4->fl4_sport = laddr->a.v4.sin_port;
+ flowi4_update_output(fl4,
+ asoc->base.sk->sk_bound_dev_if,
+ RT_CONN_FLAGS(asoc->base.sk),
+ daddr->v4.sin_addr.s_addr,
+ laddr->a.v4.sin_addr.s_addr);
+
+ rt = ip_route_output_key(sock_net(sk), fl4);
+ if (IS_ERR(rt))
+ continue;
+
+ /* Ensure the src address belongs to the output
+ * interface.
+ */
+ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+ false);
+ if (!odev || odev->ifindex != fl4->flowi4_oif)
+ continue;
+
+ dst = &rt->dst;
+ break;
}
out_unlock:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3ee27b7704ff..d7eaa7354cf7 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -853,7 +853,7 @@ nomem:
/*
* Respond to a normal COOKIE ACK chunk.
- * We are the side that is being asked for an association.
+ * We are the side that is asking for an association.
*
* RFC 2960 5.1 Normal Establishment of an Association
*
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9f2add3cba26..33bafa2e703e 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
if (switchdev_port_attr_get(dev, &attr))
return NULL;
- if (nhsel > 0) {
- if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len)
+ if (nhsel > 0 &&
+ !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
return NULL;
- if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id,
- attr.u.ppid.id_len))
- return NULL;
- }
prev_attr = attr;
}
@@ -1043,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi)
fi->fib_net->ipv4.fib_offload_disabled = true;
}
EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
+
+static bool switchdev_port_same_parent_id(struct net_device *a,
+ struct net_device *b)
+{
+ struct switchdev_attr a_attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+ struct switchdev_attr b_attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+
+ if (switchdev_port_attr_get(a, &a_attr) ||
+ switchdev_port_attr_get(b, &b_attr))
+ return false;
+
+ return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
+}
+
+static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
+ struct net_device *group_dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev == dev)
+ continue;
+ if (switchdev_port_same_parent_id(dev, lower_dev))
+ return lower_dev->offload_fwd_mark;
+ return switchdev_port_fwd_mark_get(dev, lower_dev);
+ }
+
+ return dev->ifindex;
+}
+
+static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
+ u32 old_mark, u32 *reset_mark)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev->offload_fwd_mark == old_mark) {
+ if (!*reset_mark)
+ *reset_mark = lower_dev->ifindex;
+ lower_dev->offload_fwd_mark = *reset_mark;
+ }
+ switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
+ }
+}
+
+/**
+ * switchdev_port_fwd_mark_set - Set port offload forwarding mark
+ *
+ * @dev: port device
+ * @group_dev: containing device
+ * @joining: true if dev is joining group; false if leaving group
+ *
+ * An ungrouped port's offload mark is just its ifindex. A grouped
+ * port's (member of a bridge, for example) offload mark is the ifindex
+ * of one of the ports in the group with the same parent (switch) ID.
+ * Ports on the same device in the same group will have the same mark.
+ *
+ * Example:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=5
+ * sw2p2 ifindex=5 mark=5
+ *
+ * If sw2p2 leaves the bridge, we'll have:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=4
+ * sw2p2 ifindex=5 mark=5
+ */
+void switchdev_port_fwd_mark_set(struct net_device *dev,
+ struct net_device *group_dev,
+ bool joining)
+{
+ u32 mark = dev->ifindex;
+ u32 reset_mark = 0;
+
+ if (group_dev && joining) {
+ mark = switchdev_port_fwd_mark_get(dev, group_dev);
+ } else if (group_dev && !joining) {
+ if (dev->offload_fwd_mark == mark)
+ /* Ohoh, this port was the mark reference port,
+ * but it's leaving the group, so reset the
+ * mark for the remaining ports in the group.
+ */
+ switchdev_port_fwd_mark_reset(group_dev, mark,
+ &reset_mark);
+ }
+
+ dev->offload_fwd_mark = mark;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index a816382fc8af..8b010c976b2f 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -316,6 +316,29 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr,
}
}
+void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *hdr)
+{
+ u16 last = msg_last_bcast(hdr);
+ int mtyp = msg_type(hdr);
+
+ if (unlikely(msg_user(hdr) != LINK_PROTOCOL))
+ return;
+ if (mtyp == STATE_MSG) {
+ tipc_bclink_update_link_state(n, last);
+ return;
+ }
+ /* Compatibility: older nodes don't know BCAST_PROTOCOL synchronization,
+ * and transfer synch info in LINK_PROTOCOL messages.
+ */
+ if (tipc_node_is_up(n))
+ return;
+ if ((mtyp != RESET_MSG) && (mtyp != ACTIVATE_MSG))
+ return;
+ n->bclink.last_sent = last;
+ n->bclink.last_in = last;
+ n->bclink.oos_state = 0;
+}
+
/**
* bclink_peek_nack - monitor retransmission requests sent by other nodes
*
@@ -358,10 +381,9 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list)
/* Prepare clone of message for local node */
skb = tipc_msg_reassemble(list);
- if (unlikely(!skb)) {
- __skb_queue_purge(list);
+ if (unlikely(!skb))
return -EHOSTUNREACH;
- }
+
/* Broadcast to all nodes */
if (likely(bclink)) {
tipc_bclink_lock(net);
@@ -413,7 +435,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno)
* all nodes in the cluster don't ACK at the same time
*/
if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) {
- tipc_link_proto_xmit(node->active_links[node->addr & 1],
+ tipc_link_proto_xmit(node_active_link(node, node->addr),
STATE_MSG, 0, 0, 0, 0);
tn->bcl->stats.sent_acks++;
}
@@ -925,7 +947,6 @@ int tipc_bclink_init(struct net *net)
tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
bcl->bearer_id = MAX_BEARERS;
rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer);
- bcl->state = WORKING_WORKING;
bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg;
msg_set_prevnode(bcl->pmsg, tn->own_addr);
strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 3c290a48f720..d74c69bcf60b 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -133,5 +133,6 @@ void tipc_bclink_wakeup_users(struct net *net);
int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
void tipc_bclink_input(struct net *net);
+void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *msg);
#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 00bc0e620532..eae58a6b121c 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -470,6 +470,32 @@ void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
rcu_read_unlock();
}
+/* tipc_bearer_xmit() -send buffer to destination over bearer
+ */
+void tipc_bearer_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr *dst)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b;
+ struct sk_buff *skb, *tmp;
+
+ if (skb_queue_empty(xmitq))
+ return;
+
+ rcu_read_lock();
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (likely(b)) {
+ skb_queue_walk_safe(xmitq, skb, tmp) {
+ __skb_dequeue(xmitq);
+ b->media->send_msg(net, skb, b, dst);
+ /* Until we remove cloning in tipc_l2_send_msg(): */
+ kfree_skb(skb);
+ }
+ }
+ rcu_read_unlock();
+}
+
/**
* tipc_l2_rcv_msg - handle incoming TIPC message from an interface
* @buf: the received packet
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index dc714d977768..6426f242f626 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -217,5 +217,8 @@ void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
struct tipc_media_addr *dest);
+void tipc_bearer_xmit(struct net *net, u32 bearer_id,
+ struct sk_buff_head *xmitq,
+ struct tipc_media_addr *dst);
#endif /* _TIPC_BEARER_H */
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 0fcf133d5cb7..f4ed67778c54 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -129,6 +129,11 @@ static inline int less(u16 left, u16 right)
return less_eq(left, right) && (mod(right) != mod(left));
}
+static inline int in_range(u16 val, u16 min, u16 max)
+{
+ return !less(val, min) && !more(val, max);
+}
+
#ifdef CONFIG_SYSCTL
int tipc_register_sysctl(void);
void tipc_unregister_sysctl(void);
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 967e292f53c8..164d08907d6f 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -35,7 +35,7 @@
*/
#include "core.h"
-#include "link.h"
+#include "node.h"
#include "discover.h"
/* min delay during bearer start up */
@@ -125,7 +125,6 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *node;
- struct tipc_link *link;
struct tipc_media_addr maddr;
struct sk_buff *rbuf;
struct tipc_msg *msg = buf_msg(buf);
@@ -170,13 +169,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
return;
tipc_node_lock(node);
node->capabilities = caps;
- link = node->links[bearer->identity];
/* Prepare to validate requesting node's signature and media address */
sign_match = (signature == node->signature);
- addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr));
- link_up = link && tipc_link_is_up(link);
-
+ tipc_node_check_dest(node, bearer, &link_up, &addr_match, &maddr);
/* These three flags give us eight permutations: */
@@ -239,16 +235,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
if (accept_sign)
node->signature = signature;
- if (accept_addr) {
- if (!link)
- link = tipc_link_create(node, bearer, &maddr);
- if (link) {
- memcpy(&link->media_addr, &maddr, sizeof(maddr));
- tipc_link_reset(link);
- } else {
- respond = false;
- }
- }
+ if (accept_addr && !tipc_node_update_dest(node, bearer, &maddr))
+ respond = false;
/* Send response, if necessary */
if (respond && (mtyp == DSC_REQ_MSG)) {
diff --git a/net/tipc/link.c b/net/tipc/link.c
index eaa9fe54b4ae..b63d57390bb7 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -77,36 +77,70 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
};
/*
- * Out-of-range value for link session numbers
+ * Interval between NACKs when packets arrive out of order
*/
-#define INVALID_SESSION 0x10000
-
+#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
/*
- * Link state events:
+ * Out-of-range value for link session numbers
*/
-#define STARTING_EVT 856384768 /* link processing trigger */
-#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */
-#define SILENCE_EVT 560817u /* timer dicovered silence from peer */
+#define WILDCARD_SESSION 0x10000
-/*
- * State value stored in 'failover_pkts'
+/* State value stored in 'failover_pkts'
*/
#define FIRST_FAILOVER 0xffffu
-static void link_handle_out_of_seq_msg(struct tipc_link *link,
- struct sk_buff *skb);
-static void tipc_link_proto_rcv(struct tipc_link *link,
- struct sk_buff *skb);
-static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol);
-static void link_state_event(struct tipc_link *l_ptr, u32 event);
+/* Link FSM states and events:
+ */
+enum {
+ TIPC_LINK_WORKING,
+ TIPC_LINK_PROBING,
+ TIPC_LINK_RESETTING,
+ TIPC_LINK_ESTABLISHING
+};
+
+enum {
+ PEER_RESET_EVT = RESET_MSG,
+ ACTIVATE_EVT = ACTIVATE_MSG,
+ TRAFFIC_EVT, /* Any other valid msg from peer */
+ SILENCE_EVT /* Peer was silent during last timer interval*/
+};
+
+/* Link FSM state checking routines
+ */
+static int link_working(struct tipc_link *l)
+{
+ return l->state == TIPC_LINK_WORKING;
+}
+
+static int link_probing(struct tipc_link *l)
+{
+ return l->state == TIPC_LINK_PROBING;
+}
+
+static int link_resetting(struct tipc_link *l)
+{
+ return l->state == TIPC_LINK_RESETTING;
+}
+
+static int link_establishing(struct tipc_link *l)
+{
+ return l->state == TIPC_LINK_ESTABLISHING;
+}
+
+static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
+static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
+ u16 rcvgap, int tolerance, int priority,
+ struct sk_buff_head *xmitq);
static void link_reset_statistics(struct tipc_link *l_ptr);
static void link_print(struct tipc_link *l_ptr, const char *str);
-static void tipc_link_sync_xmit(struct tipc_link *l);
+static void tipc_link_build_bcast_sync_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq);
static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf);
static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb);
static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb);
static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb);
-static void link_set_timer(struct tipc_link *link, unsigned long time);
+
/*
* Simple link routines
*/
@@ -115,26 +149,13 @@ static unsigned int align(unsigned int i)
return (i + 3) & ~3u;
}
-static void tipc_link_release(struct kref *kref)
-{
- kfree(container_of(kref, struct tipc_link, ref));
-}
-
-static void tipc_link_get(struct tipc_link *l_ptr)
-{
- kref_get(&l_ptr->ref);
-}
-
-static void tipc_link_put(struct tipc_link *l_ptr)
-{
- kref_put(&l_ptr->ref, tipc_link_release);
-}
-
static struct tipc_link *tipc_parallel_link(struct tipc_link *l)
{
- if (l->owner->active_links[0] != l)
- return l->owner->active_links[0];
- return l->owner->active_links[1];
+ struct tipc_node *n = l->owner;
+
+ if (node_active_link(n, 0) != l)
+ return node_active_link(n, 0);
+ return node_active_link(n, 1);
}
/*
@@ -144,74 +165,14 @@ int tipc_link_is_up(struct tipc_link *l_ptr)
{
if (!l_ptr)
return 0;
- return link_working_working(l_ptr) || link_working_unknown(l_ptr);
+ return link_working(l_ptr) || link_probing(l_ptr);
}
-int tipc_link_is_active(struct tipc_link *l_ptr)
+int tipc_link_is_active(struct tipc_link *l)
{
- return (l_ptr->owner->active_links[0] == l_ptr) ||
- (l_ptr->owner->active_links[1] == l_ptr);
-}
-
-/**
- * link_timeout - handle expiration of link timer
- * @l_ptr: pointer to link
- */
-static void link_timeout(unsigned long data)
-{
- struct tipc_link *l_ptr = (struct tipc_link *)data;
- struct sk_buff *skb;
-
- tipc_node_lock(l_ptr->owner);
-
- /* update counters used in statistical profiling of send traffic */
- l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq);
- l_ptr->stats.queue_sz_counts++;
-
- skb = skb_peek(&l_ptr->transmq);
- if (skb) {
- struct tipc_msg *msg = buf_msg(skb);
- u32 length = msg_size(msg);
-
- if ((msg_user(msg) == MSG_FRAGMENTER) &&
- (msg_type(msg) == FIRST_FRAGMENT)) {
- length = msg_size(msg_get_wrapped(msg));
- }
- if (length) {
- l_ptr->stats.msg_lengths_total += length;
- l_ptr->stats.msg_length_counts++;
- if (length <= 64)
- l_ptr->stats.msg_length_profile[0]++;
- else if (length <= 256)
- l_ptr->stats.msg_length_profile[1]++;
- else if (length <= 1024)
- l_ptr->stats.msg_length_profile[2]++;
- else if (length <= 4096)
- l_ptr->stats.msg_length_profile[3]++;
- else if (length <= 16384)
- l_ptr->stats.msg_length_profile[4]++;
- else if (length <= 32768)
- l_ptr->stats.msg_length_profile[5]++;
- else
- l_ptr->stats.msg_length_profile[6]++;
- }
- }
-
- /* do all other link processing performed on a periodic basis */
- if (l_ptr->silent_intv_cnt || tipc_bclink_acks_missing(l_ptr->owner))
- link_state_event(l_ptr, SILENCE_EVT);
- l_ptr->silent_intv_cnt++;
- if (skb_queue_len(&l_ptr->backlogq))
- tipc_link_push_packets(l_ptr);
- link_set_timer(l_ptr, l_ptr->keepalive_intv);
- tipc_node_unlock(l_ptr->owner);
- tipc_link_put(l_ptr);
-}
+ struct tipc_node *n = l->owner;
-static void link_set_timer(struct tipc_link *link, unsigned long time)
-{
- if (!mod_timer(&link->timer, jiffies + time))
- tipc_link_get(link);
+ return (node_active_link(n, 0) == l) || (node_active_link(n, 1) == l);
}
/**
@@ -224,7 +185,9 @@ static void link_set_timer(struct tipc_link *link, unsigned long time)
*/
struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
struct tipc_bearer *b_ptr,
- const struct tipc_media_addr *media_addr)
+ const struct tipc_media_addr *media_addr,
+ struct sk_buff_head *inputq,
+ struct sk_buff_head *namedq)
{
struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
struct tipc_link *l_ptr;
@@ -240,7 +203,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
return NULL;
}
- if (n_ptr->links[b_ptr->identity]) {
+ if (n_ptr->links[b_ptr->identity].link) {
tipc_addr_string_fill(addr_string, n_ptr->addr);
pr_err("Attempt to establish second link on <%s> to %s\n",
b_ptr->name, addr_string);
@@ -252,7 +215,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
pr_warn("Link creation failed, no memory\n");
return NULL;
}
- kref_init(&l_ptr->ref);
l_ptr->addr = peer;
if_name = strchr(b_ptr->name, ':') + 1;
sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
@@ -263,10 +225,10 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
/* note: peer i/f name is updated by reset/activate message */
memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
l_ptr->owner = n_ptr;
- l_ptr->peer_session = INVALID_SESSION;
+ l_ptr->peer_session = WILDCARD_SESSION;
l_ptr->bearer_id = b_ptr->identity;
- link_set_supervision_props(l_ptr, b_ptr->tolerance);
- l_ptr->state = RESET_UNKNOWN;
+ l_ptr->tolerance = b_ptr->tolerance;
+ l_ptr->state = TIPC_LINK_RESETTING;
l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg;
msg = l_ptr->pmsg;
@@ -286,13 +248,11 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
__skb_queue_head_init(&l_ptr->backlogq);
__skb_queue_head_init(&l_ptr->deferdq);
skb_queue_head_init(&l_ptr->wakeupq);
- skb_queue_head_init(&l_ptr->inputq);
- skb_queue_head_init(&l_ptr->namedq);
+ l_ptr->inputq = inputq;
+ l_ptr->namedq = namedq;
+ skb_queue_head_init(l_ptr->inputq);
link_reset_statistics(l_ptr);
tipc_node_attach_link(n_ptr, l_ptr);
- setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr);
- link_state_event(l_ptr, STARTING_EVT);
-
return l_ptr;
}
@@ -303,13 +263,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
void tipc_link_delete(struct tipc_link *l)
{
tipc_link_reset(l);
- if (del_timer(&l->timer))
- tipc_link_put(l);
- l->flags |= LINK_STOPPED;
- /* Delete link now, or when timer is finished: */
tipc_link_reset_fragments(l);
tipc_node_detach_link(l->owner, l);
- tipc_link_put(l);
}
void tipc_link_delete_list(struct net *net, unsigned int bearer_id)
@@ -321,7 +276,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id)
rcu_read_lock();
list_for_each_entry_rcu(node, &tn->node_list, list) {
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (link)
tipc_link_delete(link);
tipc_node_unlock(node);
@@ -329,12 +284,219 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id)
rcu_read_unlock();
}
+/* tipc_link_build_bcast_sync_msg() - synchronize broadcast link endpoints.
+ *
+ * Give a newly added peer node the sequence number where it should
+ * start receiving and acking broadcast packets.
+ */
+static void tipc_link_build_bcast_sync_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb;
+ struct sk_buff_head list;
+
+ skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
+ 0, l->addr, link_own_addr(l), 0, 0, 0);
+ if (!skb)
+ return;
+ __skb_queue_head_init(&list);
+ __skb_queue_tail(&list, skb);
+ tipc_link_xmit(l, &list, xmitq);
+}
+
+/**
+ * tipc_link_fsm_evt - link finite state machine
+ * @l: pointer to link
+ * @evt: state machine event to be processed
+ * @xmitq: queue to prepend created protocol message, if any
+ */
+static int tipc_link_fsm_evt(struct tipc_link *l, int evt,
+ struct sk_buff_head *xmitq)
+{
+ int mtyp = 0, rc = 0;
+ struct tipc_link *pl;
+ enum {
+ LINK_RESET = 1,
+ LINK_ACTIVATE = (1 << 1),
+ SND_PROBE = (1 << 2),
+ SND_STATE = (1 << 3),
+ SND_RESET = (1 << 4),
+ SND_ACTIVATE = (1 << 5),
+ SND_BCAST_SYNC = (1 << 6)
+ } actions = 0;
+
+ if (l->exec_mode == TIPC_LINK_BLOCKED)
+ return rc;
+
+ switch (l->state) {
+ case TIPC_LINK_WORKING:
+ switch (evt) {
+ case TRAFFIC_EVT:
+ case ACTIVATE_EVT:
+ break;
+ case SILENCE_EVT:
+ l->state = TIPC_LINK_PROBING;
+ actions |= SND_PROBE;
+ break;
+ case PEER_RESET_EVT:
+ actions |= LINK_RESET | SND_ACTIVATE;
+ break;
+ default:
+ pr_debug("%s%u WORKING\n", link_unk_evt, evt);
+ }
+ break;
+ case TIPC_LINK_PROBING:
+ switch (evt) {
+ case TRAFFIC_EVT:
+ case ACTIVATE_EVT:
+ l->state = TIPC_LINK_WORKING;
+ break;
+ case PEER_RESET_EVT:
+ actions |= LINK_RESET | SND_ACTIVATE;
+ break;
+ case SILENCE_EVT:
+ if (l->silent_intv_cnt <= l->abort_limit) {
+ actions |= SND_PROBE;
+ break;
+ }
+ actions |= LINK_RESET | SND_RESET;
+ break;
+ default:
+ pr_err("%s%u PROBING\n", link_unk_evt, evt);
+ }
+ break;
+ case TIPC_LINK_RESETTING:
+ switch (evt) {
+ case TRAFFIC_EVT:
+ break;
+ case ACTIVATE_EVT:
+ pl = node_active_link(l->owner, 0);
+ if (pl && link_probing(pl))
+ break;
+ actions |= LINK_ACTIVATE;
+ if (!l->owner->working_links)
+ actions |= SND_BCAST_SYNC;
+ break;
+ case PEER_RESET_EVT:
+ l->state = TIPC_LINK_ESTABLISHING;
+ actions |= SND_ACTIVATE;
+ break;
+ case SILENCE_EVT:
+ actions |= SND_RESET;
+ break;
+ default:
+ pr_err("%s%u in RESETTING\n", link_unk_evt, evt);
+ }
+ break;
+ case TIPC_LINK_ESTABLISHING:
+ switch (evt) {
+ case TRAFFIC_EVT:
+ case ACTIVATE_EVT:
+ pl = node_active_link(l->owner, 0);
+ if (pl && link_probing(pl))
+ break;
+ actions |= LINK_ACTIVATE;
+ if (!l->owner->working_links)
+ actions |= SND_BCAST_SYNC;
+ break;
+ case PEER_RESET_EVT:
+ break;
+ case SILENCE_EVT:
+ actions |= SND_ACTIVATE;
+ break;
+ default:
+ pr_err("%s%u ESTABLISHING\n", link_unk_evt, evt);
+ }
+ break;
+ default:
+ pr_err("Unknown link state %u/%u\n", l->state, evt);
+ }
+
+ /* Perform actions as decided by FSM */
+ if (actions & LINK_RESET) {
+ l->exec_mode = TIPC_LINK_BLOCKED;
+ rc |= TIPC_LINK_DOWN_EVT;
+ }
+ if (actions & LINK_ACTIVATE) {
+ l->exec_mode = TIPC_LINK_OPEN;
+ rc |= TIPC_LINK_UP_EVT;
+ }
+ if (actions & (SND_STATE | SND_PROBE))
+ mtyp = STATE_MSG;
+ if (actions & SND_RESET)
+ mtyp = RESET_MSG;
+ if (actions & SND_ACTIVATE)
+ mtyp = ACTIVATE_MSG;
+ if (actions & (SND_PROBE | SND_STATE | SND_RESET | SND_ACTIVATE))
+ tipc_link_build_proto_msg(l, mtyp, actions & SND_PROBE,
+ 0, 0, 0, xmitq);
+ if (actions & SND_BCAST_SYNC)
+ tipc_link_build_bcast_sync_msg(l, xmitq);
+ return rc;
+}
+
+/* link_profile_stats - update statistical profiling of traffic
+ */
+static void link_profile_stats(struct tipc_link *l)
+{
+ struct sk_buff *skb;
+ struct tipc_msg *msg;
+ int length;
+
+ /* Update counters used in statistical profiling of send traffic */
+ l->stats.accu_queue_sz += skb_queue_len(&l->transmq);
+ l->stats.queue_sz_counts++;
+
+ skb = skb_peek(&l->transmq);
+ if (!skb)
+ return;
+ msg = buf_msg(skb);
+ length = msg_size(msg);
+
+ if (msg_user(msg) == MSG_FRAGMENTER) {
+ if (msg_type(msg) != FIRST_FRAGMENT)
+ return;
+ length = msg_size(msg_get_wrapped(msg));
+ }
+ l->stats.msg_lengths_total += length;
+ l->stats.msg_length_counts++;
+ if (length <= 64)
+ l->stats.msg_length_profile[0]++;
+ else if (length <= 256)
+ l->stats.msg_length_profile[1]++;
+ else if (length <= 1024)
+ l->stats.msg_length_profile[2]++;
+ else if (length <= 4096)
+ l->stats.msg_length_profile[3]++;
+ else if (length <= 16384)
+ l->stats.msg_length_profile[4]++;
+ else if (length <= 32768)
+ l->stats.msg_length_profile[5]++;
+ else
+ l->stats.msg_length_profile[6]++;
+}
+
+/* tipc_link_timeout - perform periodic task as instructed from node timeout
+ */
+int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
+{
+ int rc = 0;
+
+ link_profile_stats(l);
+ if (l->silent_intv_cnt)
+ rc = tipc_link_fsm_evt(l, SILENCE_EVT, xmitq);
+ else if (link_working(l) && tipc_bclink_acks_missing(l->owner))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq);
+ l->silent_intv_cnt++;
+ return rc;
+}
+
/**
* link_schedule_user - schedule a message sender for wakeup after congestion
* @link: congested link
* @list: message that was attempted sent
* Create pseudo msg to send back to user when congestion abates
- * Only consumes message if there is an error
+ * Does not consume buffer list
*/
static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
{
@@ -347,8 +509,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
/* This really cannot happen... */
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
- tipc_link_reset(link);
- goto err;
+ return -ENOBUFS;
}
/* Non-blocking sender: */
if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
@@ -358,15 +519,12 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
addr, addr, oport, 0, 0);
if (!skb)
- goto err;
+ return -ENOBUFS;
TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
TIPC_SKB_CB(skb)->chain_imp = imp;
skb_queue_tail(&link->wakeupq, skb);
link->stats.link_congs++;
return -ELINKCONG;
-err:
- __skb_queue_purge(list);
- return -ENOBUFS;
}
/**
@@ -388,8 +546,8 @@ void link_prepare_wakeup(struct tipc_link *l)
if ((pnd[imp] + l->backlog[imp].len) >= lim)
break;
skb_unlink(skb, &l->wakeupq);
- skb_queue_tail(&l->inputq, skb);
- l->owner->inputq = &l->inputq;
+ skb_queue_tail(l->inputq, skb);
+ l->owner->inputq = l->inputq;
l->owner->action_flags |= TIPC_MSG_EVT;
}
}
@@ -436,21 +594,22 @@ void tipc_link_reset(struct tipc_link *l_ptr)
msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff));
/* Link is down, accept any session */
- l_ptr->peer_session = INVALID_SESSION;
+ l_ptr->peer_session = WILDCARD_SESSION;
/* Prepare for renewed mtu size negotiation */
l_ptr->mtu = l_ptr->advertised_mtu;
- l_ptr->state = RESET_UNKNOWN;
+ l_ptr->state = TIPC_LINK_RESETTING;
- if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET))
+ if ((prev_state == TIPC_LINK_RESETTING) ||
+ (prev_state == TIPC_LINK_ESTABLISHING))
return;
- tipc_node_link_down(l_ptr->owner, l_ptr);
+ tipc_node_link_down(l_ptr->owner, l_ptr->bearer_id);
tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr);
if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) {
- l_ptr->flags |= LINK_FAILINGOVER;
+ l_ptr->exec_mode = TIPC_LINK_BLOCKED;
l_ptr->failover_checkpt = l_ptr->rcv_nxt;
pl->failover_pkts = FIRST_FAILOVER;
pl->failover_checkpt = l_ptr->rcv_nxt;
@@ -462,7 +621,7 @@ void tipc_link_reset(struct tipc_link *l_ptr)
__skb_queue_purge(&l_ptr->transmq);
__skb_queue_purge(&l_ptr->deferdq);
if (!owner->inputq)
- owner->inputq = &l_ptr->inputq;
+ owner->inputq = l_ptr->inputq;
skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq);
if (!skb_queue_empty(owner->inputq))
owner->action_flags |= TIPC_MSG_EVT;
@@ -470,173 +629,32 @@ void tipc_link_reset(struct tipc_link *l_ptr)
l_ptr->reasm_buf = NULL;
l_ptr->rcv_unacked = 0;
l_ptr->snd_nxt = 1;
+ l_ptr->rcv_nxt = 1;
l_ptr->silent_intv_cnt = 0;
+ l_ptr->stats.recv_info = 0;
l_ptr->stale_count = 0;
link_reset_statistics(l_ptr);
}
-static void link_activate(struct tipc_link *link)
+void tipc_link_activate(struct tipc_link *link)
{
struct tipc_node *node = link->owner;
link->rcv_nxt = 1;
link->stats.recv_info = 1;
link->silent_intv_cnt = 0;
- tipc_node_link_up(node, link);
+ link->state = TIPC_LINK_WORKING;
+ link->exec_mode = TIPC_LINK_OPEN;
+ tipc_node_link_up(node, link->bearer_id);
tipc_bearer_add_dest(node->net, link->bearer_id, link->addr);
}
/**
- * link_state_event - link finite state machine
- * @l_ptr: pointer to link
- * @event: state machine event to process
- */
-static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
-{
- struct tipc_link *other;
- unsigned long timer_intv = l_ptr->keepalive_intv;
-
- if (l_ptr->flags & LINK_STOPPED)
- return;
-
- if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT))
- return; /* Not yet. */
-
- if (l_ptr->flags & LINK_FAILINGOVER)
- return;
-
- switch (l_ptr->state) {
- case WORKING_WORKING:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- l_ptr->silent_intv_cnt = 0;
- break;
- case SILENCE_EVT:
- if (!l_ptr->silent_intv_cnt) {
- if (tipc_bclink_acks_missing(l_ptr->owner))
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 0, 0, 0, 0);
- break;
- }
- l_ptr->state = WORKING_UNKNOWN;
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- break;
- case RESET_MSG:
- pr_debug("%s<%s>, requested by peer\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_RESET;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- break;
- default:
- pr_debug("%s%u in WW state\n", link_unk_evt, event);
- }
- break;
- case WORKING_UNKNOWN:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- l_ptr->state = WORKING_WORKING;
- l_ptr->silent_intv_cnt = 0;
- break;
- case RESET_MSG:
- pr_debug("%s<%s>, requested by peer while probing\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_RESET;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- break;
- case SILENCE_EVT:
- if (!l_ptr->silent_intv_cnt) {
- l_ptr->state = WORKING_WORKING;
- if (tipc_bclink_acks_missing(l_ptr->owner))
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 0, 0, 0, 0);
- } else if (l_ptr->silent_intv_cnt <
- l_ptr->abort_limit) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG,
- 1, 0, 0, 0);
- } else { /* Link has failed */
- pr_debug("%s<%s>, peer not responding\n",
- link_rst_msg, l_ptr->name);
- tipc_link_reset(l_ptr);
- l_ptr->state = RESET_UNKNOWN;
- tipc_link_proto_xmit(l_ptr, RESET_MSG,
- 0, 0, 0, 0);
- }
- break;
- default:
- pr_err("%s%u in WU state\n", link_unk_evt, event);
- }
- break;
- case RESET_UNKNOWN:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- break;
- case ACTIVATE_MSG:
- other = l_ptr->owner->active_links[0];
- if (other && link_working_unknown(other))
- break;
- l_ptr->state = WORKING_WORKING;
- link_activate(l_ptr);
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- if (l_ptr->owner->working_links == 1)
- tipc_link_sync_xmit(l_ptr);
- break;
- case RESET_MSG:
- l_ptr->state = RESET_RESET;
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 1, 0, 0, 0);
- break;
- case STARTING_EVT:
- l_ptr->flags |= LINK_STARTED;
- link_set_timer(l_ptr, timer_intv);
- break;
- case SILENCE_EVT:
- tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0);
- break;
- default:
- pr_err("%s%u in RU state\n", link_unk_evt, event);
- }
- break;
- case RESET_RESET:
- switch (event) {
- case TRAFFIC_MSG_EVT:
- case ACTIVATE_MSG:
- other = l_ptr->owner->active_links[0];
- if (other && link_working_unknown(other))
- break;
- l_ptr->state = WORKING_WORKING;
- link_activate(l_ptr);
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
- if (l_ptr->owner->working_links == 1)
- tipc_link_sync_xmit(l_ptr);
- break;
- case RESET_MSG:
- break;
- case SILENCE_EVT:
- tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
- 0, 0, 0, 0);
- break;
- default:
- pr_err("%s%u in RR state\n", link_unk_evt, event);
- }
- break;
- default:
- pr_err("Unknown link state %u/%u\n", l_ptr->state, event);
- }
-}
-
-/**
* __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked
* @link: link to use
* @list: chain of buffers containing message
*
- * Consumes the buffer chain, except when returning -ELINKCONG,
- * since the caller then may want to make more send attempts.
+ * Consumes the buffer chain, except when returning an error code,
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
@@ -660,10 +678,9 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link,
if (unlikely(link->backlog[i].len >= link->backlog[i].limit))
return link_schedule_user(link, list);
}
- if (unlikely(msg_size(msg) > mtu)) {
- __skb_queue_purge(list);
+ if (unlikely(msg_size(msg) > mtu))
return -EMSGSIZE;
- }
+
/* Prepare each packet for sending, and add to relevant queue: */
while (skb_queue_len(list)) {
skb = skb_peek(list);
@@ -700,101 +717,90 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link,
return 0;
}
-static void skb2list(struct sk_buff *skb, struct sk_buff_head *list)
-{
- skb_queue_head_init(list);
- __skb_queue_tail(list, skb);
-}
-
-static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb)
-{
- struct sk_buff_head head;
-
- skb2list(skb, &head);
- return __tipc_link_xmit(link->owner->net, link, &head);
-}
-
-/* tipc_link_xmit_skb(): send single buffer to destination
- * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE
- * messages, which will not be rejected
- * The only exception is datagram messages rerouted after secondary
- * lookup, which are rare and safe to dispose of anyway.
- * TODO: Return real return value, and let callers use
- * tipc_wait_for_sendpkt() where applicable
- */
-int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
- u32 selector)
-{
- struct sk_buff_head head;
- int rc;
-
- skb2list(skb, &head);
- rc = tipc_link_xmit(net, &head, dnode, selector);
- if (rc == -ELINKCONG)
- kfree_skb(skb);
- return 0;
-}
-
/**
- * tipc_link_xmit() is the general link level function for message sending
- * @net: the applicable net namespace
+ * tipc_link_xmit(): enqueue buffer list according to queue situation
+ * @link: link to use
* @list: chain of buffers containing message
- * @dsz: amount of user data to be sent
- * @dnode: address of destination node
- * @selector: a number used for deterministic link selection
- * Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * @xmitq: returned list of packets to be sent by caller
+ *
+ * Consumes the buffer chain, except when returning -ELINKCONG,
+ * since the caller then may want to make more send attempts.
+ * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
+ * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
-int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
- u32 selector)
+int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
+ struct sk_buff_head *xmitq)
{
- struct tipc_link *link = NULL;
- struct tipc_node *node;
- int rc = -EHOSTUNREACH;
+ struct tipc_msg *hdr = buf_msg(skb_peek(list));
+ unsigned int maxwin = l->window;
+ unsigned int i, imp = msg_importance(hdr);
+ unsigned int mtu = l->mtu;
+ u16 ack = l->rcv_nxt - 1;
+ u16 seqno = l->snd_nxt;
+ u16 bc_last_in = l->owner->bclink.last_in;
+ struct sk_buff_head *transmq = &l->transmq;
+ struct sk_buff_head *backlogq = &l->backlogq;
+ struct sk_buff *skb, *_skb, *bskb;
- node = tipc_node_find(net, dnode);
- if (node) {
- tipc_node_lock(node);
- link = node->active_links[selector & 1];
- if (link)
- rc = __tipc_link_xmit(net, link, list);
- tipc_node_unlock(node);
- tipc_node_put(node);
+ /* Match msg importance against this and all higher backlog limits: */
+ for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
+ if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
+ return link_schedule_user(l, list);
}
- if (link)
- return rc;
+ if (unlikely(msg_size(hdr) > mtu))
+ return -EMSGSIZE;
- if (likely(in_own_node(net, dnode))) {
- tipc_sk_rcv(net, list);
- return 0;
- }
+ /* Prepare each packet for sending, and add to relevant queue: */
+ while (skb_queue_len(list)) {
+ skb = skb_peek(list);
+ hdr = buf_msg(skb);
+ msg_set_seqno(hdr, seqno);
+ msg_set_ack(hdr, ack);
+ msg_set_bcast_ack(hdr, bc_last_in);
- __skb_queue_purge(list);
- return rc;
+ if (likely(skb_queue_len(transmq) < maxwin)) {
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ return -ENOBUFS;
+ __skb_dequeue(list);
+ __skb_queue_tail(transmq, skb);
+ __skb_queue_tail(xmitq, _skb);
+ l->rcv_unacked = 0;
+ seqno++;
+ continue;
+ }
+ if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) {
+ kfree_skb(__skb_dequeue(list));
+ l->stats.sent_bundled++;
+ continue;
+ }
+ if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) {
+ kfree_skb(__skb_dequeue(list));
+ __skb_queue_tail(backlogq, bskb);
+ l->backlog[msg_importance(buf_msg(bskb))].len++;
+ l->stats.sent_bundled++;
+ l->stats.sent_bundles++;
+ continue;
+ }
+ l->backlog[imp].len += skb_queue_len(list);
+ skb_queue_splice_tail_init(list, backlogq);
+ }
+ l->snd_nxt = seqno;
+ return 0;
}
-/*
- * tipc_link_sync_xmit - synchronize broadcast link endpoints.
- *
- * Give a newly added peer node the sequence number where it should
- * start receiving and acking broadcast packets.
- *
- * Called with node locked
- */
-static void tipc_link_sync_xmit(struct tipc_link *link)
+static void skb2list(struct sk_buff *skb, struct sk_buff_head *list)
{
- struct sk_buff *skb;
- struct tipc_msg *msg;
+ skb_queue_head_init(list);
+ __skb_queue_tail(list, skb);
+}
- skb = tipc_buf_acquire(INT_H_SIZE);
- if (!skb)
- return;
+static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb)
+{
+ struct sk_buff_head head;
- msg = buf_msg(skb);
- tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG,
- INT_H_SIZE, link->addr);
- msg_set_last_bcast(msg, link->owner->bclink.acked);
- __tipc_link_xmit_skb(link, skb);
+ skb2list(skb, &head);
+ return __tipc_link_xmit(link->owner->net, link, &head);
}
/*
@@ -847,6 +853,34 @@ void tipc_link_push_packets(struct tipc_link *link)
link->snd_nxt = seqno;
}
+void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
+{
+ struct sk_buff *skb, *_skb;
+ struct tipc_msg *hdr;
+ u16 seqno = l->snd_nxt;
+ u16 ack = l->rcv_nxt - 1;
+
+ while (skb_queue_len(&l->transmq) < l->window) {
+ skb = skb_peek(&l->backlogq);
+ if (!skb)
+ break;
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb)
+ break;
+ __skb_dequeue(&l->backlogq);
+ hdr = buf_msg(skb);
+ l->backlog[msg_importance(hdr)].len--;
+ __skb_queue_tail(&l->transmq, skb);
+ __skb_queue_tail(xmitq, _skb);
+ msg_set_ack(hdr, ack);
+ msg_set_seqno(hdr, seqno);
+ msg_set_bcast_ack(hdr, l->owner->bclink.last_in);
+ l->rcv_unacked = 0;
+ seqno++;
+ }
+ l->snd_nxt = seqno;
+}
+
void tipc_link_reset_all(struct tipc_node *node)
{
char addr_string[16];
@@ -858,9 +892,9 @@ void tipc_link_reset_all(struct tipc_node *node)
tipc_addr_string_fill(addr_string, node->addr));
for (i = 0; i < MAX_BEARERS; i++) {
- if (node->links[i]) {
- link_print(node->links[i], "Resetting link\n");
- tipc_link_reset(node->links[i]);
+ if (node->links[i].link) {
+ link_print(node->links[i].link, "Resetting link\n");
+ tipc_link_reset(node->links[i].link);
}
}
@@ -877,9 +911,13 @@ static void link_retransmit_failure(struct tipc_link *l_ptr,
if (l_ptr->addr) {
/* Handle failure on standard link */
- link_print(l_ptr, "Resetting link\n");
+ link_print(l_ptr, "Resetting link ");
+ pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
+ msg_user(msg), msg_type(msg), msg_size(msg),
+ msg_errcode(msg));
+ pr_info("sqno %u, prev: %x, src: %x\n",
+ msg_seqno(msg), msg_prevnode(msg), msg_orignode(msg));
tipc_link_reset(l_ptr);
-
} else {
/* Handle failure on broadcast link */
struct tipc_node *n_ptr;
@@ -940,6 +978,41 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb,
}
}
+static int tipc_link_retransm(struct tipc_link *l, int retransm,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
+ struct tipc_msg *hdr;
+
+ if (!skb)
+ return 0;
+
+ /* Detect repeated retransmit failures on same packet */
+ if (likely(l->last_retransm != buf_seqno(skb))) {
+ l->last_retransm = buf_seqno(skb);
+ l->stale_count = 1;
+ } else if (++l->stale_count > 100) {
+ link_retransmit_failure(l, skb);
+ return TIPC_LINK_DOWN_EVT;
+ }
+ skb_queue_walk(&l->transmq, skb) {
+ if (!retransm)
+ return 0;
+ hdr = buf_msg(skb);
+ _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
+ if (!_skb)
+ return 0;
+ hdr = buf_msg(_skb);
+ msg_set_ack(hdr, l->rcv_nxt - 1);
+ msg_set_bcast_ack(hdr, l->owner->bclink.last_in);
+ _skb->priority = TC_PRIO_CONTROL;
+ __skb_queue_tail(xmitq, _skb);
+ retransm--;
+ l->stats.retransmitted++;
+ }
+ return 0;
+}
+
/* link_synch(): check if all packets arrived before the synch
* point have been consumed
* Returns true if the parallel links are synched, otherwise false
@@ -959,168 +1032,13 @@ static bool link_synch(struct tipc_link *l)
/* Is it still in the input queue ? */
post_synch = mod(pl->rcv_nxt - l->synch_point) - 1;
- if (skb_queue_len(&pl->inputq) > post_synch)
+ if (skb_queue_len(pl->inputq) > post_synch)
return false;
synched:
- l->flags &= ~LINK_SYNCHING;
+ l->exec_mode = TIPC_LINK_OPEN;
return true;
}
-static void link_retrieve_defq(struct tipc_link *link,
- struct sk_buff_head *list)
-{
- u16 seq_no;
-
- if (skb_queue_empty(&link->deferdq))
- return;
-
- seq_no = buf_seqno(skb_peek(&link->deferdq));
- if (seq_no == link->rcv_nxt)
- skb_queue_splice_tail_init(&link->deferdq, list);
-}
-
-/**
- * tipc_rcv - process TIPC packets/messages arriving from off-node
- * @net: the applicable net namespace
- * @skb: TIPC packet
- * @b_ptr: pointer to bearer message arrived on
- *
- * Invoked with no locks held. Bearer pointer must point to a valid bearer
- * structure (i.e. cannot be NULL), but bearer can be inactive.
- */
-void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct sk_buff_head head;
- struct tipc_node *n_ptr;
- struct tipc_link *l_ptr;
- struct sk_buff *skb1, *tmp;
- struct tipc_msg *msg;
- u16 seq_no;
- u16 ackd;
- u32 released;
-
- skb2list(skb, &head);
-
- while ((skb = __skb_dequeue(&head))) {
- /* Ensure message is well-formed */
- if (unlikely(!tipc_msg_validate(skb)))
- goto discard;
-
- /* Handle arrival of a non-unicast link message */
- msg = buf_msg(skb);
- if (unlikely(msg_non_seq(msg))) {
- if (msg_user(msg) == LINK_CONFIG)
- tipc_disc_rcv(net, skb, b_ptr);
- else
- tipc_bclink_rcv(net, skb);
- continue;
- }
-
- /* Discard unicast link messages destined for another node */
- if (unlikely(!msg_short(msg) &&
- (msg_destnode(msg) != tn->own_addr)))
- goto discard;
-
- /* Locate neighboring node that sent message */
- n_ptr = tipc_node_find(net, msg_prevnode(msg));
- if (unlikely(!n_ptr))
- goto discard;
-
- tipc_node_lock(n_ptr);
- /* Locate unicast link endpoint that should handle message */
- l_ptr = n_ptr->links[b_ptr->identity];
- if (unlikely(!l_ptr))
- goto unlock;
-
- /* Verify that communication with node is currently allowed */
- if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) &&
- msg_user(msg) == LINK_PROTOCOL &&
- (msg_type(msg) == RESET_MSG ||
- msg_type(msg) == ACTIVATE_MSG) &&
- !msg_redundant_link(msg))
- n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN;
-
- if (tipc_node_blocked(n_ptr))
- goto unlock;
-
- /* Validate message sequence number info */
- seq_no = msg_seqno(msg);
- ackd = msg_ack(msg);
-
- /* Release acked messages */
- if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg)))
- tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg));
-
- released = 0;
- skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) {
- if (more(buf_seqno(skb1), ackd))
- break;
- __skb_unlink(skb1, &l_ptr->transmq);
- kfree_skb(skb1);
- released = 1;
- }
-
- /* Try sending any messages link endpoint has pending */
- if (unlikely(skb_queue_len(&l_ptr->backlogq)))
- tipc_link_push_packets(l_ptr);
-
- if (released && !skb_queue_empty(&l_ptr->wakeupq))
- link_prepare_wakeup(l_ptr);
-
- /* Process the incoming packet */
- if (unlikely(!link_working_working(l_ptr))) {
- if (msg_user(msg) == LINK_PROTOCOL) {
- tipc_link_proto_rcv(l_ptr, skb);
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
- }
-
- /* Traffic message. Conditionally activate link */
- link_state_event(l_ptr, TRAFFIC_MSG_EVT);
-
- if (link_working_working(l_ptr)) {
- /* Re-insert buffer in front of queue */
- __skb_queue_head(&head, skb);
- skb = NULL;
- goto unlock;
- }
- goto unlock;
- }
-
- /* Link is now in state WORKING_WORKING */
- if (unlikely(seq_no != l_ptr->rcv_nxt)) {
- link_handle_out_of_seq_msg(l_ptr, skb);
- link_retrieve_defq(l_ptr, &head);
- skb = NULL;
- goto unlock;
- }
- l_ptr->silent_intv_cnt = 0;
-
- /* Synchronize with parallel link if applicable */
- if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
- if (!link_synch(l_ptr))
- goto unlock;
- }
- l_ptr->rcv_nxt++;
- if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
- link_retrieve_defq(l_ptr, &head);
- if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) {
- l_ptr->stats.sent_acks++;
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
- }
- tipc_link_input(l_ptr, skb);
- skb = NULL;
-unlock:
- tipc_node_unlock(n_ptr);
- tipc_node_put(n_ptr);
-discard:
- if (unlikely(skb))
- kfree_skb(skb);
- }
-}
-
/* tipc_data_input - deliver data and name distr msgs to upper layer
*
* Consumes buffer if message is of right type
@@ -1138,16 +1056,16 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb)
case TIPC_HIGH_IMPORTANCE:
case TIPC_CRITICAL_IMPORTANCE:
case CONN_MANAGER:
- if (tipc_skb_queue_tail(&link->inputq, skb, dport)) {
- node->inputq = &link->inputq;
+ if (tipc_skb_queue_tail(link->inputq, skb, dport)) {
+ node->inputq = link->inputq;
node->action_flags |= TIPC_MSG_EVT;
}
return true;
case NAME_DISTRIBUTOR:
node->bclink.recv_permitted = true;
- node->namedq = &link->namedq;
- skb_queue_tail(&link->namedq, skb);
- if (skb_queue_len(&link->namedq) == 1)
+ node->namedq = link->namedq;
+ skb_queue_tail(link->namedq, skb);
+ if (skb_queue_len(link->namedq) == 1)
node->action_flags |= TIPC_NAMED_MSG_EVT;
return true;
case MSG_BUNDLER:
@@ -1174,13 +1092,10 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb)
struct sk_buff *iskb;
int pos = 0;
- if (likely(tipc_data_input(link, skb)))
- return;
-
switch (msg_user(msg)) {
case TUNNEL_PROTOCOL:
if (msg_dup(msg)) {
- link->flags |= LINK_SYNCHING;
+ link->exec_mode = TIPC_LINK_TUNNEL;
link->synch_point = msg_seqno(msg_get_wrapped(msg));
kfree_skb(skb);
break;
@@ -1215,6 +1130,110 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb)
};
}
+static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
+{
+ bool released = false;
+ struct sk_buff *skb, *tmp;
+
+ skb_queue_walk_safe(&l->transmq, skb, tmp) {
+ if (more(buf_seqno(skb), acked))
+ break;
+ __skb_unlink(skb, &l->transmq);
+ kfree_skb(skb);
+ released = true;
+ }
+ return released;
+}
+
+/* tipc_link_rcv - process TIPC packets/messages arriving from off-node
+ * @link: the link that should handle the message
+ * @skb: TIPC packet
+ * @xmitq: queue to place packets to be sent after this call
+ */
+int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
+{
+ struct sk_buff_head *arrvq = &l->deferdq;
+ struct sk_buff *tmp;
+ struct tipc_msg *hdr;
+ u16 seqno, rcv_nxt;
+ int rc = 0;
+
+ if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) {
+ if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0,
+ 0, 0, 0, xmitq);
+ return rc;
+ }
+
+ skb_queue_walk_safe(arrvq, skb, tmp) {
+ hdr = buf_msg(skb);
+
+ /* Verify and update link state */
+ if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) {
+ __skb_dequeue(arrvq);
+ rc |= tipc_link_proto_rcv(l, skb, xmitq);
+ continue;
+ }
+
+ if (unlikely(!link_working(l))) {
+ rc |= tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq);
+ if (!link_working(l)) {
+ kfree_skb(__skb_dequeue(arrvq));
+ return rc;
+ }
+ }
+
+ l->silent_intv_cnt = 0;
+
+ /* Forward queues and wake up waiting users */
+ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
+ }
+
+ /* Defer reception if there is a gap in the sequence */
+ seqno = msg_seqno(hdr);
+ rcv_nxt = l->rcv_nxt;
+ if (unlikely(less(rcv_nxt, seqno))) {
+ l->stats.deferred_recv++;
+ return rc;
+ }
+
+ __skb_dequeue(arrvq);
+
+ /* Drop if packet already received */
+ if (unlikely(more(rcv_nxt, seqno))) {
+ l->stats.duplicates++;
+ kfree_skb(skb);
+ return rc;
+ }
+
+ /* Synchronize with parallel link if applicable */
+ if (unlikely(l->exec_mode == TIPC_LINK_TUNNEL))
+ if (!msg_dup(hdr) && !link_synch(l)) {
+ kfree_skb(skb);
+ return rc;
+ }
+
+ /* Packet can be delivered */
+ l->rcv_nxt++;
+ l->stats.recv_info++;
+ if (unlikely(!tipc_data_input(l, skb)))
+ tipc_link_input(l, skb);
+
+ /* Ack at regular intervals */
+ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) {
+ l->rcv_unacked = 0;
+ l->stats.sent_acks++;
+ tipc_link_build_proto_msg(l, STATE_MSG,
+ 0, 0, 0, 0, xmitq);
+ }
+ }
+ return rc;
+}
+
/**
* tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue
*
@@ -1255,235 +1274,85 @@ u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb)
}
/*
- * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet
+ * Send protocol message to the other endpoint.
*/
-static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr,
- struct sk_buff *buf)
+void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg,
+ u32 gap, u32 tolerance, u32 priority)
{
- u32 seq_no = buf_seqno(buf);
-
- if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) {
- tipc_link_proto_rcv(l_ptr, buf);
- return;
- }
-
- /* Record OOS packet arrival */
- l_ptr->silent_intv_cnt = 0;
+ struct sk_buff *skb = NULL;
+ struct sk_buff_head xmitq;
- /*
- * Discard packet if a duplicate; otherwise add it to deferred queue
- * and notify peer of gap as per protocol specification
- */
- if (less(seq_no, l_ptr->rcv_nxt)) {
- l_ptr->stats.duplicates++;
- kfree_skb(buf);
+ __skb_queue_head_init(&xmitq);
+ tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap,
+ tolerance, priority, &xmitq);
+ skb = __skb_dequeue(&xmitq);
+ if (!skb)
return;
- }
-
- if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) {
- l_ptr->stats.deferred_recv++;
- if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1)
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
- } else {
- l_ptr->stats.duplicates++;
- }
+ tipc_bearer_send(l->owner->net, l->bearer_id, skb, &l->media_addr);
+ l->rcv_unacked = 0;
+ kfree_skb(skb);
}
-/*
- * Send protocol message to the other endpoint.
+/* tipc_link_build_proto_msg: prepare link protocol message for transmission
*/
-void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg,
- u32 gap, u32 tolerance, u32 priority)
+static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
+ u16 rcvgap, int tolerance, int priority,
+ struct sk_buff_head *xmitq)
{
- struct sk_buff *buf = NULL;
- struct tipc_msg *msg = l_ptr->pmsg;
- u32 msg_size = sizeof(l_ptr->proto_msg);
- int r_flag;
- u16 last_rcv;
-
- /* Don't send protocol message during link failover */
- if (l_ptr->flags & LINK_FAILINGOVER)
- return;
-
- /* Abort non-RESET send if communication with node is prohibited */
- if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG))
+ struct sk_buff *skb = NULL;
+ struct tipc_msg *hdr = l->pmsg;
+ u16 snd_nxt = l->snd_nxt;
+ u16 rcv_nxt = l->rcv_nxt;
+ u16 rcv_last = rcv_nxt - 1;
+ int node_up = l->owner->bclink.recv_permitted;
+
+ /* Don't send protocol message during reset or link failover */
+ if (l->exec_mode == TIPC_LINK_BLOCKED)
return;
- /* Create protocol message with "out-of-sequence" sequence number */
- msg_set_type(msg, msg_typ);
- msg_set_net_plane(msg, l_ptr->net_plane);
- msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
- msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net));
+ msg_set_type(hdr, mtyp);
+ msg_set_net_plane(hdr, l->net_plane);
+ msg_set_bcast_ack(hdr, l->owner->bclink.last_in);
+ msg_set_last_bcast(hdr, tipc_bclink_get_last_sent(l->owner->net));
+ msg_set_link_tolerance(hdr, tolerance);
+ msg_set_linkprio(hdr, priority);
+ msg_set_redundant_link(hdr, node_up);
+ msg_set_seq_gap(hdr, 0);
- if (msg_typ == STATE_MSG) {
- u16 next_sent = l_ptr->snd_nxt;
+ /* Compatibility: created msg must not be in sequence with pkt flow */
+ msg_set_seqno(hdr, snd_nxt + U16_MAX / 2);
- if (!tipc_link_is_up(l_ptr))
+ if (mtyp == STATE_MSG) {
+ if (!tipc_link_is_up(l))
return;
- msg_set_next_sent(msg, next_sent);
- if (!skb_queue_empty(&l_ptr->deferdq)) {
- last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq));
- gap = mod(last_rcv - l_ptr->rcv_nxt);
+ msg_set_next_sent(hdr, snd_nxt);
+
+ /* Override rcvgap if there are packets in deferred queue */
+ if (!skb_queue_empty(&l->deferdq))
+ rcvgap = buf_seqno(skb_peek(&l->deferdq)) - rcv_nxt;
+ if (rcvgap) {
+ msg_set_seq_gap(hdr, rcvgap);
+ l->stats.sent_nacks++;
}
- msg_set_seq_gap(msg, gap);
- if (gap)
- l_ptr->stats.sent_nacks++;
- msg_set_link_tolerance(msg, tolerance);
- msg_set_linkprio(msg, priority);
- msg_set_max_pkt(msg, l_ptr->mtu);
- msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1));
- msg_set_probe(msg, probe_msg != 0);
- if (probe_msg)
- l_ptr->stats.sent_probes++;
- l_ptr->stats.sent_states++;
- } else { /* RESET_MSG or ACTIVATE_MSG */
- msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1));
- msg_set_seq_gap(msg, 0);
- msg_set_next_sent(msg, 1);
- msg_set_probe(msg, 0);
- msg_set_link_tolerance(msg, l_ptr->tolerance);
- msg_set_linkprio(msg, l_ptr->priority);
- msg_set_max_pkt(msg, l_ptr->advertised_mtu);
+ msg_set_ack(hdr, rcv_last);
+ msg_set_probe(hdr, probe);
+ if (probe)
+ l->stats.sent_probes++;
+ l->stats.sent_states++;
+ } else {
+ /* RESET_MSG or ACTIVATE_MSG */
+ msg_set_max_pkt(hdr, l->advertised_mtu);
+ msg_set_ack(hdr, l->failover_checkpt - 1);
+ msg_set_next_sent(hdr, 1);
}
-
- r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr));
- msg_set_redundant_link(msg, r_flag);
- msg_set_linkprio(msg, l_ptr->priority);
- msg_set_size(msg, msg_size);
-
- msg_set_seqno(msg, mod(l_ptr->snd_nxt + (0xffff / 2)));
-
- buf = tipc_buf_acquire(msg_size);
- if (!buf)
+ skb = tipc_buf_acquire(msg_size(hdr));
+ if (!skb)
return;
-
- skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
- buf->priority = TC_PRIO_CONTROL;
- tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf,
- &l_ptr->media_addr);
- l_ptr->rcv_unacked = 0;
- kfree_skb(buf);
+ skb_copy_to_linear_data(skb, hdr, msg_size(hdr));
+ skb->priority = TC_PRIO_CONTROL;
+ __skb_queue_head(xmitq, skb);
}
-/*
- * Receive protocol message :
- * Note that network plane id propagates through the network, and may
- * change at any time. The node with lowest address rules
- */
-static void tipc_link_proto_rcv(struct tipc_link *l_ptr,
- struct sk_buff *buf)
-{
- u32 rec_gap = 0;
- u32 msg_tol;
- struct tipc_msg *msg = buf_msg(buf);
-
- if (l_ptr->flags & LINK_FAILINGOVER)
- goto exit;
-
- if (l_ptr->net_plane != msg_net_plane(msg))
- if (link_own_addr(l_ptr) > msg_prevnode(msg))
- l_ptr->net_plane = msg_net_plane(msg);
-
- switch (msg_type(msg)) {
-
- case RESET_MSG:
- if (!link_working_unknown(l_ptr) &&
- (l_ptr->peer_session != INVALID_SESSION)) {
- if (less_eq(msg_session(msg), l_ptr->peer_session))
- break; /* duplicate or old reset: ignore */
- }
-
- if (!msg_redundant_link(msg) && (link_working_working(l_ptr) ||
- link_working_unknown(l_ptr))) {
- /*
- * peer has lost contact -- don't allow peer's links
- * to reactivate before we recognize loss & clean up
- */
- l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN;
- }
-
- link_state_event(l_ptr, RESET_MSG);
-
- /* fall thru' */
- case ACTIVATE_MSG:
- /* Update link settings according other endpoint's values */
- strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg));
-
- msg_tol = msg_link_tolerance(msg);
- if (msg_tol > l_ptr->tolerance)
- link_set_supervision_props(l_ptr, msg_tol);
-
- if (msg_linkprio(msg) > l_ptr->priority)
- l_ptr->priority = msg_linkprio(msg);
-
- if (l_ptr->mtu > msg_max_pkt(msg))
- l_ptr->mtu = msg_max_pkt(msg);
-
- /* Synchronize broadcast link info, if not done previously */
- if (!tipc_node_is_up(l_ptr->owner)) {
- l_ptr->owner->bclink.last_sent =
- l_ptr->owner->bclink.last_in =
- msg_last_bcast(msg);
- l_ptr->owner->bclink.oos_state = 0;
- }
-
- l_ptr->peer_session = msg_session(msg);
- l_ptr->peer_bearer_id = msg_bearer_id(msg);
-
- if (msg_type(msg) == ACTIVATE_MSG)
- link_state_event(l_ptr, ACTIVATE_MSG);
- break;
- case STATE_MSG:
-
- msg_tol = msg_link_tolerance(msg);
- if (msg_tol)
- link_set_supervision_props(l_ptr, msg_tol);
-
- if (msg_linkprio(msg) &&
- (msg_linkprio(msg) != l_ptr->priority)) {
- pr_debug("%s<%s>, priority change %u->%u\n",
- link_rst_msg, l_ptr->name,
- l_ptr->priority, msg_linkprio(msg));
- l_ptr->priority = msg_linkprio(msg);
- tipc_link_reset(l_ptr); /* Enforce change to take effect */
- break;
- }
-
- /* Record reception; force mismatch at next timeout: */
- l_ptr->silent_intv_cnt = 0;
-
- link_state_event(l_ptr, TRAFFIC_MSG_EVT);
- l_ptr->stats.recv_states++;
- if (link_reset_unknown(l_ptr))
- break;
-
- if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg)))
- rec_gap = mod(msg_next_sent(msg) - l_ptr->rcv_nxt);
-
- if (msg_probe(msg))
- l_ptr->stats.recv_probes++;
-
- /* Protocol message before retransmits, reduce loss risk */
- if (l_ptr->owner->bclink.recv_permitted)
- tipc_bclink_update_link_state(l_ptr->owner,
- msg_last_bcast(msg));
-
- if (rec_gap || (msg_probe(msg))) {
- tipc_link_proto_xmit(l_ptr, STATE_MSG, 0,
- rec_gap, 0, 0);
- }
- if (msg_seq_gap(msg)) {
- l_ptr->stats.recv_nacks++;
- tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq),
- msg_seq_gap(msg));
- }
- break;
- }
-exit:
- kfree_skb(buf);
-}
-
-
/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to
* a different bearer. Owner node is locked.
*/
@@ -1496,7 +1365,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr,
struct sk_buff *skb;
u32 length = msg_size(msg);
- tunnel = l_ptr->owner->active_links[selector & 1];
+ tunnel = node_active_link(l_ptr->owner, selector & 1);
if (!tipc_link_is_up(tunnel)) {
pr_warn("%stunnel link no longer available\n", link_co_err);
return;
@@ -1522,7 +1391,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr,
void tipc_link_failover_send_queue(struct tipc_link *l_ptr)
{
int msgcount;
- struct tipc_link *tunnel = l_ptr->owner->active_links[0];
+ struct tipc_link *tunnel = node_active_link(l_ptr->owner, 0);
struct tipc_msg tunnel_hdr;
struct sk_buff *skb;
int split_bundles;
@@ -1556,8 +1425,8 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr)
return;
}
- split_bundles = (l_ptr->owner->active_links[0] !=
- l_ptr->owner->active_links[1]);
+ split_bundles = (node_active_link(l_ptr->owner, 0) !=
+ node_active_link(l_ptr->owner, 0));
skb_queue_walk(&l_ptr->transmq, skb) {
struct tipc_msg *msg = buf_msg(skb);
@@ -1660,7 +1529,7 @@ static bool tipc_link_failover_rcv(struct tipc_link *link,
if (bearer_id == link->bearer_id)
goto exit;
- pl = link->owner->links[bearer_id];
+ pl = link->owner->links[bearer_id].link;
if (pl && tipc_link_is_up(pl))
tipc_link_reset(pl);
@@ -1691,22 +1560,100 @@ static bool tipc_link_failover_rcv(struct tipc_link *link,
}
exit:
if (!link->failover_pkts && pl)
- pl->flags &= ~LINK_FAILINGOVER;
+ pl->exec_mode = TIPC_LINK_OPEN;
kfree_skb(*skb);
*skb = iskb;
return *skb;
}
-static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol)
+/* tipc_link_proto_rcv(): receive link level protocol message :
+ * Note that network plane id propagates through the network, and may
+ * change at any time. The node with lowest numerical id determines
+ * network plane
+ */
+static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq)
{
- unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u16 rcvgap = 0;
+ u16 nacked_gap = msg_seq_gap(hdr);
+ u16 peers_snd_nxt = msg_next_sent(hdr);
+ u16 peers_tol = msg_link_tolerance(hdr);
+ u16 peers_prio = msg_linkprio(hdr);
+ char *if_name;
+ int rc = 0;
- if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL))
- return;
+ if (l->exec_mode == TIPC_LINK_BLOCKED)
+ goto exit;
+
+ if (link_own_addr(l) > msg_prevnode(hdr))
+ l->net_plane = msg_net_plane(hdr);
+
+ switch (msg_type(hdr)) {
+ case RESET_MSG:
+
+ /* Ignore duplicate RESET with old session number */
+ if ((less_eq(msg_session(hdr), l->peer_session)) &&
+ (l->peer_session != WILDCARD_SESSION))
+ break;
+ /* fall thru' */
+ case ACTIVATE_MSG:
+
+ /* Complete own link name with peer's interface name */
+ if_name = strrchr(l->name, ':') + 1;
+ if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
+ break;
+ if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
+ break;
+ strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME);
+
+ /* Update own tolerance if peer indicates a non-zero value */
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ l->tolerance = peers_tol;
- l_ptr->tolerance = tol;
- l_ptr->keepalive_intv = msecs_to_jiffies(intv);
- l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->keepalive_intv));
+ /* Update own priority if peer's priority is higher */
+ if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
+ l->priority = peers_prio;
+
+ l->peer_session = msg_session(hdr);
+ l->peer_bearer_id = msg_bearer_id(hdr);
+ rc = tipc_link_fsm_evt(l, msg_type(hdr), xmitq);
+ if (l->mtu > msg_max_pkt(hdr))
+ l->mtu = msg_max_pkt(hdr);
+ break;
+ case STATE_MSG:
+ /* Update own tolerance if peer indicates a non-zero value */
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ l->tolerance = peers_tol;
+
+ l->silent_intv_cnt = 0;
+ l->stats.recv_states++;
+ if (msg_probe(hdr))
+ l->stats.recv_probes++;
+ rc = tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq);
+ if (!tipc_link_is_up(l))
+ break;
+
+ /* Has peer sent packets we haven't received yet ? */
+ if (more(peers_snd_nxt, l->rcv_nxt))
+ rcvgap = peers_snd_nxt - l->rcv_nxt;
+ if (rcvgap || (msg_probe(hdr)))
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap,
+ 0, 0, xmitq);
+ tipc_link_release_pkts(l, msg_ack(hdr));
+
+ /* If NACK, retransmit will now start at right position */
+ if (nacked_gap) {
+ rc |= tipc_link_retransm(l, nacked_gap, xmitq);
+ l->stats.recv_nacks++;
+ }
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
+ }
+exit:
+ kfree_skb(skb);
+ return rc;
}
void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
@@ -1743,7 +1690,7 @@ static struct tipc_node *tipc_link_find_owner(struct net *net,
list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
tipc_node_lock(n_ptr);
for (i = 0; i < MAX_BEARERS; i++) {
- l_ptr = n_ptr->links[i];
+ l_ptr = n_ptr->links[i].link;
if (l_ptr && !strcmp(l_ptr->name, link_name)) {
*bearer_id = i;
found_node = n_ptr;
@@ -1770,27 +1717,28 @@ static void link_reset_statistics(struct tipc_link *l_ptr)
l_ptr->stats.recv_info = l_ptr->rcv_nxt;
}
-static void link_print(struct tipc_link *l_ptr, const char *str)
+static void link_print(struct tipc_link *l, const char *str)
{
- struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id);
- struct tipc_bearer *b_ptr;
-
- rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]);
- if (b_ptr)
- pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name);
- rcu_read_unlock();
-
- if (link_working_unknown(l_ptr))
- pr_cont(":WU\n");
- else if (link_reset_reset(l_ptr))
- pr_cont(":RR\n");
- else if (link_reset_unknown(l_ptr))
- pr_cont(":RU\n");
- else if (link_working_working(l_ptr))
- pr_cont(":WW\n");
+ struct sk_buff *hskb = skb_peek(&l->transmq);
+ u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt;
+ u16 tail = l->snd_nxt - 1;
+
+ pr_info("%s Link <%s>:", str, l->name);
+
+ if (link_probing(l))
+ pr_cont(":P\n");
+ else if (link_establishing(l))
+ pr_cont(":E\n");
+ else if (link_resetting(l))
+ pr_cont(":R\n");
+ else if (link_working(l))
+ pr_cont(":W\n");
else
pr_cont("\n");
+
+ pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n",
+ skb_queue_len(&l->transmq), head, tail,
+ skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt);
}
/* Parse and validate nested (link) properties valid for media, bearer and link
@@ -1865,7 +1813,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
res = -EINVAL;
goto out;
@@ -1885,7 +1833,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
u32 tol;
tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
- link_set_supervision_props(link, tol);
+ link->tolerance = tol;
tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0);
}
if (props[TIPC_NLA_PROP_PRIO]) {
@@ -2055,10 +2003,11 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
for (i = *prev_link; i < MAX_BEARERS; i++) {
*prev_link = i;
- if (!node->links[i])
+ if (!node->links[i].link)
continue;
- err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
+ err = __tipc_nl_add_link(net, msg,
+ node->links[i].link, NLM_F_MULTI);
if (err)
return err;
}
@@ -2172,7 +2121,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
tipc_node_unlock(node);
nlmsg_free(msg.skb);
@@ -2227,7 +2176,7 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info)
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (!link) {
tipc_node_unlock(node);
return -EINVAL;
diff --git a/net/tipc/link.h b/net/tipc/link.h
index ae0a0ea572f2..37cfd7d7bf7d 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -49,19 +49,21 @@
*/
#define INVALID_LINK_SEQ 0x10000
-/* Link working states
+
+/* Link endpoint receive states
*/
-#define WORKING_WORKING 560810u
-#define WORKING_UNKNOWN 560811u
-#define RESET_UNKNOWN 560812u
-#define RESET_RESET 560813u
+enum {
+ TIPC_LINK_OPEN,
+ TIPC_LINK_BLOCKED,
+ TIPC_LINK_TUNNEL
+};
-/* Link endpoint execution states
+/* Events returned from link at packet reception or at timeout
*/
-#define LINK_STARTED 0x0001
-#define LINK_STOPPED 0x0002
-#define LINK_SYNCHING 0x0004
-#define LINK_FAILINGOVER 0x0008
+enum {
+ TIPC_LINK_UP_EVT = 1,
+ TIPC_LINK_DOWN_EVT = (1 << 1)
+};
/* Starting value for maximum packet size negotiation on unicast links
* (unless bearer MTU is less)
@@ -106,7 +108,6 @@ struct tipc_stats {
* @timer: link timer
* @owner: pointer to peer node
* @refcnt: reference counter for permanent references (owner node & timer)
- * @flags: execution state flags for link endpoint instance
* @peer_session: link session # being used by peer end of link
* @peer_bearer_id: bearer id used by link's peer endpoint
* @bearer_id: local bearer id used by link
@@ -119,6 +120,7 @@ struct tipc_stats {
* @pmsg: convenience pointer to "proto_msg" field
* @priority: current link priority
* @net_plane: current link network plane ('A' through 'H')
+ * @exec_mode: transmit/receive mode for link endpoint instance
* @backlog_limit: backlog queue congestion thresholds (indexed by importance)
* @exp_msg_count: # of tunnelled messages expected during link changeover
* @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
@@ -144,12 +146,9 @@ struct tipc_link {
u32 addr;
char name[TIPC_MAX_LINK_NAME];
struct tipc_media_addr media_addr;
- struct timer_list timer;
struct tipc_node *owner;
- struct kref ref;
/* Management and link supervision data */
- unsigned int flags;
u32 peer_session;
u32 peer_bearer_id;
u32 bearer_id;
@@ -165,6 +164,7 @@ struct tipc_link {
struct tipc_msg *pmsg;
u32 priority;
char net_plane;
+ u8 exec_mode;
u16 synch_point;
/* Failover */
@@ -192,8 +192,8 @@ struct tipc_link {
u16 rcv_nxt;
u32 rcv_unacked;
struct sk_buff_head deferdq;
- struct sk_buff_head inputq;
- struct sk_buff_head namedq;
+ struct sk_buff_head *inputq;
+ struct sk_buff_head *namedq;
/* Congestion handling */
struct sk_buff_head wakeupq;
@@ -207,9 +207,11 @@ struct tipc_link {
struct tipc_port;
-struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
- struct tipc_bearer *b_ptr,
- const struct tipc_media_addr *media_addr);
+struct tipc_link *tipc_link_create(struct tipc_node *n,
+ struct tipc_bearer *b,
+ const struct tipc_media_addr *maddr,
+ struct sk_buff_head *inputq,
+ struct sk_buff_head *namedq);
void tipc_link_delete(struct tipc_link *link);
void tipc_link_delete_list(struct net *net, unsigned int bearer_id);
void tipc_link_failover_send_queue(struct tipc_link *l_ptr);
@@ -221,12 +223,11 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr);
void tipc_link_purge_backlog(struct tipc_link *l);
void tipc_link_reset_all(struct tipc_node *node);
void tipc_link_reset(struct tipc_link *l_ptr);
-int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
- u32 selector);
-int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest,
- u32 selector);
+void tipc_link_activate(struct tipc_link *link);
int __tipc_link_xmit(struct net *net, struct tipc_link *link,
struct sk_buff_head *list);
+int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list,
+ struct sk_buff_head *xmitq);
void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob,
u32 gap, u32 tolerance, u32 priority);
void tipc_link_push_packets(struct tipc_link *l_ptr);
@@ -243,33 +244,12 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
void link_prepare_wakeup(struct tipc_link *l);
-
+int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq);
+int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
+ struct sk_buff_head *xmitq);
static inline u32 link_own_addr(struct tipc_link *l)
{
return msg_prevnode(l->pmsg);
}
-/*
- * Link status checking routines
- */
-static inline int link_working_working(struct tipc_link *l_ptr)
-{
- return l_ptr->state == WORKING_WORKING;
-}
-
-static inline int link_working_unknown(struct tipc_link *l_ptr)
-{
- return l_ptr->state == WORKING_UNKNOWN;
-}
-
-static inline int link_reset_unknown(struct tipc_link *l_ptr)
-{
- return l_ptr->state == RESET_UNKNOWN;
-}
-
-static inline int link_reset_reset(struct tipc_link *l_ptr)
-{
- return l_ptr->state == RESET_RESET;
-}
-
#endif
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 19c45fb66238..2f1563b47e24 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -38,6 +38,7 @@
#define _TIPC_MSG_H
#include <linux/tipc.h>
+#include "core.h"
/*
* Constants and routines used to read and write TIPC payload message headers
@@ -658,12 +659,12 @@ static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
/*
* Word 5
*/
-static inline u32 msg_session(struct tipc_msg *m)
+static inline u16 msg_session(struct tipc_msg *m)
{
return msg_bits(m, 5, 16, 0xffff);
}
-static inline void msg_set_session(struct tipc_msg *m, u32 n)
+static inline void msg_set_session(struct tipc_msg *m, u16 n)
{
msg_set_bits(m, 5, 16, 0xffff, n);
}
@@ -766,6 +767,22 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
msg_set_bits(m, 9, 0, 0xffff, n);
}
+static inline bool msg_is_traffic(struct tipc_msg *m)
+{
+ if (likely(msg_user(m) != LINK_PROTOCOL))
+ return true;
+ if ((msg_type(m) == RESET_MSG) || (msg_type(m) == ACTIVATE_MSG))
+ return false;
+ return true;
+}
+
+static inline bool msg_peer_is_up(struct tipc_msg *m)
+{
+ if (likely(msg_is_traffic(m)))
+ return false;
+ return msg_redundant_link(m);
+}
+
struct sk_buff *tipc_buf_acquire(u32 size);
bool tipc_msg_validate(struct sk_buff *skb);
bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
@@ -879,4 +896,36 @@ static inline bool tipc_skb_queue_tail(struct sk_buff_head *list,
return rv;
}
+/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
+ * @list: list to be appended to
+ * @skb: buffer to add
+ * Returns true if queue should treated further, otherwise false
+ */
+static inline bool __tipc_skb_queue_sorted(struct sk_buff_head *list,
+ struct sk_buff *skb)
+{
+ struct sk_buff *_skb, *tmp;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u16 seqno = msg_seqno(hdr);
+
+ if (skb_queue_empty(list) || (msg_user(hdr) == LINK_PROTOCOL)) {
+ __skb_queue_head(list, skb);
+ return true;
+ }
+ if (likely(less(seqno, buf_seqno(skb_peek(list))))) {
+ __skb_queue_head(list, skb);
+ return true;
+ }
+ if (!more(seqno, buf_seqno(skb_peek_tail(list)))) {
+ skb_queue_walk_safe(list, _skb, tmp) {
+ if (likely(less(seqno, buf_seqno(_skb)))) {
+ __skb_queue_before(list, _skb, skb);
+ return true;
+ }
+ }
+ }
+ __skb_queue_tail(list, skb);
+ return false;
+}
+
#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 41e7b7e4dda0..e6018b7eb197 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -96,13 +96,13 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb)
dnode = node->addr;
if (in_own_node(net, dnode))
continue;
- if (!tipc_node_active_links(node))
+ if (!tipc_node_is_up(node))
continue;
oskb = pskb_copy(skb, GFP_ATOMIC);
if (!oskb)
break;
msg_set_destnode(buf_msg(oskb), dnode);
- tipc_link_xmit_skb(net, oskb, dnode, dnode);
+ tipc_node_xmit_skb(net, oskb, dnode, dnode);
}
rcu_read_unlock();
@@ -223,7 +223,7 @@ void tipc_named_node_up(struct net *net, u32 dnode)
&tn->nametbl->publ_list[TIPC_ZONE_SCOPE]);
rcu_read_unlock();
- tipc_link_xmit(net, &head, dnode, dnode);
+ tipc_node_xmit(net, &head, dnode, dnode);
}
static void tipc_publ_subscribe(struct net *net, struct publication *publ,
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 0b1d61a5f853..e92f84afbf95 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -40,10 +40,13 @@
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
+#include "discover.h"
static void node_lost_contact(struct tipc_node *n_ptr);
static void node_established_contact(struct tipc_node *n_ptr);
static void tipc_node_delete(struct tipc_node *node);
+static void tipc_node_timeout(unsigned long data);
+static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
struct tipc_sock_conn {
u32 port;
@@ -132,6 +135,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
INIT_LIST_HEAD(&n_ptr->list);
INIT_LIST_HEAD(&n_ptr->publ_list);
INIT_LIST_HEAD(&n_ptr->conn_sks);
+ skb_queue_head_init(&n_ptr->bclink.namedq);
__skb_queue_head_init(&n_ptr->bclink.deferdq);
hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]);
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
@@ -139,14 +143,32 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr)
break;
}
list_add_tail_rcu(&n_ptr->list, &temp_node->list);
- n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN;
+ n_ptr->state = SELF_DOWN_PEER_LEAVING;
n_ptr->signature = INVALID_NODE_SIG;
+ n_ptr->active_links[0] = INVALID_BEARER_ID;
+ n_ptr->active_links[1] = INVALID_BEARER_ID;
tipc_node_get(n_ptr);
+ setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr);
+ n_ptr->keepalive_intv = U32_MAX;
exit:
spin_unlock_bh(&tn->node_list_lock);
return n_ptr;
}
+static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
+{
+ unsigned long tol = l->tolerance;
+ unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
+ unsigned long keepalive_intv = msecs_to_jiffies(intv);
+
+ /* Link with lowest tolerance determines timer interval */
+ if (keepalive_intv < n->keepalive_intv)
+ n->keepalive_intv = keepalive_intv;
+
+ /* Ensure link's abort limit corresponds to current interval */
+ l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv);
+}
+
static void tipc_node_delete(struct tipc_node *node)
{
list_del_rcu(&node->list);
@@ -160,8 +182,11 @@ void tipc_node_stop(struct net *net)
struct tipc_node *node, *t_node;
spin_lock_bh(&tn->node_list_lock);
- list_for_each_entry_safe(node, t_node, &tn->node_list, list)
+ list_for_each_entry_safe(node, t_node, &tn->node_list, list) {
+ if (del_timer(&node->timer))
+ tipc_node_put(node);
tipc_node_put(node);
+ }
spin_unlock_bh(&tn->node_list_lock);
}
@@ -219,131 +244,170 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
tipc_node_put(node);
}
+/* tipc_node_timeout - handle expiration of node timer
+ */
+static void tipc_node_timeout(unsigned long data)
+{
+ struct tipc_node *n = (struct tipc_node *)data;
+ struct sk_buff_head xmitq;
+ struct tipc_link *l;
+ struct tipc_media_addr *maddr;
+ int bearer_id;
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ tipc_node_lock(n);
+ l = n->links[bearer_id].link;
+ if (l) {
+ /* Link tolerance may change asynchronously: */
+ tipc_node_calculate_timer(n, l);
+ rc = tipc_link_timeout(l, &xmitq);
+ if (rc & TIPC_LINK_DOWN_EVT)
+ tipc_link_reset(l);
+ }
+ tipc_node_unlock(n);
+ maddr = &n->links[bearer_id].maddr;
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
+ }
+ if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ tipc_node_get(n);
+ tipc_node_put(n);
+}
+
/**
* tipc_node_link_up - handle addition of link
*
* Link becomes active (alone or shared) or standby, depending on its priority.
*/
-void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+void tipc_node_link_up(struct tipc_node *n, int bearer_id)
{
- struct tipc_link **active = &n_ptr->active_links[0];
+ int *slot0 = &n->active_links[0];
+ int *slot1 = &n->active_links[1];
+ struct tipc_link_entry *links = n->links;
+ struct tipc_link *l = n->links[bearer_id].link;
- n_ptr->working_links++;
- n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP;
- n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+ /* Leave room for tunnel header when returning 'mtu' to users: */
+ links[bearer_id].mtu = l->mtu - INT_H_SIZE;
+
+ n->working_links++;
+ n->action_flags |= TIPC_NOTIFY_LINK_UP;
+ n->link_id = l->peer_bearer_id << 16 | l->bearer_id;
pr_debug("Established link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
+ l->name, l->net_plane);
- if (!active[0]) {
- active[0] = active[1] = l_ptr;
- node_established_contact(n_ptr);
- goto exit;
+ /* No active links ? => take both active slots */
+ if (*slot0 < 0) {
+ *slot0 = bearer_id;
+ *slot1 = bearer_id;
+ node_established_contact(n);
+ return;
}
- if (l_ptr->priority < active[0]->priority) {
- pr_debug("New link <%s> becomes standby\n", l_ptr->name);
- goto exit;
+
+ /* Lower prio than current active ? => no slot */
+ if (l->priority < links[*slot0].link->priority) {
+ pr_debug("New link <%s> becomes standby\n", l->name);
+ return;
}
- tipc_link_dup_queue_xmit(active[0], l_ptr);
- if (l_ptr->priority == active[0]->priority) {
- active[0] = l_ptr;
- goto exit;
+ tipc_link_dup_queue_xmit(links[*slot0].link, l);
+
+ /* Same prio as current active ? => take one slot */
+ if (l->priority == links[*slot0].link->priority) {
+ *slot0 = bearer_id;
+ return;
}
- pr_debug("Old link <%s> becomes standby\n", active[0]->name);
- if (active[1] != active[0])
- pr_debug("Old link <%s> becomes standby\n", active[1]->name);
- active[0] = active[1] = l_ptr;
-exit:
- /* Leave room for changeover header when returning 'mtu' to users: */
- n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
- n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
+
+ /* Higher prio than current active => take both active slots */
+ pr_debug("Old link <%s> now standby\n", links[*slot0].link->name);
+ *slot0 = bearer_id;
+ *slot1 = bearer_id;
}
/**
- * node_select_active_links - select active link
+ * tipc_node_link_down - handle loss of link
*/
-static void node_select_active_links(struct tipc_node *n_ptr)
+void tipc_node_link_down(struct tipc_node *n, int bearer_id)
{
- struct tipc_link **active = &n_ptr->active_links[0];
- u32 i;
- u32 highest_prio = 0;
+ int *slot0 = &n->active_links[0];
+ int *slot1 = &n->active_links[1];
+ int i, highest = 0;
+ struct tipc_link *l, *_l;
- active[0] = active[1] = NULL;
+ l = n->links[bearer_id].link;
+ n->working_links--;
+ n->action_flags |= TIPC_NOTIFY_LINK_DOWN;
+ n->link_id = l->peer_bearer_id << 16 | l->bearer_id;
- for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link *l_ptr = n_ptr->links[i];
+ pr_debug("Lost link <%s> on network plane %c\n",
+ l->name, l->net_plane);
- if (!l_ptr || !tipc_link_is_up(l_ptr) ||
- (l_ptr->priority < highest_prio))
+ /* Select new active link if any available */
+ *slot0 = INVALID_BEARER_ID;
+ *slot1 = INVALID_BEARER_ID;
+ for (i = 0; i < MAX_BEARERS; i++) {
+ _l = n->links[i].link;
+ if (!_l || !tipc_link_is_up(_l))
+ continue;
+ if (_l->priority < highest)
+ continue;
+ if (_l->priority > highest) {
+ highest = _l->priority;
+ *slot0 = i;
+ *slot1 = i;
continue;
-
- if (l_ptr->priority > highest_prio) {
- highest_prio = l_ptr->priority;
- active[0] = active[1] = l_ptr;
- } else {
- active[1] = l_ptr;
}
+ *slot1 = i;
}
+ if (tipc_node_is_up(n))
+ tipc_link_failover_send_queue(l);
+ else
+ node_lost_contact(n);
}
-/**
- * tipc_node_link_down - handle loss of link
- */
-void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+bool tipc_node_is_up(struct tipc_node *n)
{
- struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
- struct tipc_link **active;
-
- n_ptr->working_links--;
- n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN;
- n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
-
- if (!tipc_link_is_active(l_ptr)) {
- pr_debug("Lost standby link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
- return;
- }
- pr_debug("Lost link <%s> on network plane %c\n",
- l_ptr->name, l_ptr->net_plane);
-
- active = &n_ptr->active_links[0];
- if (active[0] == l_ptr)
- active[0] = active[1];
- if (active[1] == l_ptr)
- active[1] = active[0];
- if (active[0] == l_ptr)
- node_select_active_links(n_ptr);
- if (tipc_node_is_up(n_ptr))
- tipc_link_failover_send_queue(l_ptr);
- else
- node_lost_contact(n_ptr);
-
- /* Leave room for changeover header when returning 'mtu' to users: */
- if (active[0]) {
- n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
- n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
- return;
- }
- /* Loopback link went down? No fragmentation needed from now on. */
- if (n_ptr->addr == tn->own_addr) {
- n_ptr->act_mtus[0] = MAX_MSG_SIZE;
- n_ptr->act_mtus[1] = MAX_MSG_SIZE;
- }
+ return n->active_links[0] != INVALID_BEARER_ID;
}
-int tipc_node_active_links(struct tipc_node *n_ptr)
+void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *b,
+ bool *link_up, bool *addr_match,
+ struct tipc_media_addr *maddr)
{
- return n_ptr->active_links[0] != NULL;
+ struct tipc_link *l = n->links[b->identity].link;
+ struct tipc_media_addr *curr = &n->links[b->identity].maddr;
+
+ *link_up = l && tipc_link_is_up(l);
+ *addr_match = l && !memcmp(curr, maddr, sizeof(*maddr));
}
-int tipc_node_is_up(struct tipc_node *n_ptr)
+bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b,
+ struct tipc_media_addr *maddr)
{
- return tipc_node_active_links(n_ptr);
+ struct tipc_link *l = n->links[b->identity].link;
+ struct tipc_media_addr *curr = &n->links[b->identity].maddr;
+ struct sk_buff_head *inputq = &n->links[b->identity].inputq;
+
+ if (!l) {
+ l = tipc_link_create(n, b, maddr, inputq, &n->bclink.namedq);
+ if (!l)
+ return false;
+ tipc_node_calculate_timer(n, l);
+ if (n->link_cnt == 1) {
+ if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
+ tipc_node_get(n);
+ }
+ }
+ memcpy(&l->media_addr, maddr, sizeof(*maddr));
+ memcpy(curr, maddr, sizeof(*maddr));
+ tipc_link_reset(l);
+ return true;
}
void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
{
- n_ptr->links[l_ptr->bearer_id] = l_ptr;
+ n_ptr->links[l_ptr->bearer_id].link = l_ptr;
n_ptr->link_cnt++;
}
@@ -352,15 +416,151 @@ void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
int i;
for (i = 0; i < MAX_BEARERS; i++) {
- if (l_ptr != n_ptr->links[i])
+ if (l_ptr != n_ptr->links[i].link)
continue;
- n_ptr->links[i] = NULL;
+ n_ptr->links[i].link = NULL;
n_ptr->link_cnt--;
}
}
+/* tipc_node_fsm_evt - node finite state machine
+ * Determines when contact is allowed with peer node
+ */
+static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
+{
+ int state = n->state;
+
+ switch (state) {
+ case SELF_DOWN_PEER_DOWN:
+ switch (evt) {
+ case SELF_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_COMING;
+ break;
+ case PEER_ESTABL_CONTACT_EVT:
+ state = SELF_COMING_PEER_UP;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ default:
+ pr_err("Unknown node fsm evt %x/%x\n", state, evt);
+ }
+ break;
+ case SELF_UP_PEER_UP:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ default:
+ pr_err("Unknown node fsm evt %x/%x\n", state, evt);
+ }
+ break;
+ case SELF_DOWN_PEER_LEAVING:
+ switch (evt) {
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case SELF_LOST_CONTACT_EVT:
+ break;
+ default:
+ pr_err("Unknown node fsm evt %x/%x\n", state, evt);
+ }
+ break;
+ case SELF_UP_PEER_COMING:
+ switch (evt) {
+ case PEER_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_LEAVING;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ default:
+ pr_err("Unknown node fsm evt %x/%x\n", state, evt);
+ }
+ break;
+ case SELF_COMING_PEER_UP:
+ switch (evt) {
+ case SELF_ESTABL_CONTACT_EVT:
+ state = SELF_UP_PEER_UP;
+ break;
+ case PEER_LOST_CONTACT_EVT:
+ state = SELF_LEAVING_PEER_DOWN;
+ break;
+ case SELF_LOST_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ break;
+ default:
+ pr_err("Unknown node fsm evt %x/%x\n", state, evt);
+ }
+ break;
+ case SELF_LEAVING_PEER_DOWN:
+ switch (evt) {
+ case SELF_LOST_CONTACT_EVT:
+ state = SELF_DOWN_PEER_DOWN;
+ break;
+ case SELF_ESTABL_CONTACT_EVT:
+ case PEER_ESTABL_CONTACT_EVT:
+ case PEER_LOST_CONTACT_EVT:
+ break;
+ default:
+ pr_err("Unknown node fsm evt %x/%x\n", state, evt);
+ }
+ break;
+ default:
+ pr_err("Unknown node fsm state %x\n", state);
+ break;
+ }
+
+ n->state = state;
+}
+
+bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_link *l,
+ struct tipc_msg *hdr)
+{
+ int state = n->state;
+
+ if (likely(state == SELF_UP_PEER_UP))
+ return true;
+
+ if (state == SELF_DOWN_PEER_DOWN)
+ return true;
+
+ if (state == SELF_UP_PEER_COMING) {
+ /* If not traffic msg, peer may still be ESTABLISHING */
+ if (tipc_link_is_up(l) && msg_is_traffic(hdr))
+ tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT);
+ return true;
+ }
+
+ if (state == SELF_COMING_PEER_UP)
+ return true;
+
+ if (state == SELF_LEAVING_PEER_DOWN)
+ return false;
+
+ if (state == SELF_DOWN_PEER_LEAVING) {
+ if (msg_peer_is_up(hdr))
+ return false;
+ tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
+ return true;
+ }
+ return false;
+}
+
static void node_established_contact(struct tipc_node *n_ptr)
{
+ tipc_node_fsm_evt(n_ptr, SELF_ESTABL_CONTACT_EVT);
n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP;
n_ptr->bclink.oos_state = 0;
n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net);
@@ -396,21 +596,18 @@ static void node_lost_contact(struct tipc_node *n_ptr)
/* Abort any ongoing link failover */
for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link *l_ptr = n_ptr->links[i];
+ struct tipc_link *l_ptr = n_ptr->links[i].link;
if (!l_ptr)
continue;
- l_ptr->flags &= ~LINK_FAILINGOVER;
+ l_ptr->exec_mode = TIPC_LINK_OPEN;
l_ptr->failover_checkpt = 0;
l_ptr->failover_pkts = 0;
kfree_skb(l_ptr->failover_skb);
l_ptr->failover_skb = NULL;
tipc_link_reset_fragments(l_ptr);
}
-
- n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN;
-
/* Prevent re-contact with node until cleanup is done */
- n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN;
+ tipc_node_fsm_evt(n_ptr, SELF_LOST_CONTACT_EVT);
/* Notify publications from this node */
n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN;
@@ -453,7 +650,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
goto exit;
tipc_node_lock(node);
- link = node->links[bearer_id];
+ link = node->links[bearer_id].link;
if (link) {
strncpy(linkname, link->name, len);
err = 0;
@@ -559,6 +756,160 @@ msg_full:
return -EMSGSIZE;
}
+static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel,
+ int *bearer_id,
+ struct tipc_media_addr **maddr)
+{
+ int id = n->active_links[sel & 1];
+
+ if (unlikely(id < 0))
+ return NULL;
+
+ *bearer_id = id;
+ *maddr = &n->links[id].maddr;
+ return n->links[id].link;
+}
+
+/**
+ * tipc_node_xmit() is the general link level function for message sending
+ * @net: the applicable net namespace
+ * @list: chain of buffers containing message
+ * @dnode: address of destination node
+ * @selector: a number used for deterministic link selection
+ * Consumes the buffer chain, except when returning -ELINKCONG
+ * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ */
+int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
+ u32 dnode, int selector)
+{
+ struct tipc_link *l = NULL;
+ struct tipc_node *n;
+ struct sk_buff_head xmitq;
+ struct tipc_media_addr *maddr;
+ int bearer_id;
+ int rc = -EHOSTUNREACH;
+
+ __skb_queue_head_init(&xmitq);
+ n = tipc_node_find(net, dnode);
+ if (likely(n)) {
+ tipc_node_lock(n);
+ l = tipc_node_select_link(n, selector, &bearer_id, &maddr);
+ if (likely(l))
+ rc = tipc_link_xmit(l, list, &xmitq);
+ if (unlikely(rc == -ENOBUFS))
+ tipc_link_reset(l);
+ tipc_node_unlock(n);
+ tipc_node_put(n);
+ }
+ if (likely(!rc)) {
+ tipc_bearer_xmit(net, bearer_id, &xmitq, maddr);
+ return 0;
+ }
+ if (likely(in_own_node(net, dnode))) {
+ tipc_sk_rcv(net, list);
+ return 0;
+ }
+ return rc;
+}
+
+/* tipc_node_xmit_skb(): send single buffer to destination
+ * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE
+ * messages, which will not be rejected
+ * The only exception is datagram messages rerouted after secondary
+ * lookup, which are rare and safe to dispose of anyway.
+ * TODO: Return real return value, and let callers use
+ * tipc_wait_for_sendpkt() where applicable
+ */
+int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
+ u32 selector)
+{
+ struct sk_buff_head head;
+ int rc;
+
+ skb_queue_head_init(&head);
+ __skb_queue_tail(&head, skb);
+ rc = tipc_node_xmit(net, &head, dnode, selector);
+ if (rc == -ELINKCONG)
+ kfree_skb(skb);
+ return 0;
+}
+
+/**
+ * tipc_rcv - process TIPC packets/messages arriving from off-node
+ * @net: the applicable net namespace
+ * @skb: TIPC packet
+ * @bearer: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held. Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ */
+void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
+{
+ struct sk_buff_head xmitq;
+ struct tipc_node *n;
+ struct tipc_link *l;
+ struct tipc_msg *hdr;
+ struct tipc_media_addr *maddr;
+ int bearer_id = b->identity;
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+
+ /* Ensure message is well-formed */
+ if (unlikely(!tipc_msg_validate(skb)))
+ goto discard;
+
+ /* Handle arrival of a non-unicast link packet */
+ hdr = buf_msg(skb);
+ if (unlikely(msg_non_seq(hdr))) {
+ if (msg_user(hdr) == LINK_CONFIG)
+ tipc_disc_rcv(net, skb, b);
+ else
+ tipc_bclink_rcv(net, skb);
+ return;
+ }
+
+ /* Locate neighboring node that sent packet */
+ n = tipc_node_find(net, msg_prevnode(hdr));
+ if (unlikely(!n))
+ goto discard;
+ tipc_node_lock(n);
+
+ /* Locate link endpoint that should handle packet */
+ l = n->links[bearer_id].link;
+ if (unlikely(!l))
+ goto unlock;
+
+ /* Is reception of this packet permitted at the moment ? */
+ if (unlikely(n->state != SELF_UP_PEER_UP))
+ if (!tipc_node_filter_skb(n, l, hdr))
+ goto unlock;
+
+ if (unlikely(msg_user(hdr) == LINK_PROTOCOL))
+ tipc_bclink_sync_state(n, hdr);
+
+ /* Release acked broadcast messages */
+ if (unlikely(n->bclink.acked != msg_bcast_ack(hdr)))
+ tipc_bclink_acknowledge(n, msg_bcast_ack(hdr));
+
+ /* Check protocol and update link state */
+ rc = tipc_link_rcv(l, skb, &xmitq);
+
+ if (unlikely(rc & TIPC_LINK_UP_EVT))
+ tipc_link_activate(l);
+ if (unlikely(rc & TIPC_LINK_DOWN_EVT))
+ tipc_link_reset(l);
+ skb = NULL;
+unlock:
+ tipc_node_unlock(n);
+ tipc_sk_rcv(net, &n->links[bearer_id].inputq);
+ maddr = &n->links[bearer_id].maddr;
+ tipc_bearer_xmit(net, bearer_id, &xmitq, maddr);
+ tipc_node_put(n);
+discard:
+ kfree_skb(skb);
+}
+
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int err;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 5a834cf142c8..5e7016802077 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -45,6 +45,26 @@
/* Out-of-range value for node signature */
#define INVALID_NODE_SIG 0x10000
+#define INVALID_BEARER_ID -1
+
+/* Node FSM states and events:
+ */
+enum {
+ SELF_DOWN_PEER_DOWN = 0xdd,
+ SELF_UP_PEER_UP = 0xaa,
+ SELF_DOWN_PEER_LEAVING = 0xd1,
+ SELF_UP_PEER_COMING = 0xac,
+ SELF_COMING_PEER_UP = 0xca,
+ SELF_LEAVING_PEER_DOWN = 0x1d,
+};
+
+enum {
+ SELF_ESTABL_CONTACT_EVT = 0xec,
+ SELF_LOST_CONTACT_EVT = 0x1c,
+ PEER_ESTABL_CONTACT_EVT = 0xfec,
+ PEER_LOST_CONTACT_EVT = 0xf1c
+};
+
/* Flags used to take different actions according to flag type
* TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down
* TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down
@@ -54,8 +74,6 @@
*/
enum {
TIPC_MSG_EVT = 1,
- TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1),
- TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2),
TIPC_NOTIFY_NODE_DOWN = (1 << 3),
TIPC_NOTIFY_NODE_UP = (1 << 4),
TIPC_WAKEUP_BCAST_USERS = (1 << 5),
@@ -85,10 +103,17 @@ struct tipc_node_bclink {
u32 deferred_size;
struct sk_buff_head deferdq;
struct sk_buff *reasm_buf;
- int inputq_map;
+ struct sk_buff_head namedq;
bool recv_permitted;
};
+struct tipc_link_entry {
+ struct tipc_link *link;
+ u32 mtu;
+ struct sk_buff_head inputq;
+ struct tipc_media_addr maddr;
+};
+
/**
* struct tipc_node - TIPC node structure
* @addr: network address of node
@@ -98,9 +123,8 @@ struct tipc_node_bclink {
* @hash: links to adjacent nodes in unsorted hash chain
* @inputq: pointer to input queue containing messages for msg event
* @namedq: pointer to name table input queue with name table messages
- * @curr_link: the link holding the node lock, if any
- * @active_links: pointers to active links to node
- * @links: pointers to all links to node
+ * @active_links: bearer ids of active links, used as index into links[] array
+ * @links: array containing references to all links to node
* @action_flags: bit mask of different types of node actions
* @bclink: broadcast-related info
* @list: links to adjacent nodes in sorted list of cluster's nodes
@@ -120,12 +144,12 @@ struct tipc_node {
struct hlist_node hash;
struct sk_buff_head *inputq;
struct sk_buff_head *namedq;
- struct tipc_link *active_links[2];
- u32 act_mtus[2];
- struct tipc_link *links[MAX_BEARERS];
+ int active_links[2];
+ struct tipc_link_entry links[MAX_BEARERS];
int action_flags;
struct tipc_node_bclink bclink;
struct list_head list;
+ int state;
int link_cnt;
u16 working_links;
u16 capabilities;
@@ -133,6 +157,8 @@ struct tipc_node {
u32 link_id;
struct list_head publ_list;
struct list_head conn_sks;
+ unsigned long keepalive_intv;
+ struct timer_list timer;
struct rcu_head rcu;
};
@@ -140,18 +166,25 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr);
void tipc_node_put(struct tipc_node *node);
struct tipc_node *tipc_node_create(struct net *net, u32 addr);
void tipc_node_stop(struct net *net);
+void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *bearer,
+ bool *link_up, bool *addr_match,
+ struct tipc_media_addr *maddr);
+bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *bearer,
+ struct tipc_media_addr *maddr);
void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-int tipc_node_active_links(struct tipc_node *n_ptr);
-int tipc_node_is_up(struct tipc_node *n_ptr);
+void tipc_node_link_down(struct tipc_node *n_ptr, int bearer_id);
+void tipc_node_link_up(struct tipc_node *n_ptr, int bearer_id);
+bool tipc_node_is_up(struct tipc_node *n);
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
char *linkname, size_t len);
void tipc_node_unlock(struct tipc_node *node);
+int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
+ int selector);
+int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
+ u32 selector);
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
-
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
static inline void tipc_node_lock(struct tipc_node *node)
@@ -159,26 +192,30 @@ static inline void tipc_node_lock(struct tipc_node *node)
spin_lock_bh(&node->lock);
}
-static inline bool tipc_node_blocked(struct tipc_node *node)
+static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel)
{
- return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN |
- TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN));
+ int bearer_id = n->active_links[sel & 1];
+
+ if (unlikely(bearer_id == INVALID_BEARER_ID))
+ return NULL;
+
+ return n->links[bearer_id].link;
}
-static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector)
+static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
{
- struct tipc_node *node;
- u32 mtu;
-
- node = tipc_node_find(net, addr);
+ struct tipc_node *n;
+ int bearer_id;
+ unsigned int mtu = MAX_MSG_SIZE;
- if (likely(node)) {
- mtu = node->act_mtus[selector & 1];
- tipc_node_put(node);
- } else {
- mtu = MAX_MSG_SIZE;
- }
+ n = tipc_node_find(net, addr);
+ if (unlikely(!n))
+ return mtu;
+ bearer_id = n->active_links[sel & 1];
+ if (likely(bearer_id != INVALID_BEARER_ID))
+ mtu = n->links[bearer_id].mtu;
+ tipc_node_put(n);
return mtu;
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 3a7567f690f3..5b0b08d58fcc 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -261,7 +261,7 @@ static void tsk_rej_rx_queue(struct sock *sk)
while ((skb = __skb_dequeue(&sk->sk_receive_queue))) {
if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT))
- tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0);
+ tipc_node_xmit_skb(sock_net(sk), skb, dnode, 0);
}
}
@@ -443,7 +443,7 @@ static int tipc_release(struct socket *sock)
}
if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
TIPC_ERR_NO_PORT))
- tipc_link_xmit_skb(net, skb, dnode, 0);
+ tipc_node_xmit_skb(net, skb, dnode, 0);
}
}
@@ -456,7 +456,7 @@ static int tipc_release(struct socket *sock)
tsk_own_node(tsk), tsk_peer_port(tsk),
tsk->portid, TIPC_ERR_NO_PORT);
if (skb)
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
tipc_node_remove_conn(net, dnode, tsk->portid);
}
@@ -686,21 +686,22 @@ new_mtu:
do {
rc = tipc_bclink_xmit(net, pktchain);
- if (likely(rc >= 0)) {
- rc = dsz;
- break;
+ if (likely(!rc))
+ return dsz;
+
+ if (rc == -ELINKCONG) {
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (!rc)
+ continue;
}
+ __skb_queue_purge(pktchain);
if (rc == -EMSGSIZE) {
msg->msg_iter = save;
goto new_mtu;
}
- if (rc != -ELINKCONG)
- break;
- tipc_sk(sk)->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
- } while (!rc);
+ break;
+ } while (1);
return rc;
}
@@ -924,24 +925,25 @@ new_mtu:
do {
skb = skb_peek(pktchain);
TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
- rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid);
- if (likely(rc >= 0)) {
+ rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+ if (likely(!rc)) {
if (sock->state != SS_READY)
sock->state = SS_CONNECTING;
- rc = dsz;
- break;
+ return dsz;
}
+ if (rc == -ELINKCONG) {
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (!rc)
+ continue;
+ }
+ __skb_queue_purge(pktchain);
if (rc == -EMSGSIZE) {
m->msg_iter = save;
goto new_mtu;
}
- if (rc != -ELINKCONG)
- break;
- tsk->link_cong = 1;
- rc = tipc_wait_for_sndmsg(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
- } while (!rc);
+ break;
+ } while (1);
return rc;
}
@@ -1043,15 +1045,16 @@ next:
return rc;
do {
if (likely(!tsk_conn_cong(tsk))) {
- rc = tipc_link_xmit(net, pktchain, dnode, portid);
+ rc = tipc_node_xmit(net, pktchain, dnode, portid);
if (likely(!rc)) {
tsk->sent_unacked++;
sent += send;
if (sent == dsz)
- break;
+ return dsz;
goto next;
}
if (rc == -EMSGSIZE) {
+ __skb_queue_purge(pktchain);
tsk->max_pkt = tipc_node_get_mtu(net, dnode,
portid);
m->msg_iter = save;
@@ -1059,13 +1062,13 @@ next:
}
if (rc != -ELINKCONG)
break;
+
tsk->link_cong = 1;
}
rc = tipc_wait_for_sndpkt(sock, &timeo);
- if (rc)
- __skb_queue_purge(pktchain);
} while (!rc);
+ __skb_queue_purge(pktchain);
return sent ? sent : rc;
}
@@ -1221,7 +1224,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack)
return;
msg = buf_msg(skb);
msg_set_msgcnt(msg, ack);
- tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg));
+ tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg));
}
static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
@@ -1700,7 +1703,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err))
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
return 0;
}
@@ -1796,7 +1799,7 @@ int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err))
continue;
xmit:
- tipc_link_xmit_skb(net, skb, dnode, dport);
+ tipc_node_xmit_skb(net, skb, dnode, dport);
}
return err ? -EHOSTUNREACH : 0;
}
@@ -2089,7 +2092,7 @@ restart:
}
if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
TIPC_CONN_SHUTDOWN))
- tipc_link_xmit_skb(net, skb, dnode,
+ tipc_node_xmit_skb(net, skb, dnode,
tsk->portid);
} else {
dnode = tsk_peer_node(tsk);
@@ -2099,7 +2102,7 @@ restart:
0, dnode, tsk_own_node(tsk),
tsk_peer_port(tsk),
tsk->portid, TIPC_CONN_SHUTDOWN);
- tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
}
tsk->connected = 0;
sock->state = SS_DISCONNECTING;
@@ -2161,7 +2164,7 @@ static void tipc_sk_timeout(unsigned long data)
}
bh_unlock_sock(sk);
if (skb)
- tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
+ tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
exit:
sock_put(sk);
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index bd16c6c7e1e7..0cebf1fc37a2 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2048,7 +2048,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
xfrm_audit_policy_delete(xp, 1, true);
} else {
// reset the timers here?
- WARN(1, "Dont know what to do with soft policy expire\n");
+ WARN(1, "Don't know what to do with soft policy expire\n");
}
km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid);