Diffstat (limited to 'net/openvswitch')
-rw-r--r--  net/openvswitch/Kconfig               |   8
-rw-r--r--  net/openvswitch/Makefile              |   3
-rw-r--r--  net/openvswitch/actions.c             | 193
-rw-r--r--  net/openvswitch/conntrack.c           | 270
-rw-r--r--  net/openvswitch/conntrack.h           |   6
-rw-r--r--  net/openvswitch/datapath.c            | 289
-rw-r--r--  net/openvswitch/datapath.h            |  33
-rw-r--r--  net/openvswitch/flow.c                | 187
-rw-r--r--  net/openvswitch/flow.h                |  14
-rw-r--r--  net/openvswitch/flow_netlink.c        | 266
-rw-r--r--  net/openvswitch/flow_table.c          | 412
-rw-r--r--  net/openvswitch/flow_table.h          |  32
-rw-r--r--  net/openvswitch/meter.c               | 337
-rw-r--r--  net/openvswitch/meter.h               |  20
-rw-r--r--  net/openvswitch/openvswitch_trace.c   |  10
-rw-r--r--  net/openvswitch/openvswitch_trace.h   | 158
-rw-r--r--  net/openvswitch/vport-internal_dev.c  |  60
-rw-r--r--  net/openvswitch/vport-netdev.c        |  16
-rw-r--r--  net/openvswitch/vport.c               |  23
-rw-r--r--  net/openvswitch/vport.h               |   8
20 files changed, 1843 insertions(+), 502 deletions(-)
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 22d7d5604b4c..15bd287f5cbd 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -15,7 +15,7 @@ config OPENVSWITCH
select NET_MPLS_GSO
select DST_CACHE
select NET_NSH
- ---help---
+ help
Open vSwitch is a multilayer Ethernet switch targeted at virtualized
environments. In addition to supporting a variety of features
expected in a traditional hardware switch, it enables fine-grained
@@ -43,7 +43,7 @@ config OPENVSWITCH_GRE
depends on OPENVSWITCH
depends on NET_IPGRE
default OPENVSWITCH
- ---help---
+ help
If you say Y here, then the Open vSwitch will be able create GRE
vport.
@@ -56,7 +56,7 @@ config OPENVSWITCH_VXLAN
depends on OPENVSWITCH
depends on VXLAN
default OPENVSWITCH
- ---help---
+ help
If you say Y here, then the Open vSwitch will be able create vxlan vport.
Say N to exclude this support and reduce the binary size.
@@ -68,7 +68,7 @@ config OPENVSWITCH_GENEVE
depends on OPENVSWITCH
depends on GENEVE
default OPENVSWITCH
- ---help---
+ help
If you say Y here, then the Open vSwitch will be able create geneve vport.
Say N to exclude this support and reduce the binary size.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 41109c326f3a..28982630bef3 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -13,6 +13,7 @@ openvswitch-y := \
flow_netlink.o \
flow_table.o \
meter.o \
+ openvswitch_trace.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
@@ -24,3 +25,5 @@ endif
obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o
+
+CFLAGS_openvswitch_trace.o = -I$(src)
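
Note: the new -I$(src) flag follows the usual kernel tracepoint build convention: openvswitch_trace.c is the single translation unit that instantiates the tracepoints, and the extra include path lets the TRACE_INCLUDE_PATH machinery resolve the header from this directory. A hedged sketch of what such a file conventionally contains (the actual openvswitch_trace.c is not reproduced in this diff, so treat this as illustrative only):

/* Illustrative sketch: instantiate the tracepoints declared in
 * openvswitch_trace.h exactly once, in this translation unit. */
#define CREATE_TRACE_POINTS
#include "openvswitch_trace.h"
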
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 7fbfe2adfffa..ca3ebfdb3023 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -9,7 +9,6 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
-#include <linux/netfilter_ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -31,6 +30,7 @@
#include "conntrack.h"
#include "vport.h"
#include "flow_netlink.h"
+#include "openvswitch_trace.h"
struct deferred_action {
struct sk_buff *skb;
@@ -200,6 +200,9 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
__be32 lse;
int err;
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
stack = mpls_hdr(skb);
lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
err = skb_mpls_update_lse(skb, lse);
@@ -278,9 +281,11 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
*/
static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
{
- skb_pull_rcsum(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
+ int err;
+
+ err = skb_eth_pop(skb);
+ if (err)
+ return err;
/* safe right before invalidate_flow_key */
key->mac_proto = MAC_PROTO_NONE;
@@ -291,22 +296,12 @@ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_eth *ethh)
{
- struct ethhdr *hdr;
-
- /* Add the new Ethernet header */
- if (skb_cow_head(skb, ETH_HLEN) < 0)
- return -ENOMEM;
-
- skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
-
- hdr = eth_hdr(skb);
- ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
- ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
- hdr->h_proto = skb->protocol;
+ int err;
- skb_postpush_rcsum(skb, hdr, ETH_HLEN);
+ err = skb_eth_push(skb, ethh->addresses.eth_dst,
+ ethh->addresses.eth_src);
+ if (err)
+ return err;
/* safe right before invalidate_flow_key */
key->mac_proto = MAC_PROTO_ETHERNET;
@@ -378,6 +373,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
update_ip_l4_checksum(skb, nh, *addr, new_addr);
csum_replace4(&nh->check, *addr, new_addr);
skb_clear_hash(skb);
+ ovs_ct_clear(skb, NULL);
*addr = new_addr;
}
@@ -425,15 +421,47 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
update_ipv6_checksum(skb, l4_proto, addr, new_addr);
skb_clear_hash(skb);
+ ovs_ct_clear(skb, NULL);
memcpy(addr, new_addr, sizeof(__be32[4]));
}
-static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
+static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask)
+{
+ u8 old_ipv6_tclass = ipv6_get_dsfield(nh);
+
+ ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask);
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12),
+ (__force __wsum)(ipv6_tclass << 12));
+
+ ipv6_change_dsfield(nh, ~mask, ipv6_tclass);
+}
+
+static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask)
{
+ u32 ofl;
+
+ ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2];
+ fl = OVS_MASKED(ofl, fl, mask);
+
/* Bits 21-24 are always unmasked, so this retains their values. */
- OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
- OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
- OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
+ nh->flow_lbl[0] = (u8)(fl >> 16);
+ nh->flow_lbl[1] = (u8)(fl >> 8);
+ nh->flow_lbl[2] = (u8)fl;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl));
+}
+
+static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask)
+{
+ new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask);
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8),
+ (__force __wsum)(new_ttl << 8));
+ nh->hop_limit = new_ttl;
}
static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
@@ -551,18 +579,17 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
}
}
if (mask->ipv6_tclass) {
- ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass);
+ set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass);
flow_key->ip.tos = ipv6_get_dsfield(nh);
}
if (mask->ipv6_label) {
- set_ipv6_fl(nh, ntohl(key->ipv6_label),
+ set_ipv6_fl(skb, nh, ntohl(key->ipv6_label),
ntohl(mask->ipv6_label));
flow_key->ipv6.label =
*(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
}
if (mask->ipv6_hlimit) {
- OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit,
- mask->ipv6_hlimit);
+ set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit);
flow_key->ip.ttl = nh->hop_limit;
}
return 0;
@@ -635,6 +662,7 @@ static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
static void set_tp_port(struct sk_buff *skb, __be16 *port,
__be16 new_port, __sum16 *check)
{
+ ovs_ct_clear(skb, NULL);
inet_proto_csum_replace2(check, skb, *port, new_port, false);
*port = new_port;
}
@@ -674,6 +702,7 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
uh->dest = dst;
flow_key->tp.src = src;
flow_key->tp.dst = dst;
+ ovs_ct_clear(skb, NULL);
}
skb_clear_hash(skb);
@@ -736,13 +765,16 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
skb_clear_hash(skb);
+ ovs_ct_clear(skb, NULL);
+
flow_key->tp.src = sh->source;
flow_key->tp.dst = sh->dest;
return 0;
}
-static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ovs_vport_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
struct vport *vport = data->vport;
@@ -832,29 +864,25 @@ static void ovs_fragment(struct net *net, struct vport *vport,
}
if (key->eth.type == htons(ETH_P_IP)) {
- struct dst_entry ovs_dst;
+ struct rtable ovs_rt = { 0 };
unsigned long orig_dst;
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
- dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
+ dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
- ovs_dst.dev = vport->dev;
+ ovs_rt.dst.dev = vport->dev;
orig_dst = skb->_skb_refdst;
- skb_dst_set_noref(skb, &ovs_dst);
+ skb_dst_set_noref(skb, &ovs_rt.dst);
IPCB(skb)->frag_max_size = mru;
ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else if (key->eth.type == htons(ETH_P_IPV6)) {
- const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
- if (!v6ops)
- goto err;
-
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
@@ -866,7 +894,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
skb_dst_set_noref(skb, &ovs_rt.dst);
IP6CB(skb)->frag_max_size = mru;
- v6ops->fragment(net, skb->sk, skb, ovs_vport_output);
+ ipv6_stub->ipv6_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
@@ -925,14 +953,20 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
upcall.mru = OVS_CB(skb)->mru;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
- a = nla_next(a, &rem)) {
+ a = nla_next(a, &rem)) {
switch (nla_type(a)) {
case OVS_USERSPACE_ATTR_USERDATA:
upcall.userdata = a;
break;
case OVS_USERSPACE_ATTR_PID:
- upcall.portid = nla_get_u32(a);
+ if (dp->user_features &
+ OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+ upcall.portid =
+ ovs_dp_get_upcall_portid(dp,
+ smp_processor_id());
+ else
+ upcall.portid = nla_get_u32(a);
break;
case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
@@ -964,6 +998,21 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
}
+static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct nlattr *attr)
+{
+ /* The first attribute is always 'OVS_DEC_TTL_ATTR_ACTION'. */
+ struct nlattr *actions = nla_data(attr);
+
+ if (nla_len(actions))
+ return clone_execute(dp, skb, key, 0, nla_data(actions),
+ nla_len(actions), true, false);
+
+ consume_skb(skb);
+ return 0;
+}
+
/* When 'last' is true, sample() should always consume the 'skb'.
* Otherwise, sample() should keep 'skb' intact regardless what
* actions are executed within sample().
@@ -984,7 +1033,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
actions = nla_next(sample_arg, &rem);
if ((arg->probability != U32_MAX) &&
- (!arg->probability || prandom_u32() > arg->probability)) {
+ (!arg->probability || get_random_u32() > arg->probability)) {
if (last)
consume_skb(skb);
return 0;
@@ -1008,7 +1057,7 @@ static int clone(struct datapath *dp, struct sk_buff *skb,
int rem = nla_len(attr);
bool dont_clone_flow_key;
- /* The first action is always 'OVS_CLONE_ATTR_ARG'. */
+ /* The first action is always 'OVS_CLONE_ATTR_EXEC'. */
clone_arg = nla_data(attr);
dont_clone_flow_key = nla_get_u32(clone_arg);
actions = nla_next(clone_arg, &rem);
@@ -1150,9 +1199,10 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, bool last)
{
+ struct ovs_skb_cb *ovs_cb = OVS_CB(skb);
const struct nlattr *actions, *cpl_arg;
+ int len, max_len, rem = nla_len(attr);
const struct check_pkt_len_arg *arg;
- int rem = nla_len(attr);
bool clone_flow_key;
/* The first netlink attribute in 'attr' is always
@@ -1161,7 +1211,11 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
cpl_arg = nla_data(attr);
arg = nla_data(cpl_arg);
- if (skb->len <= arg->pkt_len) {
+ len = ovs_cb->mru ? ovs_cb->mru + skb->mac_len : skb->len;
+ max_len = arg->pkt_len;
+
+ if ((skb_is_gso(skb) && skb_gso_validate_mac_len(skb, max_len)) ||
+ len <= max_len) {
/* Second netlink attribute in 'attr' is always
* 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'.
*/
@@ -1180,6 +1234,45 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
nla_len(actions), last, clone_flow_key);
}
+static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct ipv6hdr *nh;
+
+ err = skb_ensure_writable(skb, skb_network_offset(skb) +
+ sizeof(*nh));
+ if (unlikely(err))
+ return err;
+
+ nh = ipv6_hdr(skb);
+
+ if (nh->hop_limit <= 1)
+ return -EHOSTUNREACH;
+
+ key->ip.ttl = --nh->hop_limit;
+ } else if (skb->protocol == htons(ETH_P_IP)) {
+ struct iphdr *nh;
+ u8 old_ttl;
+
+ err = skb_ensure_writable(skb, skb_network_offset(skb) +
+ sizeof(*nh));
+ if (unlikely(err))
+ return err;
+
+ nh = ip_hdr(skb);
+ if (nh->ttl <= 1)
+ return -EHOSTUNREACH;
+
+ old_ttl = nh->ttl--;
+ csum_replace2(&nh->check, htons(old_ttl << 8),
+ htons(nh->ttl << 8));
+ key->ip.ttl = nh->ttl;
+ }
+ return 0;
+}
+
/* Execute a list of actions against 'skb'. */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
@@ -1192,6 +1285,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
a = nla_next(a, &rem)) {
int err = 0;
+ if (trace_ovs_do_execute_action_enabled())
+ trace_ovs_do_execute_action(dp, skb, key, a, rem);
+
switch (nla_type(a)) {
case OVS_ACTION_ATTR_OUTPUT: {
int port = nla_get_u32(a);
@@ -1365,6 +1461,13 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
break;
}
+
+ case OVS_ACTION_ATTR_DEC_TTL:
+ err = execute_dec_ttl(skb, key);
+ if (err == -EHOSTUNREACH)
+ return dec_ttl_exception_handler(dp, skb,
+ key, a);
+ break;
}
if (unlikely(err)) {
@@ -1442,8 +1545,8 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
pr_warn("%s: deferred action limit reached, drop sample action\n",
ovs_dp_name(dp));
} else { /* Recirc action */
- pr_warn("%s: deferred action limit reached, drop recirc action\n",
- ovs_dp_name(dp));
+ pr_warn("%s: deferred action limit reached, drop recirc action (recirc_id=%#x)\n",
+ ovs_dp_name(dp), recirc_id);
}
}
}
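
Note: several hunks above (set_ipv6_ttl(), set_ipv6_dsfield(), set_ipv6_fl() and execute_dec_ttl()) avoid recomputing checksums from scratch and instead patch them incrementally via csum_replace()/csum_replace2(). As a hedged, self-contained sketch of the arithmetic those helpers perform when a single 16-bit word of a checksummed header changes (the function name below is invented for illustration, not a kernel API):

#include <stdint.h>

/* RFC 1624 incremental update: recompute a 16-bit ones'-complement
 * checksum after one 16-bit word changes from 'old' to 'new_val',
 * e.g. the ttl/protocol word after a dec_ttl action. */
static uint16_t csum_update16(uint16_t check, uint16_t old, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;	/* HC' = ~(~HC + ~m + m') */

	sum += (uint16_t)~old;
	sum += new_val;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

This is why execute_dec_ttl() passes htons(old_ttl << 8) and htons(nh->ttl << 8) to csum_replace2(): the TTL occupies the high byte of the 16-bit ttl/protocol word, so the checksum is updated for that whole word.
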
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index e726159cfcfa..c7b10234cf7c 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -25,6 +25,8 @@
#include <net/netfilter/nf_nat.h>
#endif
+#include <net/netfilter/nf_conntrack_act_ct.h>
+
#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
@@ -271,15 +273,13 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
/* This is called to initialize CT key fields possibly coming in from the local
* stack.
*/
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
+void ovs_ct_fill_key(const struct sk_buff *skb,
+ struct sw_flow_key *key,
+ bool post_ct)
{
- ovs_ct_update_key(skb, NULL, key, false, false);
+ ovs_ct_update_key(skb, NULL, key, post_ct, false);
}
-#define IN6_ADDR_INITIALIZER(ADDR) \
- { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
- (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
-
int ovs_ct_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, struct sk_buff *skb)
{
@@ -301,24 +301,30 @@ int ovs_ct_put_key(const struct sw_flow_key *swkey,
if (swkey->ct_orig_proto) {
if (swkey->eth.type == htons(ETH_P_IP)) {
- struct ovs_key_ct_tuple_ipv4 orig = {
- output->ipv4.ct_orig.src,
- output->ipv4.ct_orig.dst,
- output->ct.orig_tp.src,
- output->ct.orig_tp.dst,
- output->ct_orig_proto,
- };
+ struct ovs_key_ct_tuple_ipv4 orig;
+
+ memset(&orig, 0, sizeof(orig));
+ orig.ipv4_src = output->ipv4.ct_orig.src;
+ orig.ipv4_dst = output->ipv4.ct_orig.dst;
+ orig.src_port = output->ct.orig_tp.src;
+ orig.dst_port = output->ct.orig_tp.dst;
+ orig.ipv4_proto = output->ct_orig_proto;
+
if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
sizeof(orig), &orig))
return -EMSGSIZE;
} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- struct ovs_key_ct_tuple_ipv6 orig = {
- IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
- IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
- output->ct.orig_tp.src,
- output->ct.orig_tp.dst,
- output->ct_orig_proto,
- };
+ struct ovs_key_ct_tuple_ipv6 orig;
+
+ memset(&orig, 0, sizeof(orig));
+ memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32,
+ sizeof(orig.ipv6_src));
+ memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32,
+ sizeof(orig.ipv6_dst));
+ orig.src_port = output->ct.orig_tp.src;
+ orig.dst_port = output->ct.orig_tp.dst;
+ orig.ipv6_proto = output->ct_orig_proto;
+
if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
sizeof(orig), &orig))
return -EMSGSIZE;
@@ -570,7 +576,7 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
nf_ct_delete(ct, 0, 0);
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
}
}
@@ -719,7 +725,7 @@ static bool skb_nfct_cached(struct net *net,
if (nf_ct_is_confirmed(ct))
nf_ct_delete(ct, 0, 0);
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
nf_ct_set(skb, NULL, 0);
return false;
}
@@ -728,6 +734,57 @@ static bool skb_nfct_cached(struct net *net,
}
#if IS_ENABLED(CONFIG_NF_NAT)
+static void ovs_nat_update_key(struct sw_flow_key *key,
+ const struct sk_buff *skb,
+ enum nf_nat_manip_type maniptype)
+{
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ __be16 src;
+
+ key->ct_state |= OVS_CS_F_SRC_NAT;
+ if (key->eth.type == htons(ETH_P_IP))
+ key->ipv4.addr.src = ip_hdr(skb)->saddr;
+ else if (key->eth.type == htons(ETH_P_IPV6))
+ memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+ sizeof(key->ipv6.addr.src));
+ else
+ return;
+
+ if (key->ip.proto == IPPROTO_UDP)
+ src = udp_hdr(skb)->source;
+ else if (key->ip.proto == IPPROTO_TCP)
+ src = tcp_hdr(skb)->source;
+ else if (key->ip.proto == IPPROTO_SCTP)
+ src = sctp_hdr(skb)->source;
+ else
+ return;
+
+ key->tp.src = src;
+ } else {
+ __be16 dst;
+
+ key->ct_state |= OVS_CS_F_DST_NAT;
+ if (key->eth.type == htons(ETH_P_IP))
+ key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+ else if (key->eth.type == htons(ETH_P_IPV6))
+ memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+ sizeof(key->ipv6.addr.dst));
+ else
+ return;
+
+ if (key->ip.proto == IPPROTO_UDP)
+ dst = udp_hdr(skb)->dest;
+ else if (key->ip.proto == IPPROTO_TCP)
+ dst = tcp_hdr(skb)->dest;
+ else if (key->ip.proto == IPPROTO_SCTP)
+ dst = sctp_hdr(skb)->dest;
+ else
+ return;
+
+ key->tp.dst = dst;
+ }
+}
+
/* Modelled after nf_nat_ipv[46]_fn().
* range is only used for new, uninitialized NAT state.
* Returns either NF_ACCEPT or NF_DROP.
@@ -735,7 +792,7 @@ static bool skb_nfct_cached(struct net *net,
static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype)
+ enum nf_nat_manip_type maniptype, struct sw_flow_key *key)
{
int hooknum, nh_off, err = NF_ACCEPT;
@@ -776,7 +833,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
}
}
/* Non-ICMP, fall thru to initialize if needed. */
- /* fall through */
+ fallthrough;
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
@@ -805,61 +862,13 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
err = nf_nat_packet(ct, ctinfo, hooknum, skb);
push:
- skb_push(skb, nh_off);
- skb_postpush_rcsum(skb, skb->data, nh_off);
-
- return err;
-}
+ skb_push_rcsum(skb, nh_off);
-static void ovs_nat_update_key(struct sw_flow_key *key,
- const struct sk_buff *skb,
- enum nf_nat_manip_type maniptype)
-{
- if (maniptype == NF_NAT_MANIP_SRC) {
- __be16 src;
-
- key->ct_state |= OVS_CS_F_SRC_NAT;
- if (key->eth.type == htons(ETH_P_IP))
- key->ipv4.addr.src = ip_hdr(skb)->saddr;
- else if (key->eth.type == htons(ETH_P_IPV6))
- memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
- sizeof(key->ipv6.addr.src));
- else
- return;
-
- if (key->ip.proto == IPPROTO_UDP)
- src = udp_hdr(skb)->source;
- else if (key->ip.proto == IPPROTO_TCP)
- src = tcp_hdr(skb)->source;
- else if (key->ip.proto == IPPROTO_SCTP)
- src = sctp_hdr(skb)->source;
- else
- return;
-
- key->tp.src = src;
- } else {
- __be16 dst;
-
- key->ct_state |= OVS_CS_F_DST_NAT;
- if (key->eth.type == htons(ETH_P_IP))
- key->ipv4.addr.dst = ip_hdr(skb)->daddr;
- else if (key->eth.type == htons(ETH_P_IPV6))
- memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
- sizeof(key->ipv6.addr.dst));
- else
- return;
-
- if (key->ip.proto == IPPROTO_UDP)
- dst = udp_hdr(skb)->dest;
- else if (key->ip.proto == IPPROTO_TCP)
- dst = tcp_hdr(skb)->dest;
- else if (key->ip.proto == IPPROTO_SCTP)
- dst = sctp_hdr(skb)->dest;
- else
- return;
+ /* Update the flow key if NAT successful. */
+ if (err == NF_ACCEPT)
+ ovs_nat_update_key(key, skb, maniptype);
- key->tp.dst = dst;
- }
+ return err;
}
/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
@@ -901,23 +910,23 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
} else {
return NF_ACCEPT; /* Connection is not NATed. */
}
- err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+ err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype, key);
- if (err == NF_ACCEPT &&
- ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) {
- if (maniptype == NF_NAT_MANIP_SRC)
- maniptype = NF_NAT_MANIP_DST;
- else
- maniptype = NF_NAT_MANIP_SRC;
+ if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
+ if (ct->status & IPS_SRC_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
- err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range,
- maniptype);
+ err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range,
+ maniptype, key);
+ } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL,
+ NF_NAT_MANIP_SRC, key);
+ }
}
- /* Mark NAT done if successful and update the flow key. */
- if (err == NF_ACCEPT)
- ovs_nat_update_key(key, skb, maniptype);
-
return err;
}
#else /* !CONFIG_NF_NAT */
@@ -960,8 +969,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
/* Associate skb with specified zone. */
if (tmpl) {
- if (skb_nfct(skb))
- nf_conntrack_put(skb_nfct(skb));
+ ct = nf_ct_get(skb, &ctinfo);
+ nf_ct_put(ct);
nf_conntrack_get(&tmpl->ct_general);
nf_ct_set(skb, tmpl, IP_CT_NEW);
}
@@ -1006,7 +1015,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* connections which we will commit, we may need to attach
* the helper here.
*/
- if (info->commit && info->helper && !nfct_help(ct)) {
+ if (!nf_ct_is_confirmed(ct) && info->commit &&
+ info->helper && !nfct_help(ct)) {
int err = __nf_ct_try_assign_helper(ct, info->ct,
GFP_ATOMIC);
if (err)
@@ -1031,6 +1041,16 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
return -EINVAL;
}
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP &&
+ nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) {
+ /* Be liberal for tcp packets so that out-of-window
+ * packets are not marked invalid.
+ */
+ nf_ct_set_tcp_be_liberal(ct);
+ }
+
+ nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
}
return 0;
@@ -1231,6 +1251,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
&info->labels.mask);
if (err)
return err;
+
+ nf_conn_act_ct_ext_add(ct);
} else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
labels_nonzero(&info->labels.mask)) {
err = ovs_ct_set_labels(ct, key, &info->labels.value,
@@ -1306,8 +1328,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
else
err = ovs_ct_lookup(net, key, info, skb);
- skb_push(skb, nh_ofs);
- skb_postpush_rcsum(skb, skb->data, nh_ofs);
+ skb_push_rcsum(skb, nh_ofs);
if (err)
kfree_skb(skb);
return err;
@@ -1315,11 +1336,16 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
{
- if (skb_nfct(skb)) {
- nf_conntrack_put(skb_nfct(skb));
- nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
- ovs_ct_fill_key(skb, key);
- }
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+ nf_ct_put(ct);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ if (key)
+ ovs_ct_fill_key(skb, key, false);
return 0;
}
@@ -1538,7 +1564,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
switch (type) {
case OVS_CT_ATTR_FORCE_COMMIT:
info->force = true;
- /* fall through. */
+ fallthrough;
case OVS_CT_ATTR_COMMIT:
info->commit = true;
break;
@@ -1705,7 +1731,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
goto err_free_ct;
__set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
- nf_conntrack_get(&ct_info.ct->ct_general);
return 0;
err_free_ct:
__ovs_ct_free_action(&ct_info);
@@ -1895,11 +1920,12 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
struct hlist_head *head = &info->limits[i];
struct ovs_ct_limit *ct_limit;
- hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node,
+ lockdep_ovsl_is_held())
kfree_rcu(ct_limit, rcu);
}
- kfree(ovs_net->ct_limit_info->limits);
- kfree(ovs_net->ct_limit_info);
+ kfree(info->limits);
+ kfree(info);
}
static struct sk_buff *
@@ -1957,7 +1983,8 @@ static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
} else {
struct ovs_ct_limit *ct_limit;
- ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+ ct_limit = kmalloc(sizeof(*ct_limit),
+ GFP_KERNEL_ACCOUNT);
if (!ct_limit)
return -ENOMEM;
@@ -2017,16 +2044,12 @@ static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
struct sk_buff *reply)
{
- struct ovs_zone_limit zone_limit;
- int err;
-
- zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
- zone_limit.limit = info->default_limit;
- err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
- if (err)
- return err;
+ struct ovs_zone_limit zone_limit = {
+ .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE,
+ .limit = info->default_limit,
+ };
- return 0;
+ return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
}
static int __ovs_ct_limit_get_zone_limit(struct net *net,
@@ -2228,17 +2251,19 @@ exit_err:
return err;
}
-static struct genl_ops ct_limit_genl_ops[] = {
+static const struct genl_small_ops ct_limit_genl_ops[] = {
{ .cmd = OVS_CT_LIMIT_CMD_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
- * privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege.
+ */
.doit = ovs_ct_limit_cmd_set,
},
{ .cmd = OVS_CT_LIMIT_CMD_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
- * privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege.
+ */
.doit = ovs_ct_limit_cmd_del,
},
{ .cmd = OVS_CT_LIMIT_CMD_GET,
@@ -2260,8 +2285,9 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = {
.policy = ct_limit_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = ct_limit_genl_ops,
- .n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .small_ops = ct_limit_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .resv_start_op = OVS_CT_LIMIT_CMD_GET + 1,
.mcgrps = &ovs_ct_limit_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 59dc32761b91..317e525c8a11 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -25,7 +25,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
const struct ovs_conntrack_info *);
int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
+void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key,
+ bool post_ct);
int ovs_ct_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, struct sk_buff *skb);
void ovs_ct_free_action(const struct nlattr *a);
@@ -74,7 +75,8 @@ static inline int ovs_ct_clear(struct sk_buff *skb,
}
static inline void ovs_ct_fill_key(const struct sk_buff *skb,
- struct sw_flow_key *key)
+ struct sw_flow_key *key,
+ bool post_ct)
{
key->ct_state = 0;
key->ct_zone = 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 07a7dd185995..8b84869eb2ac 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -37,12 +37,14 @@
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/pkt_cls.h>
#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "meter.h"
+#include "openvswitch_trace.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -130,6 +132,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
const struct dp_upcall_info *,
uint32_t cutlen);
+static void ovs_dp_masks_rebalance(struct work_struct *work);
+
+static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
+
/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
@@ -163,6 +169,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
free_percpu(dp->stats_percpu);
kfree(dp->ports);
ovs_meters_exit(dp);
+ kfree(rcu_dereference_raw(dp->upcall_portids));
kfree(dp);
}
@@ -180,7 +187,7 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
head = vport_hash_bucket(dp, port_no);
hlist_for_each_entry_rcu(vport, head, dp_hash_node,
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (vport->port_no == port_no)
return vport;
}
@@ -223,25 +230,39 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
struct dp_stats_percpu *stats;
u64 *stats_counter;
u32 n_mask_hit;
+ u32 n_cache_hit;
int error;
stats = this_cpu_ptr(dp->stats_percpu);
/* Look up flow. */
flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
- &n_mask_hit);
+ &n_mask_hit, &n_cache_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
- upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
+ if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+ upcall.portid =
+ ovs_dp_get_upcall_portid(dp, smp_processor_id());
+ else
+ upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
- if (unlikely(error))
- kfree_skb(skb);
- else
+ switch (error) {
+ case 0:
+ case -EAGAIN:
+ case -ERESTARTSYS:
+ case -EINTR:
consume_skb(skb);
+ break;
+ default:
+ kfree_skb(skb);
+ break;
+ }
stats_counter = &stats->n_missed;
goto out;
}
@@ -251,7 +272,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
error = ovs_execute_actions(dp, skb, sf_acts, key);
if (unlikely(error))
net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n",
- ovs_dp_name(dp), error);
+ ovs_dp_name(dp), error);
stats_counter = &stats->n_hit;
@@ -260,6 +281,7 @@ out:
u64_stats_update_begin(&stats->syncp);
(*stats_counter)++;
stats->n_mask_hit += n_mask_hit;
+ stats->n_cache_hit += n_cache_hit;
u64_stats_update_end(&stats->syncp);
}
@@ -271,6 +293,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
struct dp_stats_percpu *stats;
int err;
+ if (trace_ovs_dp_upcall_enabled())
+ trace_ovs_dp_upcall(dp, skb, key, upcall_info);
+
if (upcall_info->portid == 0) {
err = -ENOTCONN;
goto err;
@@ -298,14 +323,14 @@ err:
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info,
- uint32_t cutlen)
+ uint32_t cutlen)
{
unsigned int gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
int err;
- BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
if (IS_ERR(segs))
return PTR_ERR(segs);
@@ -533,8 +558,9 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
out:
if (err)
skb_tx_error(skb);
- kfree_skb(user_skb);
- kfree_skb(nskb);
+ consume_skb(user_skb);
+ consume_skb(nskb);
+
return err;
}
@@ -648,7 +674,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 },
};
-static const struct genl_ops dp_packet_genl_ops[] = {
+static const struct genl_small_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -664,8 +690,9 @@ static struct genl_family dp_packet_genl_family __ro_after_init = {
.policy = packet_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_packet_genl_ops,
- .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+ .small_ops = dp_packet_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops),
+ .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1,
.module = THIS_MODULE,
};
@@ -697,6 +724,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
stats->n_missed += local_stats.n_missed;
stats->n_lost += local_stats.n_lost;
mega_stats->n_mask_hit += local_stats.n_mask_hit;
+ mega_stats->n_cache_hit += local_stats.n_cache_hit;
}
}
@@ -1075,11 +1103,12 @@ error:
}
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
-static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net,
- const struct nlattr *a,
- const struct sw_flow_key *key,
- const struct sw_flow_mask *mask,
- bool log)
+static noinline_for_stack
+struct sw_flow_actions *get_flow_actions(struct net *net,
+ const struct nlattr *a,
+ const struct sw_flow_key *key,
+ const struct sw_flow_mask *mask,
+ bool log)
{
struct sw_flow_actions *acts;
struct sw_flow_key masked_key;
@@ -1378,7 +1407,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ovs_notify(&dp_flow_genl_family, reply, info);
} else {
- netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply));
+ netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0,
+ PTR_ERR(reply));
}
}
@@ -1446,7 +1476,7 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
[OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
};
-static const struct genl_ops dp_flow_genl_ops[] = {
+static const struct genl_small_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1478,8 +1508,9 @@ static struct genl_family dp_flow_genl_family __ro_after_init = {
.policy = flow_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_flow_genl_ops,
- .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
+ .small_ops = dp_flow_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops),
+ .resv_start_op = OVS_FLOW_CMD_SET + 1,
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -1493,6 +1524,8 @@ static size_t ovs_dp_cmd_msg_size(void)
msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats));
msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats));
msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */
+ msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */
return msgsize;
}
@@ -1504,10 +1537,11 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
struct ovs_header *ovs_header;
struct ovs_dp_stats dp_stats;
struct ovs_dp_megaflow_stats dp_megaflow_stats;
- int err;
+ struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids);
+ int err, pids_len;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
- flags, cmd);
+ flags, cmd);
if (!ovs_header)
goto error;
@@ -1530,6 +1564,16 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
goto nla_put_failure;
+ if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE,
+ ovs_flow_tbl_masks_cache_size(&dp->table)))
+ goto nla_put_failure;
+
+ if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) {
+ pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32);
+ if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids))
+ goto nla_put_failure;
+ }
+
genlmsg_end(skb, ovs_header);
return 0;
@@ -1562,30 +1606,85 @@ static struct datapath *lookup_datapath(struct net *net,
return dp ? dp : ERR_PTR(-ENODEV);
}
-static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
+static void ovs_dp_reset_user_features(struct sk_buff *skb,
+ struct genl_info *info)
{
struct datapath *dp;
- dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr,
+ info->attrs);
if (IS_ERR(dp))
return;
- WARN(dp->user_features, "Dropping previously announced user features\n");
+ pr_warn("%s: Dropping previously announced user features\n",
+ ovs_dp_name(dp));
dp->user_features = 0;
}
-DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
+static int ovs_dp_set_upcall_portids(struct datapath *dp,
+ const struct nlattr *ids)
+{
+ struct dp_nlsk_pids *old, *dp_nlsk_pids;
+
+ if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
+ return -EINVAL;
+
+ old = ovsl_dereference(dp->upcall_portids);
+
+ dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids),
+ GFP_KERNEL);
+ if (!dp_nlsk_pids)
+ return -ENOMEM;
+
+ dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32);
+ nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids));
+
+ rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids);
+
+ kfree_rcu(old, rcu);
+
+ return 0;
+}
+
+u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
+{
+ struct dp_nlsk_pids *dp_nlsk_pids;
+
+ dp_nlsk_pids = rcu_dereference(dp->upcall_portids);
+
+ if (dp_nlsk_pids) {
+ if (cpu_id < dp_nlsk_pids->n_pids) {
+ return dp_nlsk_pids->pids[cpu_id];
+ } else if (dp_nlsk_pids->n_pids > 0 &&
+ cpu_id >= dp_nlsk_pids->n_pids) {
+ /* If the number of netlink PIDs is mismatched with
+ * the number of CPUs as seen by the kernel, log this
+ * and send the upcall to an arbitrary socket (0) in
+ * order to not drop packets
+ */
+ pr_info_ratelimited("cpu_id mismatch with handler threads");
+ return dp_nlsk_pids->pids[cpu_id %
+ dp_nlsk_pids->n_pids];
+ } else {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+}
static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
- u32 user_features = 0;
+ u32 user_features = 0, old_features = dp->user_features;
+ int err;
if (a[OVS_DP_ATTR_USER_FEATURES]) {
user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
if (user_features & ~(OVS_DP_F_VPORT_PIDS |
OVS_DP_F_UNALIGNED |
- OVS_DP_F_TC_RECIRC_SHARING))
+ OVS_DP_F_TC_RECIRC_SHARING |
+ OVS_DP_F_DISPATCH_UPCALL_PER_CPU))
return -EOPNOTSUPP;
#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
@@ -1594,12 +1693,33 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
#endif
}
+ if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) {
+ int err;
+ u32 cache_size;
+
+ cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]);
+ err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size);
+ if (err)
+ return err;
+ }
+
dp->user_features = user_features;
- if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
- static_branch_enable(&tc_recirc_sharing_support);
- else
- static_branch_disable(&tc_recirc_sharing_support);
+ if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU &&
+ a[OVS_DP_ATTR_PER_CPU_PIDS]) {
+ /* Upcall Netlink Port IDs have been updated */
+ err = ovs_dp_set_upcall_portids(dp,
+ a[OVS_DP_ATTR_PER_CPU_PIDS]);
+ if (err)
+ return err;
+ }
+
+ if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) &&
+ !(old_features & OVS_DP_F_TC_RECIRC_SHARING))
+ tc_skb_ext_tc_enable();
+ else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) &&
+ (old_features & OVS_DP_F_TC_RECIRC_SHARING))
+ tc_skb_ext_tc_disable();
return 0;
}
@@ -1678,14 +1798,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
parms.dp = dp;
parms.port_no = OVSP_LOCAL;
parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
-
- err = ovs_dp_change(dp, a);
- if (err)
- goto err_destroy_meters;
+ parms.desired_ifindex = a[OVS_DP_ATTR_IFINDEX]
+ ? nla_get_u32(a[OVS_DP_ATTR_IFINDEX]) : 0;
/* So far only local changes have been made, now need the lock. */
ovs_lock();
+ err = ovs_dp_change(dp, a);
+ if (err)
+ goto err_unlock_and_destroy_meters;
+
vport = new_vport(&parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
@@ -1701,8 +1823,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
- ovs_unlock();
- goto err_destroy_meters;
+ goto err_destroy_portids;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1717,7 +1838,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_notify(&dp_datapath_genl_family, reply, info);
return 0;
-err_destroy_meters:
+err_destroy_portids:
+ kfree(rcu_dereference_raw(dp->upcall_portids));
+err_unlock_and_destroy_meters:
+ ovs_unlock();
ovs_meters_exit(dp);
err_destroy_ports:
kfree(dp->ports);
@@ -1736,8 +1860,12 @@ err:
/* Called with ovs_mutex. */
static void __dp_destroy(struct datapath *dp)
{
+ struct flow_table *table = &dp->table;
int i;
+ if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
+ tc_skb_ext_tc_disable();
+
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
struct hlist_node *n;
@@ -1754,7 +1882,14 @@ static void __dp_destroy(struct datapath *dp)
*/
ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
- /* RCU destroy the flow table */
+ /* Flush sw_flow in the tables. RCU cb only releases resource
+ * such as dp, ports and tables. That may avoid some issues
+ * such as RCU usage warning.
+ */
+ table_instance_flow_flush(table, ovsl_dereference(table->ti),
+ ovsl_dereference(table->ufid_ti));
+
+ /* RCU destroy the ports, meters and flow tables. */
call_rcu(&dp->rcu, destroy_dp_rcu);
}
@@ -1882,9 +2017,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
[OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
+ [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0,
+ PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)),
+ [OVS_DP_ATTR_IFINDEX] = {.type = NLA_U32 },
};
-static const struct genl_ops dp_datapath_genl_ops[] = {
+static const struct genl_small_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -1916,8 +2054,9 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = {
.policy = datapath_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_datapath_genl_ops,
- .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
+ .small_ops = dp_datapath_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops),
+ .resv_start_op = OVS_DP_CMD_SET + 1,
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -2045,7 +2184,7 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
dev = vport->dev;
dev_headroom = netdev_get_fwd_headroom(dev);
if (dev_headroom > max_headroom)
@@ -2063,10 +2202,11 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
int i;
dp->max_headroom = new_headroom;
- for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
- lockdep_ovsl_is_held())
+ lockdep_ovsl_is_held())
netdev_set_rx_headroom(vport->dev, new_headroom);
+ }
}
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
@@ -2084,7 +2224,10 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
!a[OVS_VPORT_ATTR_UPCALL_PID])
return -EINVAL;
- if (a[OVS_VPORT_ATTR_IFINDEX])
+
+ parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
+
+ if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL)
return -EOPNOTSUPP;
port_no = a[OVS_VPORT_ATTR_PORT_NO]
@@ -2121,11 +2264,12 @@ restart:
}
parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
- parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
parms.options = a[OVS_VPORT_ATTR_OPTIONS];
parms.dp = dp;
parms.port_no = port_no;
parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
+ parms.desired_ifindex = a[OVS_VPORT_ATTR_IFINDEX]
+ ? nla_get_u32(a[OVS_VPORT_ATTR_IFINDEX]) : 0;
vport = new_vport(&parms);
err = PTR_ERR(vport);
@@ -2338,6 +2482,23 @@ out:
return skb->len;
}
+static void ovs_dp_masks_rebalance(struct work_struct *work)
+{
+ struct ovs_net *ovs_net = container_of(work, struct ovs_net,
+ masks_rebalance.work);
+ struct datapath *dp;
+
+ ovs_lock();
+
+ list_for_each_entry(dp, &ovs_net->dps, list_node)
+ ovs_flow_masks_rebalance(&dp->table);
+
+ ovs_unlock();
+
+ schedule_delayed_work(&ovs_net->masks_rebalance,
+ msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL));
+}
+
static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
@@ -2349,7 +2510,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
};
-static const struct genl_ops dp_vport_genl_ops[] = {
+static const struct genl_small_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
@@ -2381,8 +2542,9 @@ struct genl_family dp_vport_genl_family __ro_after_init = {
.policy = vport_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_vport_genl_ops,
- .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .small_ops = dp_vport_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .resv_start_op = OVS_VPORT_CMD_SET + 1,
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -2429,10 +2591,19 @@ error:
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ int err;
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
- return ovs_ct_init(net);
+ INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance);
+
+ err = ovs_ct_init(net);
+ if (err)
+ return err;
+
+ schedule_delayed_work(&ovs_net->masks_rebalance,
+ msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL));
+ return 0;
}
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2466,8 +2637,10 @@ static void __net_exit ovs_exit_net(struct net *dnet)
struct net *net;
LIST_HEAD(head);
- ovs_ct_exit(dnet);
ovs_lock();
+
+ ovs_ct_exit(dnet);
+
list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
__dp_destroy(dp);
@@ -2484,6 +2657,7 @@ static void __net_exit ovs_exit_net(struct net *dnet)
ovs_unlock();
+ cancel_delayed_work_sync(&ovs_net->masks_rebalance);
cancel_work_sync(&ovs_net->dp_notify_work);
}
@@ -2498,7 +2672,8 @@ static int __init dp_init(void)
{
int err;
- BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) >
+ sizeof_field(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath\n");
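
Note: the OVS_DP_F_DISPATCH_UPCALL_PER_CPU support added above lets userspace hand the datapath one Netlink portid per CPU via OVS_DP_ATTR_PER_CPU_PIDS, and ovs_dp_get_upcall_portid() then picks the socket by the CPU that is processing the packet. A minimal sketch of that selection rule, mirroring the function above (names here are illustrative only):

#include <stdint.h>

/* Index the userspace-supplied portid array by the processing CPU,
 * wrapping around when userspace registered fewer sockets than there
 * are CPUs instead of dropping the upcall. */
static uint32_t pick_upcall_portid(const uint32_t *pids, uint32_t n_pids,
				   uint32_t cpu_id)
{
	if (!n_pids)
		return 0;		/* no sockets: the upcall fails with -ENOTCONN */
	if (cpu_id < n_pids)
		return pids[cpu_id];
	return pids[cpu_id % n_pids];
}
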
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index e239a46c2f94..0cd29971a907 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -20,8 +20,9 @@
#include "meter.h"
#include "vport-internal_dev.h"
-#define DP_MAX_PORTS USHRT_MAX
-#define DP_VPORT_HASH_BUCKETS 1024
+#define DP_MAX_PORTS USHRT_MAX
+#define DP_VPORT_HASH_BUCKETS 1024
+#define DP_MASKS_REBALANCE_INTERVAL 4000
/**
* struct dp_stats_percpu - per-cpu packet processing statistics for a given
@@ -37,16 +38,34 @@
* @n_mask_hit: Number of masks looked up for flow match.
* @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked
* up per packet.
+ * @n_cache_hit: The number of received packets that had their mask found using
+ * the mask cache.
*/
struct dp_stats_percpu {
u64 n_hit;
u64 n_missed;
u64 n_lost;
u64 n_mask_hit;
+ u64 n_cache_hit;
struct u64_stats_sync syncp;
};
/**
+ * struct dp_nlsk_pids - array of netlink portids of for a datapath.
+ * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU
+ * is enabled and must be protected by rcu.
+ * @rcu: RCU callback head for deferred destruction.
+ * @n_pids: Size of @pids array.
+ * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets
+ * that miss the flow table.
+ */
+struct dp_nlsk_pids {
+ struct rcu_head rcu;
+ u32 n_pids;
+ u32 pids[];
+};
+
+/**
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
* @list_node: Element in global 'dps' list.
@@ -57,6 +76,7 @@ struct dp_stats_percpu {
* @net: Reference to net namespace.
* @max_headroom: the maximum headroom of all vports in this datapath; it will
* be used by all the internal vports in this dp.
+ * @upcall_portids: RCU protected 'struct dp_nlsk_pids'.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
@@ -82,7 +102,9 @@ struct datapath {
u32 max_headroom;
/* Switch meters. */
- struct hlist_head *meters;
+ struct dp_meter_table meter_tbl;
+
+ struct dp_nlsk_pids __rcu *upcall_portids;
};
/**
@@ -131,6 +153,7 @@ struct dp_upcall_info {
struct ovs_net {
struct list_head dps;
struct work_struct dp_notify_work;
+ struct delayed_work masks_rebalance;
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
struct ovs_ct_limit_info *ct_limit_info;
#endif
@@ -230,14 +253,14 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
extern struct notifier_block ovs_dp_device_notifier;
extern struct genl_family dp_vport_genl_family;
-DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
-
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key);
void ovs_dp_detach_port(struct vport *);
int ovs_dp_upcall(struct datapath *, struct sk_buff *,
const struct sw_flow_key *, const struct dp_upcall_info *,
uint32_t cutlen);
+u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id);
+
const char *ovs_dp_name(const struct datapath *dp);
struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
u32 portid, u32 seq, u8 cmd);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 9d375e74b607..e20d1a973417 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -34,6 +34,8 @@
#include <net/mpls.h>
#include <net/ndisc.h>
#include <net/nsh.h>
+#include <net/pkt_cls.h>
+#include <net/netfilter/nf_conntrack_zones.h>
#include "conntrack.h"
#include "datapath.h"
@@ -239,6 +241,144 @@ static bool icmphdr_ok(struct sk_buff *skb)
sizeof(struct icmphdr));
}
+/**
+ * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags.
+ *
+ * @skb: buffer where extension header data starts in packet
+ * @nh: ipv6 header
+ * @ext_hdrs: flags are stored here
+ *
+ * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header
+ * is unexpectedly encountered. (Two destination options headers may be
+ * expected and would not cause this bit to be set.)
+ *
+ * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order
+ * preferred (but not required) by RFC 2460:
+ *
+ * When more than one extension header is used in the same packet, it is
+ * recommended that those headers appear in the following order:
+ * IPv6 header
+ * Hop-by-Hop Options header
+ * Destination Options header
+ * Routing header
+ * Fragment header
+ * Authentication header
+ * Encapsulating Security Payload header
+ * Destination Options header
+ * upper-layer header
+ */
+static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh,
+ u16 *ext_hdrs)
+{
+ u8 next_type = nh->nexthdr;
+ unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+ int dest_options_header_count = 0;
+
+ *ext_hdrs = 0;
+
+ while (ipv6_ext_hdr(next_type)) {
+ struct ipv6_opt_hdr _hdr, *hp;
+
+ switch (next_type) {
+ case IPPROTO_NONE:
+ *ext_hdrs |= OFPIEH12_NONEXT;
+ /* stop parsing */
+ return;
+
+ case IPPROTO_ESP:
+ if (*ext_hdrs & OFPIEH12_ESP)
+ *ext_hdrs |= OFPIEH12_UNREP;
+ if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST |
+ OFPIEH12_ROUTER | IPPROTO_FRAGMENT |
+ OFPIEH12_AUTH | OFPIEH12_UNREP)) ||
+ dest_options_header_count >= 2) {
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ }
+ *ext_hdrs |= OFPIEH12_ESP;
+ break;
+
+ case IPPROTO_AH:
+ if (*ext_hdrs & OFPIEH12_AUTH)
+ *ext_hdrs |= OFPIEH12_UNREP;
+ if ((*ext_hdrs &
+ ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER |
+ IPPROTO_FRAGMENT | OFPIEH12_UNREP)) ||
+ dest_options_header_count >= 2) {
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ }
+ *ext_hdrs |= OFPIEH12_AUTH;
+ break;
+
+ case IPPROTO_DSTOPTS:
+ if (dest_options_header_count == 0) {
+ if (*ext_hdrs &
+ ~(OFPIEH12_HOP | OFPIEH12_UNREP))
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ *ext_hdrs |= OFPIEH12_DEST;
+ } else if (dest_options_header_count == 1) {
+ if (*ext_hdrs &
+ ~(OFPIEH12_HOP | OFPIEH12_DEST |
+ OFPIEH12_ROUTER | OFPIEH12_FRAG |
+ OFPIEH12_AUTH | OFPIEH12_ESP |
+ OFPIEH12_UNREP)) {
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ }
+ } else {
+ *ext_hdrs |= OFPIEH12_UNREP;
+ }
+ dest_options_header_count++;
+ break;
+
+ case IPPROTO_FRAGMENT:
+ if (*ext_hdrs & OFPIEH12_FRAG)
+ *ext_hdrs |= OFPIEH12_UNREP;
+ if ((*ext_hdrs & ~(OFPIEH12_HOP |
+ OFPIEH12_DEST |
+ OFPIEH12_ROUTER |
+ OFPIEH12_UNREP)) ||
+ dest_options_header_count >= 2) {
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ }
+ *ext_hdrs |= OFPIEH12_FRAG;
+ break;
+
+ case IPPROTO_ROUTING:
+ if (*ext_hdrs & OFPIEH12_ROUTER)
+ *ext_hdrs |= OFPIEH12_UNREP;
+ if ((*ext_hdrs & ~(OFPIEH12_HOP |
+ OFPIEH12_DEST |
+ OFPIEH12_UNREP)) ||
+ dest_options_header_count >= 2) {
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ }
+ *ext_hdrs |= OFPIEH12_ROUTER;
+ break;
+
+ case IPPROTO_HOPOPTS:
+ if (*ext_hdrs & OFPIEH12_HOP)
+ *ext_hdrs |= OFPIEH12_UNREP;
+ /* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6
+ * extension header is present as the first
+ * extension header in the packet.
+ */
+ if (*ext_hdrs == 0)
+ *ext_hdrs |= OFPIEH12_HOP;
+ else
+ *ext_hdrs |= OFPIEH12_UNSEQ;
+ break;
+
+ default:
+ return;
+ }
+
+ hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+ if (!hp)
+ break;
+ next_type = hp->nexthdr;
+ start += ipv6_optlen(hp);
+ }
+}
+
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
unsigned short frag_off;
@@ -254,6 +394,8 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
nh = ipv6_hdr(skb);
+ get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs);
+
key->ip.proto = NEXTHDR_NONE;
key->ip.tos = ipv6_get_dsfield(nh);
key->ip.ttl = nh->hop_limit;
@@ -265,7 +407,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
if (flags & IP6_FH_F_FRAG) {
if (frag_off) {
key->ip.frag = OVS_FRAG_TYPE_LATER;
- key->ip.proto = nexthdr;
+ key->ip.proto = NEXTHDR_FRAGMENT;
return 0;
}
key->ip.frag = OVS_FRAG_TYPE_FIRST;
@@ -293,10 +435,14 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
}
/**
- * Parse vlan tag from vlan header.
- * Returns ERROR on memory error.
- * Returns 0 if it encounters a non-vlan or incomplete packet.
- * Returns 1 after successfully parsing vlan tag.
+ * parse_vlan_tag - Parse vlan tag from vlan header.
+ * @skb: skb containing frame to parse
+ * @key_vh: pointer to parsed vlan tag
+ * @untag_vlan: should the vlan header be removed from the frame
+ *
+ * Return: ERROR on memory error.
+ * %0 if it encounters a non-vlan or incomplete packet.
+ * %1 after successfully parsing vlan tag.
*/
static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
bool untag_vlan)
@@ -528,6 +674,7 @@ static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
* L3 header
* @key: output flow key
*
+ * Return: %0 if successful, otherwise a negative errno value.
*/
static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
@@ -675,7 +822,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
case -EINVAL:
memset(&key->ip, 0, sizeof(key->ip));
memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
- /* fall-through */
+ fallthrough;
case -EPROTO:
skb->transport_header = skb->network_header;
error = 0;
@@ -744,8 +891,6 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
*
* The caller must ensure that skb->len >= ETH_HLEN.
*
- * Returns 0 if successful, otherwise a negative errno value.
- *
* Initializes @skb header fields as follows:
*
* - skb->mac_header: the L2 header.
@@ -760,6 +905,8 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
*
* - skb->protocol: the type of the data starting at skb->network_header.
* Equals to key->eth.type.
+ *
+ * Return: %0 if successful, otherwise a negative errno value.
*/
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
@@ -853,7 +1000,9 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
struct tc_skb_ext *tc_ext;
#endif
+ bool post_ct = false, post_ct_snat = false, post_ct_dnat = false;
int res, err;
+ u16 zone = 0;
/* Extract metadata from packet. */
if (tun_info) {
@@ -887,9 +1036,14 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->mac_proto = res;
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- if (static_branch_unlikely(&tc_recirc_sharing_support)) {
+ if (tc_skb_ext_tc_enabled()) {
tc_ext = skb_ext_find(skb, TC_SKB_EXT);
key->recirc_id = tc_ext ? tc_ext->chain : 0;
+ OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
+ post_ct = tc_ext ? tc_ext->post_ct : false;
+ post_ct_snat = post_ct ? tc_ext->post_ct_snat : false;
+ post_ct_dnat = post_ct ? tc_ext->post_ct_dnat : false;
+ zone = post_ct ? tc_ext->zone : 0;
} else {
key->recirc_id = 0;
}
@@ -898,8 +1052,19 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
#endif
err = key_extract(skb, key);
- if (!err)
- ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
+ if (!err) {
+ ovs_ct_fill_key(skb, key, post_ct); /* Must be after key_extract(). */
+ if (post_ct) {
+ if (!skb_get_nfct(skb)) {
+ key->ct_zone = zone;
+ } else {
+ if (!post_ct_dnat)
+ key->ct_state &= ~OVS_CS_F_DST_NAT;
+ if (!post_ct_snat)
+ key->ct_state &= ~OVS_CS_F_SRC_NAT;
+ }
+ }
+ }
return err;
}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 758a8c77f736..073ab73ffeaa 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -32,6 +32,19 @@ enum sw_flow_mac_proto {
#define SW_FLOW_KEY_INVALID 0x80
#define MPLS_LABEL_DEPTH 3
+/* Bit definitions for IPv6 Extension Header pseudo-field. */
+enum ofp12_ipv6exthdr_flags {
+ OFPIEH12_NONEXT = 1 << 0, /* "No next header" encountered. */
+ OFPIEH12_ESP = 1 << 1, /* Encrypted Sec Payload header present. */
+ OFPIEH12_AUTH = 1 << 2, /* Authentication header present. */
+ OFPIEH12_DEST = 1 << 3, /* 1 or 2 dest headers present. */
+ OFPIEH12_FRAG = 1 << 4, /* Fragment header present. */
+ OFPIEH12_ROUTER = 1 << 5, /* Router header present. */
+ OFPIEH12_HOP = 1 << 6, /* Hop-by-hop header present. */
+ OFPIEH12_UNREP = 1 << 7, /* Unexpected repeats encountered. */
+ OFPIEH12_UNSEQ = 1 << 8 /* Unexpected sequencing encountered. */
+};
+
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -121,6 +134,7 @@ struct sw_flow_key {
struct in6_addr dst; /* IPv6 destination address. */
} addr;
__be32 label; /* IPv6 flow label. */
+ u16 exthdrs; /* IPv6 extension header flags */
union {
struct {
struct in6_addr src;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 288122eec7c8..4a07ab094a84 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -80,6 +80,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
case OVS_ACTION_ATTR_METER:
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
case OVS_ACTION_ATTR_ADD_MPLS:
+ case OVS_ACTION_ATTR_DEC_TTL:
default:
return true;
}
@@ -345,7 +346,7 @@ size_t ovs_key_attr_size(void)
/* Whenever adding new OVS_KEY_ FIELDS, we should consider
* updating this function.
*/
- BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29);
+ BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 32);
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -368,7 +369,8 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */
+ nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */
- + nla_total_size(28); /* OVS_KEY_ATTR_ND */
+ + nla_total_size(28) /* OVS_KEY_ATTR_ND */
+ + nla_total_size(2); /* OVS_KEY_ATTR_IPV6_EXTHDRS */
}
static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = {
@@ -436,6 +438,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
[OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED,
.next = ovs_nsh_key_attr_lens, },
+ [OVS_KEY_ATTR_IPV6_EXTHDRS] = {
+ .len = sizeof(struct ovs_key_ipv6_exthdrs) },
};
static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -478,7 +482,14 @@ static int __parse_flow_nlattrs(const struct nlattr *attr,
return -EINVAL;
}
- if (attrs & (1 << type)) {
+ if (type == OVS_KEY_ATTR_PACKET_TYPE ||
+ type == OVS_KEY_ATTR_ND_EXTENSIONS ||
+ type == OVS_KEY_ATTR_TUNNEL_INFO) {
+ OVS_NLERR(log, "Key type %d is not supported", type);
+ return -EINVAL;
+ }
+
+ if (attrs & (1ULL << type)) {
OVS_NLERR(log, "Duplicate key (type %d).", type);
return -EINVAL;
}
@@ -491,7 +502,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr,
}
if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) {
- attrs |= 1 << type;
+ attrs |= 1ULL << type;
a[type] = nla;
}
}
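Editor's note: the switch from "1 << type" to "1ULL << type" matters because, with OVS_KEY_ATTR_MAX now asserted to be 32, attribute types can exceed what a 32-bit shift can represent. A standalone sketch of the difference, with the type value 32 used purely as an assumed illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t attrs = 0;
	int type = 32;		/* assumed: an attribute type at the new maximum */

	/* "1 << type" would be undefined behaviour for a 32-bit int here;
	 * the ULL suffix makes the shift happen in 64 bits.
	 */
	attrs |= 1ULL << type;
	printf("attrs = %#llx\n", (unsigned long long)attrs);
	return 0;
}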
@@ -1596,6 +1607,17 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
}
+ if (attrs & (1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS)) {
+ const struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key;
+
+ ipv6_exthdrs_key = nla_data(a[OVS_KEY_ATTR_IPV6_EXTHDRS]);
+
+ SW_FLOW_KEY_PUT(match, ipv6.exthdrs,
+ ipv6_exthdrs_key->hdrs, is_mask);
+
+ attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS);
+ }
+
if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
const struct ovs_key_arp *arp_key;
@@ -1762,11 +1784,11 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val)
* does not include any don't care bit.
* @net: Used to determine per-namespace field support.
* @match: receives the extracted flow match information.
- * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * @nla_key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
* sequence. The fields should be those of the packet that triggered the creation
* of this flow.
- * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
- * attribute specifies the mask field of the wildcarded flow.
+ * @nla_mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_*
+ * Netlink attribute specifies the mask field of the wildcarded flow.
* @log: Boolean to allow kernel error logging. Normally true, but when
* probing for feature compatibility this should be passed in as false to
* suppress unnecessary error logging.
@@ -2098,6 +2120,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
ipv4_key->ipv4_frag = output->ip.frag;
} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
struct ovs_key_ipv6 *ipv6_key;
+ struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key;
nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
if (!nla)
@@ -2112,6 +2135,13 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
ipv6_key->ipv6_tclass = output->ip.tos;
ipv6_key->ipv6_hlimit = output->ip.ttl;
ipv6_key->ipv6_frag = output->ip.frag;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6_EXTHDRS,
+ sizeof(*ipv6_exthdrs_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_exthdrs_key = nla_data(nla);
+ ipv6_exthdrs_key->hdrs = output->ipv6.exthdrs;
} else if (swkey->eth.type == htons(ETH_P_NSH)) {
if (nsh_key_to_nlattr(&output->nsh, is_mask, skb))
goto nla_put_failure;
@@ -2200,8 +2230,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
icmpv6_key->icmpv6_type = ntohs(output->tp.src);
icmpv6_key->icmpv6_code = ntohs(output->tp.dst);
- if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
- icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
+ if (swkey->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ swkey->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
struct ovs_key_nd *nd_key;
nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
@@ -2287,6 +2317,62 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size)
return sfa;
}
+static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len);
+
+static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action)
+{
+ const struct nlattr *a;
+ int rem;
+
+ nla_for_each_nested(a, action, rem) {
+ switch (nla_type(a)) {
+ case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL:
+ case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER:
+ ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
+ break;
+ }
+ }
+}
+
+static void ovs_nla_free_clone_action(const struct nlattr *action)
+{
+ const struct nlattr *a = nla_data(action);
+ int rem = nla_len(action);
+
+ switch (nla_type(a)) {
+ case OVS_CLONE_ATTR_EXEC:
+ /* The real list of actions follows this attribute. */
+ a = nla_next(a, &rem);
+ ovs_nla_free_nested_actions(a, rem);
+ break;
+ }
+}
+
+static void ovs_nla_free_dec_ttl_action(const struct nlattr *action)
+{
+ const struct nlattr *a = nla_data(action);
+
+ switch (nla_type(a)) {
+ case OVS_DEC_TTL_ATTR_ACTION:
+ ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
+ break;
+ }
+}
+
+static void ovs_nla_free_sample_action(const struct nlattr *action)
+{
+ const struct nlattr *a = nla_data(action);
+ int rem = nla_len(action);
+
+ switch (nla_type(a)) {
+ case OVS_SAMPLE_ATTR_ARG:
+ /* The real list of actions follows this attribute. */
+ a = nla_next(a, &rem);
+ ovs_nla_free_nested_actions(a, rem);
+ break;
+ }
+}
+
static void ovs_nla_free_set_action(const struct nlattr *a)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -2300,25 +2386,54 @@ static void ovs_nla_free_set_action(const struct nlattr *a)
}
}
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len)
{
const struct nlattr *a;
int rem;
- if (!sf_acts)
+ /* Whenever new actions are added, the need to update this
+ * function should be considered.
+ */
+ BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23);
+
+ if (!actions)
return;
- nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+ nla_for_each_attr(a, actions, len, rem) {
switch (nla_type(a)) {
- case OVS_ACTION_ATTR_SET:
- ovs_nla_free_set_action(a);
+ case OVS_ACTION_ATTR_CHECK_PKT_LEN:
+ ovs_nla_free_check_pkt_len_action(a);
break;
+
+ case OVS_ACTION_ATTR_CLONE:
+ ovs_nla_free_clone_action(a);
+ break;
+
case OVS_ACTION_ATTR_CT:
ovs_ct_free_action(a);
break;
+
+ case OVS_ACTION_ATTR_DEC_TTL:
+ ovs_nla_free_dec_ttl_action(a);
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ ovs_nla_free_sample_action(a);
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ ovs_nla_free_set_action(a);
+ break;
}
}
+}
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ if (!sf_acts)
+ return;
+ ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len);
kfree(sf_acts);
}
@@ -2350,7 +2465,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
- if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
+ if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
OVS_NLERR(log, "Flow action size exceeds max %u",
MAX_ACTIONS_BUFSIZE);
return ERR_PTR(-EMSGSIZE);
@@ -2495,6 +2610,63 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
return 0;
}
+static int validate_and_copy_dec_ttl(struct net *net,
+ const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci,
+ u32 mpls_label_count, bool log)
+{
+ const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1];
+ int start, action_start, err, rem;
+ const struct nlattr *a, *actions;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+
+ /* Ignore unknown attributes to be future proof. */
+ if (type > OVS_DEC_TTL_ATTR_MAX)
+ continue;
+
+ if (!type || attrs[type]) {
+ OVS_NLERR(log, "Duplicate or invalid key (type %d).",
+ type);
+ return -EINVAL;
+ }
+
+ attrs[type] = a;
+ }
+
+ if (rem) {
+ OVS_NLERR(log, "Message has %d unknown bytes.", rem);
+ return -EINVAL;
+ }
+
+ actions = attrs[OVS_DEC_TTL_ATTR_ACTION];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) {
+ OVS_NLERR(log, "Missing valid actions attribute.");
+ return -EINVAL;
+ }
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log);
+ if (start < 0)
+ return start;
+
+ action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log);
+ if (action_start < 0)
+ return action_start;
+
+ err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type,
+ vlan_tci, mpls_label_count, log);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, action_start);
+ add_nested_action_end(*sfa, start);
+ return 0;
+}
+
static int validate_and_copy_clone(struct net *net,
const struct nlattr *attr,
const struct sw_flow_key *key,
@@ -3009,6 +3181,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_CLONE] = (u32)-1,
[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
[OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls),
+ [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -3131,7 +3304,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
/* Disallow subsequent L2.5+ set actions and mpls_pop
* actions once the last MPLS label in the packet is
- * is popped as there is no check here to ensure that
+ * popped as there is no check here to ensure that
* the new eth type is valid and thus set actions could
* write off the end of the packet or otherwise corrupt
* it.
@@ -3269,6 +3442,15 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
break;
}
+ case OVS_ACTION_ATTR_DEC_TTL:
+ err = validate_and_copy_dec_ttl(net, a, key, sfa,
+ eth_type, vlan_tci,
+ mpls_label_count, log);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
@@ -3361,7 +3543,9 @@ static int clone_action_to_attr(const struct nlattr *attr,
if (!start)
return -EMSGSIZE;
- err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+ /* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */
+ attr = nla_next(nla_data(attr), &rem);
+ err = ovs_nla_put_actions(attr, rem, skb);
if (err)
nla_nest_cancel(skb, start);
@@ -3440,6 +3624,48 @@ out:
return err;
}
+static int dec_ttl_action_to_attr(const struct nlattr *attr,
+ struct sk_buff *skb)
+{
+ struct nlattr *start, *action_start;
+ const struct nlattr *a;
+ int err = 0, rem;
+
+ start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL);
+ if (!start)
+ return -EMSGSIZE;
+
+ nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(a)) {
+ case OVS_DEC_TTL_ATTR_ACTION:
+
+ action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION);
+ if (!action_start) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ goto out;
+
+ nla_nest_end(skb, action_start);
+ break;
+
+ default:
+ /* Ignore all other options to be future compatible. */
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+
+out:
+ nla_nest_cancel(skb, start);
+ return err;
+}
+
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -3540,6 +3766,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
return err;
break;
+ case OVS_ACTION_ATTR_DEC_TTL:
+ err = dec_ttl_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index fd8a01ca7a2d..d4a2db0b2299 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -29,6 +29,7 @@
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
+#include <linux/sort.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
@@ -37,8 +38,8 @@
#define MASK_ARRAY_SIZE_MIN 16
#define REHASH_INTERVAL (10 * 60 * HZ)
+#define MC_DEFAULT_HASH_ENTRIES 256
#define MC_HASH_SHIFT 8
-#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT)
#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
static struct kmem_cache *flow_cache;
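Editor's note: with the cache size now configurable, MC_HASH_SHIFT still controls how one skb_hash probes several cache slots, one MC_HASH_SHIFT-bit segment at a time. A standalone sketch of the indexing, assuming the default values above and an example hash (cache_size must be a power of two for the mask to work):

#include <stdio.h>

#define MC_HASH_SHIFT	8
#define MC_HASH_SEGS	((sizeof(unsigned int) * 8) / MC_HASH_SHIFT)

int main(void)
{
	unsigned int skb_hash = 0x12345678;	/* assumed example hash */
	unsigned int cache_size = 256;		/* MC_DEFAULT_HASH_ENTRIES */
	unsigned int hash = skb_hash;
	unsigned int seg;

	for (seg = 0; seg < MC_HASH_SEGS; seg++) {
		printf("segment %u probes slot %u\n",
		       seg, hash & (cache_size - 1));
		hash >>= MC_HASH_SHIFT;
	}
	return 0;
}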
@@ -110,12 +111,16 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
- ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
+ flow->sf_acts);
/* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask))
+ for (cpu = 0; cpu < nr_cpu_ids;
+ cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
(struct sw_flow_stats __force *)flow->stats[cpu]);
+ }
+
kmem_cache_free(flow_cache, flow);
}
@@ -163,22 +168,76 @@ static struct table_instance *table_instance_alloc(int new_size)
ti->n_buckets = new_size;
ti->node_ver = 0;
- ti->keep_flows = false;
get_random_bytes(&ti->hash_seed, sizeof(u32));
return ti;
}
+static void __mask_array_destroy(struct mask_array *ma)
+{
+ free_percpu(ma->masks_usage_stats);
+ kfree(ma);
+}
+
+static void mask_array_rcu_cb(struct rcu_head *rcu)
+{
+ struct mask_array *ma = container_of(rcu, struct mask_array, rcu);
+
+ __mask_array_destroy(ma);
+}
+
+static void tbl_mask_array_reset_counters(struct mask_array *ma)
+{
+ int i, cpu;
+
+ /* As the per-CPU counters are not atomic we cannot go ahead and
+ * reset them from another CPU. To still have an approximate
+ * zero-based counter we store the value at reset, and subtract it
+ * later when processing.
+ */
+ for (i = 0; i < ma->max; i++) {
+ ma->masks_usage_zero_cntr[i] = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mask_array_stats *stats;
+ unsigned int start;
+ u64 counter;
+
+ stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ counter = stats->usage_cntrs[i];
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ ma->masks_usage_zero_cntr[i] += counter;
+ }
+ }
+}
+
static struct mask_array *tbl_mask_array_alloc(int size)
{
struct mask_array *new;
size = max(MASK_ARRAY_SIZE_MIN, size);
new = kzalloc(sizeof(struct mask_array) +
- sizeof(struct sw_flow_mask *) * size, GFP_KERNEL);
+ sizeof(struct sw_flow_mask *) * size +
+ sizeof(u64) * size, GFP_KERNEL);
if (!new)
return NULL;
+ new->masks_usage_zero_cntr = (u64 *)((u8 *)new +
+ sizeof(struct mask_array) +
+ sizeof(struct sw_flow_mask *) *
+ size);
+
+ new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
+ sizeof(u64) * size,
+ __alignof__(u64));
+ if (!new->masks_usage_stats) {
+ kfree(new);
+ return NULL;
+ }
+
new->count = 0;
new->max = size;
@@ -202,10 +261,10 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
if (ovsl_dereference(old->masks[i]))
new->masks[new->count++] = old->masks[i];
}
+ call_rcu(&old->rcu, mask_array_rcu_cb);
}
rcu_assign_pointer(tbl->mask_array, new);
- kfree_rcu(old, rcu);
return 0;
}
@@ -218,17 +277,22 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
if (ma_count >= ma->max) {
err = tbl_mask_array_realloc(tbl, ma->max +
- MASK_ARRAY_SIZE_MIN);
+ MASK_ARRAY_SIZE_MIN);
if (err)
return err;
ma = ovsl_dereference(tbl->mask_array);
+ } else {
+ /* On every add or delete we need to reset the counters so
+ * every new mask gets a fair chance of being prioritized.
+ */
+ tbl_mask_array_reset_counters(ma);
}
BUG_ON(ovsl_dereference(ma->masks[ma_count]));
rcu_assign_pointer(ma->masks[ma_count], new);
- WRITE_ONCE(ma->count, ma_count +1);
+ WRITE_ONCE(ma->count, ma_count + 1);
return 0;
}
@@ -249,10 +313,10 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl,
return;
found:
- WRITE_ONCE(ma->count, ma_count -1);
+ WRITE_ONCE(ma->count, ma_count - 1);
- rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]);
- RCU_INIT_POINTER(ma->masks[ma_count -1], NULL);
+ rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
+ RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);
kfree_rcu(mask, rcu);
@@ -260,6 +324,9 @@ found:
if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
ma_count <= (ma->max / 3))
tbl_mask_array_realloc(tbl, ma->max / 2);
+ else
+ tbl_mask_array_reset_counters(ma);
+
}
/* Remove 'mask' from the mask list, if it is not needed any more. */
@@ -278,15 +345,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
}
}
+static void __mask_cache_destroy(struct mask_cache *mc)
+{
+ free_percpu(mc->mask_cache);
+ kfree(mc);
+}
+
+static void mask_cache_rcu_cb(struct rcu_head *rcu)
+{
+ struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu);
+
+ __mask_cache_destroy(mc);
+}
+
+static struct mask_cache *tbl_mask_cache_alloc(u32 size)
+{
+ struct mask_cache_entry __percpu *cache = NULL;
+ struct mask_cache *new;
+
+ /* Only allow the size to be 0 or a power of 2 that does not
+ * exceed the per-CPU allocation size.
+ */
+ if ((!is_power_of_2(size) && size != 0) ||
+ (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
+ return NULL;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->cache_size = size;
+ if (new->cache_size > 0) {
+ cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry),
+ new->cache_size),
+ __alignof__(struct mask_cache_entry));
+ if (!cache) {
+ kfree(new);
+ return NULL;
+ }
+ }
+
+ new->mask_cache = cache;
+ return new;
+}
+
+int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
+{
+ struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
+ struct mask_cache *new;
+
+ if (size == mc->cache_size)
+ return 0;
+
+ if ((!is_power_of_2(size) && size != 0) ||
+ (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
+ return -EINVAL;
+
+ new = tbl_mask_cache_alloc(size);
+ if (!new)
+ return -ENOMEM;
+
+ rcu_assign_pointer(table->mask_cache, new);
+ call_rcu(&mc->rcu, mask_cache_rcu_cb);
+
+ return 0;
+}
+
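Editor's note: ovs_flow_tbl_masks_cache_resize() above accepts either 0 (cache disabled) or a power of two small enough to fit a per-CPU allocation. A standalone sketch of that constraint, not part of the patch, with the entry size and per-CPU limit as assumed placeholder values:

#include <stdbool.h>
#include <stdio.h>

#define ENTRY_SIZE	16		/* assumed sizeof(struct mask_cache_entry) */
#define PCPU_LIMIT	(32 * 1024)	/* assumed PCPU_MIN_UNIT_SIZE */

static bool cache_size_valid(unsigned int size)
{
	bool pow2 = size && !(size & (size - 1));

	if (!pow2 && size != 0)
		return false;
	return (unsigned long)size * ENTRY_SIZE <= PCPU_LIMIT;
}

int main(void)
{
	printf("0 -> %d, 256 -> %d, 300 -> %d\n",
	       cache_size_valid(0), cache_size_valid(256),
	       cache_size_valid(300));
	return 0;
}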
int ovs_flow_tbl_init(struct flow_table *table)
{
struct table_instance *ti, *ufid_ti;
+ struct mask_cache *mc;
struct mask_array *ma;
- table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) *
- MC_HASH_ENTRIES,
- __alignof__(struct mask_cache_entry));
- if (!table->mask_cache)
+ mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
+ if (!mc)
return -ENOMEM;
ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
@@ -304,6 +435,7 @@ int ovs_flow_tbl_init(struct flow_table *table)
rcu_assign_pointer(table->ti, ti);
rcu_assign_pointer(table->ufid_ti, ufid_ti);
rcu_assign_pointer(table->mask_array, ma);
+ rcu_assign_pointer(table->mask_cache, mc);
table->last_rehash = jiffies;
table->count = 0;
table->ufid_count = 0;
@@ -312,77 +444,71 @@ int ovs_flow_tbl_init(struct flow_table *table)
free_ti:
__table_instance_destroy(ti);
free_mask_array:
- kfree(ma);
+ __mask_array_destroy(ma);
free_mask_cache:
- free_percpu(table->mask_cache);
+ __mask_cache_destroy(mc);
return -ENOMEM;
}
static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
- struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+ struct table_instance *ti;
+ ti = container_of(rcu, struct table_instance, rcu);
__table_instance_destroy(ti);
}
static void table_instance_flow_free(struct flow_table *table,
- struct table_instance *ti,
- struct table_instance *ufid_ti,
- struct sw_flow *flow,
- bool count)
+ struct table_instance *ti,
+ struct table_instance *ufid_ti,
+ struct sw_flow *flow)
{
hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
- if (count)
- table->count--;
+ table->count--;
if (ovs_identifier_is_ufid(&flow->id)) {
hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
-
- if (count)
- table->ufid_count--;
+ table->ufid_count--;
}
flow_mask_remove(table, flow->mask);
}
-static void table_instance_destroy(struct flow_table *table,
- struct table_instance *ti,
- struct table_instance *ufid_ti,
- bool deferred)
+/* Must be called with OVS mutex held. */
+void table_instance_flow_flush(struct flow_table *table,
+ struct table_instance *ti,
+ struct table_instance *ufid_ti)
{
int i;
- if (!ti)
- return;
-
- BUG_ON(!ufid_ti);
- if (ti->keep_flows)
- goto skip_flows;
-
for (i = 0; i < ti->n_buckets; i++) {
- struct sw_flow *flow;
struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
+ struct sw_flow *flow;
hlist_for_each_entry_safe(flow, n, head,
flow_table.node[ti->node_ver]) {
table_instance_flow_free(table, ti, ufid_ti,
- flow, false);
- ovs_flow_free(flow, deferred);
+ flow);
+ ovs_flow_free(flow, true);
}
}
-skip_flows:
- if (deferred) {
- call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
- call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
- } else {
- __table_instance_destroy(ti);
- __table_instance_destroy(ufid_ti);
+ if (WARN_ON(table->count != 0 ||
+ table->ufid_count != 0)) {
+ table->count = 0;
+ table->ufid_count = 0;
}
}
+static void table_instance_destroy(struct table_instance *ti,
+ struct table_instance *ufid_ti)
+{
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
+}
+
/* No need for locking this function is called from RCU callback or
* error path.
*/
@@ -390,10 +516,12 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
{
struct table_instance *ti = rcu_dereference_raw(table->ti);
struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
+ struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
+ struct mask_array *ma = rcu_dereference_raw(table->mask_array);
- free_percpu(table->mask_cache);
- kfree_rcu(rcu_dereference_raw(table->mask_array), rcu);
- table_instance_destroy(table, ti, ufid_ti, false);
+ call_rcu(&mc->rcu, mask_cache_rcu_cb);
+ call_rcu(&ma->rcu, mask_array_rcu_cb);
+ table_instance_destroy(ti, ufid_ti);
}
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
@@ -462,16 +590,16 @@ static void flow_table_copy_flows(struct table_instance *old,
struct hlist_head *head = &old->buckets[i];
if (ufid)
- hlist_for_each_entry(flow, head,
- ufid_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ ufid_table.node[old_ver],
+ lockdep_ovsl_is_held())
ufid_table_instance_insert(new, flow);
else
- hlist_for_each_entry(flow, head,
- flow_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ flow_table.node[old_ver],
+ lockdep_ovsl_is_held())
table_instance_insert(new, flow);
}
-
- old->keep_flows = true;
}
static struct table_instance *table_instance_rehash(struct table_instance *ti,
@@ -506,10 +634,9 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
rcu_assign_pointer(flow_table->ti, new_ti);
rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
flow_table->last_rehash = jiffies;
- flow_table->count = 0;
- flow_table->ufid_count = 0;
- table_instance_destroy(flow_table, old_ti, old_ufid_ti, true);
+ table_instance_flow_flush(flow_table, old_ti, old_ufid_ti);
+ table_instance_destroy(old_ti, old_ufid_ti);
return 0;
err_free_ti:
@@ -534,7 +661,7 @@ static int flow_key_start(const struct sw_flow_key *key)
return 0;
else
return rounddown(offsetof(struct sw_flow_key, phy),
- sizeof(long));
+ sizeof(long));
}
static bool cmp_key(const struct sw_flow_key *key1,
@@ -543,13 +670,13 @@ static bool cmp_key(const struct sw_flow_key *key1,
{
const long *cp1 = (const long *)((const u8 *)key1 + key_start);
const long *cp2 = (const long *)((const u8 *)key2 + key_start);
- long diffs = 0;
int i;
- for (i = key_start; i < key_end; i += sizeof(long))
- diffs |= *cp1++ ^ *cp2++;
+ for (i = key_start; i < key_end; i += sizeof(long))
+ if (*cp1++ ^ *cp2++)
+ return false;
- return diffs == 0;
+ return true;
}
static bool flow_cmp_masked_key(const struct sw_flow *flow,
@@ -586,7 +713,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
(*n_mask_hit)++;
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
@@ -596,14 +723,18 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
/* Flow lookup does full lookup on flow table. It starts with
* mask from index passed in *index.
+ * This function MUST be called with BH disabled due to the use
+ * of CPU specific variables.
*/
static struct sw_flow *flow_lookup(struct flow_table *tbl,
struct table_instance *ti,
struct mask_array *ma,
const struct sw_flow_key *key,
u32 *n_mask_hit,
+ u32 *n_cache_hit,
u32 *index)
{
+ struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
struct sw_flow *flow;
struct sw_flow_mask *mask;
int i;
@@ -612,8 +743,13 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
mask = rcu_dereference_ovsl(ma->masks[*index]);
if (mask) {
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
- if (flow)
+ if (flow) {
+ u64_stats_update_begin(&stats->syncp);
+ stats->usage_cntrs[*index]++;
+ u64_stats_update_end(&stats->syncp);
+ (*n_cache_hit)++;
return flow;
+ }
}
}
@@ -629,6 +765,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
if (flow) { /* Found */
*index = i;
+ u64_stats_update_begin(&stats->syncp);
+ stats->usage_cntrs[*index]++;
+ u64_stats_update_end(&stats->syncp);
return flow;
}
}
@@ -646,8 +785,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
const struct sw_flow_key *key,
u32 skb_hash,
- u32 *n_mask_hit)
+ u32 *n_mask_hit,
+ u32 *n_cache_hit)
{
+ struct mask_cache *mc = rcu_dereference(tbl->mask_cache);
struct mask_array *ma = rcu_dereference(tbl->mask_array);
struct table_instance *ti = rcu_dereference(tbl->ti);
struct mask_cache_entry *entries, *ce;
@@ -656,10 +797,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
int seg;
*n_mask_hit = 0;
- if (unlikely(!skb_hash)) {
+ *n_cache_hit = 0;
+ if (unlikely(!skb_hash || mc->cache_size == 0)) {
u32 mask_index = 0;
+ u32 cache = 0;
- return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);
+ return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache,
+ &mask_index);
}
/* Pre and post recirculation flows usually have the same skb_hash
@@ -670,17 +814,17 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
ce = NULL;
hash = skb_hash;
- entries = this_cpu_ptr(tbl->mask_cache);
+ entries = this_cpu_ptr(mc->mask_cache);
/* Find the cache entry 'ce' to operate on. */
for (seg = 0; seg < MC_HASH_SEGS; seg++) {
- int index = hash & (MC_HASH_ENTRIES - 1);
+ int index = hash & (mc->cache_size - 1);
struct mask_cache_entry *e;
e = &entries[index];
if (e->skb_hash == skb_hash) {
flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
- &e->mask_index);
+ n_cache_hit, &e->mask_index);
if (!flow)
e->skb_hash = 0;
return flow;
@@ -693,10 +837,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
}
/* Cache miss, do full lookup. */
- flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
+ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit,
+ &ce->mask_index);
if (flow)
ce->skb_hash = skb_hash;
+ *n_cache_hit = 0;
return flow;
}
@@ -706,9 +852,18 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
u32 __always_unused n_mask_hit;
+ u32 __always_unused n_cache_hit;
+ struct sw_flow *flow;
u32 index = 0;
- return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index);
+ /* This function gets called through the netlink interface and therefore
+ * is preemptible. However, the flow_lookup() function needs to be called
+ * with BH disabled due to CPU specific variables.
+ */
+ local_bh_disable();
+ flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+ local_bh_enable();
+ return flow;
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
@@ -752,7 +907,8 @@ static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
}
-bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
+bool ovs_flow_cmp(const struct sw_flow *flow,
+ const struct sw_flow_match *match)
{
if (ovs_identifier_is_ufid(&flow->id))
return flow_cmp_masked_key(flow, match->key, &match->range);
@@ -771,7 +927,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
hash = ufid_hash(ufid);
head = find_bucket(ti, hash);
hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (flow->ufid_table.hash == hash &&
ovs_flow_cmp_ufid(flow, ufid))
return flow;
@@ -785,6 +941,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table)
return READ_ONCE(ma->count);
}
+u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
+{
+ struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
+
+ return READ_ONCE(mc->cache_size);
+}
+
static struct table_instance *table_instance_expand(struct table_instance *ti,
bool ufid)
{
@@ -798,7 +961,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
BUG_ON(table->count == 0);
- table_instance_flow_free(table, ti, ufid_ti, flow, true);
+ table_instance_flow_free(table, ti, ufid_ti, flow);
}
static struct sw_flow_mask *mask_alloc(void)
@@ -932,6 +1095,99 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
return 0;
}
+static int compare_mask_and_count(const void *a, const void *b)
+{
+ const struct mask_count *mc_a = a;
+ const struct mask_count *mc_b = b;
+
+ return (s64)mc_b->counter - (s64)mc_a->counter;
+}
+
+/* Must be called with OVS mutex held. */
+void ovs_flow_masks_rebalance(struct flow_table *table)
+{
+ struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+ struct mask_count *masks_and_count;
+ struct mask_array *new;
+ int masks_entries = 0;
+ int i;
+
+ /* Build array of all current entries with use counters. */
+ masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count),
+ GFP_KERNEL);
+ if (!masks_and_count)
+ return;
+
+ for (i = 0; i < ma->max; i++) {
+ struct sw_flow_mask *mask;
+ int cpu;
+
+ mask = rcu_dereference_ovsl(ma->masks[i]);
+ if (unlikely(!mask))
+ break;
+
+ masks_and_count[i].index = i;
+ masks_and_count[i].counter = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mask_array_stats *stats;
+ unsigned int start;
+ u64 counter;
+
+ stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ counter = stats->usage_cntrs[i];
+ } while (u64_stats_fetch_retry_irq(&stats->syncp,
+ start));
+
+ masks_and_count[i].counter += counter;
+ }
+
+ /* Subtract the zero count value. */
+ masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i];
+
+ /* Rather than calling tbl_mask_array_reset_counters()
+ * below when no change is needed, do it inline here.
+ */
+ ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter;
+ }
+
+ if (i == 0)
+ goto free_mask_entries;
+
+ /* Sort the entries */
+ masks_entries = i;
+ sort(masks_and_count, masks_entries, sizeof(*masks_and_count),
+ compare_mask_and_count, NULL);
+
+ /* If the order is the same, nothing to do... */
+ for (i = 0; i < masks_entries; i++) {
+ if (i != masks_and_count[i].index)
+ break;
+ }
+ if (i == masks_entries)
+ goto free_mask_entries;
+
+ /* Rebuild the new list in order of usage. */
+ new = tbl_mask_array_alloc(ma->max);
+ if (!new)
+ goto free_mask_entries;
+
+ for (i = 0; i < masks_entries; i++) {
+ int index = masks_and_count[i].index;
+
+ if (ovsl_dereference(ma->masks[index]))
+ new->masks[new->count++] = ma->masks[index];
+ }
+
+ rcu_assign_pointer(table->mask_array, new);
+ call_rcu(&ma->rcu, mask_array_rcu_cb);
+
+free_mask_entries:
+ kfree(masks_and_count);
+}
+
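Editor's note: ovs_flow_masks_rebalance() above sorts masks by their usage counters so the most frequently hit masks are tried first on the slow path. A standalone sketch of that ordering, mirroring compare_mask_and_count() but using userspace qsort in place of the kernel's sort(); the counter values are assumptions:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mask_count {
	int index;
	uint64_t counter;
};

/* Higher counters sort first, i.e. descending usage order. */
static int cmp_desc(const void *a, const void *b)
{
	const struct mask_count *mc_a = a, *mc_b = b;

	if (mc_b->counter != mc_a->counter)
		return mc_b->counter > mc_a->counter ? 1 : -1;
	return 0;
}

int main(void)
{
	struct mask_count m[] = { { 0, 10 }, { 1, 500 }, { 2, 42 } };

	qsort(m, 3, sizeof(m[0]), cmp_desc);
	printf("hottest mask was index %d\n", m[0].index);	/* prints 1 */
	return 0;
}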
/* Initializes the flow module.
* Returns zero if successful or a negative error code. */
int ovs_flow_init(void)
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 8a5cea6ae111..9e659db78c05 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -27,9 +27,27 @@ struct mask_cache_entry {
u32 mask_index;
};
+struct mask_cache {
+ struct rcu_head rcu;
+ u32 cache_size; /* Must be a power of 2 or zero. */
+ struct mask_cache_entry __percpu *mask_cache;
+};
+
+struct mask_count {
+ int index;
+ u64 counter;
+};
+
+struct mask_array_stats {
+ struct u64_stats_sync syncp;
+ u64 usage_cntrs[];
+};
+
struct mask_array {
struct rcu_head rcu;
int count, max;
+ struct mask_array_stats __percpu *masks_usage_stats;
+ u64 *masks_usage_zero_cntr;
struct sw_flow_mask __rcu *masks[];
};
@@ -39,13 +57,12 @@ struct table_instance {
struct rcu_head rcu;
int node_ver;
u32 hash_seed;
- bool keep_flows;
};
struct flow_table {
struct table_instance __rcu *ti;
struct table_instance __rcu *ufid_ti;
- struct mask_cache_entry __percpu *mask_cache;
+ struct mask_cache __rcu *mask_cache;
struct mask_array __rcu *mask_array;
unsigned long last_rehash;
unsigned int count;
@@ -69,12 +86,15 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
const struct sw_flow_mask *mask);
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
int ovs_flow_tbl_num_masks(const struct flow_table *table);
+u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table);
+int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size);
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
u32 *bucket, u32 *idx);
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *,
const struct sw_flow_key *,
u32 skb_hash,
- u32 *n_mask_hit);
+ u32 *n_mask_hit,
+ u32 *n_cache_hit);
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
const struct sw_flow_key *);
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
@@ -86,4 +106,10 @@ bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *);
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
bool full, const struct sw_flow_mask *mask);
+
+void ovs_flow_masks_rebalance(struct flow_table *table);
+void table_instance_flow_flush(struct flow_table *table,
+ struct table_instance *ti,
+ struct table_instance *ufid_ti);
+
#endif /* flow_table.h */
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index 5010d1ddd4bd..6e38f68f88c2 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -19,8 +19,6 @@
#include "datapath.h"
#include "meter.h"
-#define METER_HASH_BUCKETS 1024
-
static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = {
[OVS_METER_ATTR_ID] = { .type = NLA_U32, },
[OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG },
@@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = {
[OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) },
};
+static u32 meter_hash(struct dp_meter_instance *ti, u32 id)
+{
+ return id % ti->n_meters;
+}
+
static void ovs_meter_free(struct dp_meter *meter)
{
if (!meter)
@@ -47,40 +50,162 @@ static void ovs_meter_free(struct dp_meter *meter)
kfree_rcu(meter, rcu);
}
-static struct hlist_head *meter_hash_bucket(const struct datapath *dp,
- u32 meter_id)
-{
- return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)];
-}
-
/* Call with ovs_mutex or RCU read lock. */
-static struct dp_meter *lookup_meter(const struct datapath *dp,
+static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl,
u32 meter_id)
{
+ struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ u32 hash = meter_hash(ti, meter_id);
struct dp_meter *meter;
- struct hlist_head *head;
- head = meter_hash_bucket(dp, meter_id);
- hlist_for_each_entry_rcu(meter, head, dp_hash_node,
- lockdep_ovsl_is_held()) {
- if (meter->id == meter_id)
- return meter;
- }
+ meter = rcu_dereference_ovsl(ti->dp_meters[hash]);
+ if (meter && likely(meter->id == meter_id))
+ return meter;
+
return NULL;
}
-static void attach_meter(struct datapath *dp, struct dp_meter *meter)
+static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size)
+{
+ struct dp_meter_instance *ti;
+
+ ti = kvzalloc(sizeof(*ti) +
+ sizeof(struct dp_meter *) * size,
+ GFP_KERNEL);
+ if (!ti)
+ return NULL;
+
+ ti->n_meters = size;
+
+ return ti;
+}
+
+static void dp_meter_instance_free(struct dp_meter_instance *ti)
+{
+ kvfree(ti);
+}
+
+static void dp_meter_instance_free_rcu(struct rcu_head *rcu)
+{
+ struct dp_meter_instance *ti;
+
+ ti = container_of(rcu, struct dp_meter_instance, rcu);
+ kvfree(ti);
+}
+
+static int
+dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size)
+{
+ struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ int n_meters = min(size, ti->n_meters);
+ struct dp_meter_instance *new_ti;
+ int i;
+
+ new_ti = dp_meter_instance_alloc(size);
+ if (!new_ti)
+ return -ENOMEM;
+
+ for (i = 0; i < n_meters; i++)
+ if (rcu_dereference_ovsl(ti->dp_meters[i]))
+ new_ti->dp_meters[i] = ti->dp_meters[i];
+
+ rcu_assign_pointer(tbl->ti, new_ti);
+ call_rcu(&ti->rcu, dp_meter_instance_free_rcu);
+
+ return 0;
+}
+
+static void dp_meter_instance_insert(struct dp_meter_instance *ti,
+ struct dp_meter *meter)
+{
+ u32 hash;
+
+ hash = meter_hash(ti, meter->id);
+ rcu_assign_pointer(ti->dp_meters[hash], meter);
+}
+
+static void dp_meter_instance_remove(struct dp_meter_instance *ti,
+ struct dp_meter *meter)
{
- struct hlist_head *head = meter_hash_bucket(dp, meter->id);
+ u32 hash;
+
+ hash = meter_hash(ti, meter->id);
+ RCU_INIT_POINTER(ti->dp_meters[hash], NULL);
+}
+
+static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter)
+{
+ struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ u32 hash = meter_hash(ti, meter->id);
+ int err;
+
+ /* Generally, the selected slot should be empty, because
+ * OvS uses an id-pool to fetch an available id.
+ */
+ if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash])))
+ return -EBUSY;
+
+ dp_meter_instance_insert(ti, meter);
+
+ /* That function is thread-safe. */
+ tbl->count++;
+ if (tbl->count >= tbl->max_meters_allowed) {
+ err = -EFBIG;
+ goto attach_err;
+ }
+
+ if (tbl->count >= ti->n_meters &&
+ dp_meter_instance_realloc(tbl, ti->n_meters * 2)) {
+ err = -ENOMEM;
+ goto attach_err;
+ }
+
+ return 0;
- hlist_add_head_rcu(&meter->dp_hash_node, head);
+attach_err:
+ dp_meter_instance_remove(ti, meter);
+ tbl->count--;
+ return err;
}
-static void detach_meter(struct dp_meter *meter)
+static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter)
{
+ struct dp_meter_instance *ti;
+
ASSERT_OVSL();
- if (meter)
- hlist_del_rcu(&meter->dp_hash_node);
+ if (!meter)
+ return 0;
+
+ ti = rcu_dereference_ovsl(tbl->ti);
+ dp_meter_instance_remove(ti, meter);
+
+ tbl->count--;
+
+ /* Shrink the meter array if necessary. */
+ if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN &&
+ tbl->count <= (ti->n_meters / 4)) {
+ int half_size = ti->n_meters / 2;
+ int i;
+
+ /* To avoid hash collisions, don't move entries to other slots.
+ * Make sure no meters remain in the part of the array that
+ * is about to be released.
+ */
+ for (i = half_size; i < ti->n_meters; i++)
+ if (rcu_dereference_ovsl(ti->dp_meters[i]))
+ goto out;
+
+ if (dp_meter_instance_realloc(tbl, half_size))
+ goto shrink_err;
+ }
+
+out:
+ return 0;
+
+shrink_err:
+ dp_meter_instance_insert(ti, meter);
+ tbl->count++;
+ return -ENOMEM;
}
static struct sk_buff *
@@ -116,12 +241,11 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id,
if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id))
goto error;
- if (!meter)
- return 0;
-
if (nla_put(reply, OVS_METER_ATTR_STATS,
- sizeof(struct ovs_flow_stats), &meter->stats) ||
- nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used,
+ sizeof(struct ovs_flow_stats), &meter->stats))
+ goto error;
+
+ if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used,
OVS_METER_ATTR_PAD))
goto error;
@@ -150,18 +274,32 @@ error:
static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info)
{
- struct sk_buff *reply;
+ struct ovs_header *ovs_header = info->userhdr;
struct ovs_header *ovs_reply_header;
struct nlattr *nla, *band_nla;
- int err;
+ struct sk_buff *reply;
+ struct datapath *dp;
+ int err = -EMSGSIZE;
reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES,
&ovs_reply_header);
if (IS_ERR(reply))
return PTR_ERR(reply);
- if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) ||
- nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS))
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ err = -ENODEV;
+ goto exit_unlock;
+ }
+
+ if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS,
+ dp->meter_tbl.max_meters_allowed))
+ goto exit_unlock;
+
+ ovs_unlock();
+
+ if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS))
goto nla_put_failure;
nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS);
@@ -180,9 +318,10 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info)
genlmsg_end(reply, ovs_reply_header);
return genlmsg_reply(reply, info);
+exit_unlock:
+ ovs_unlock();
nla_put_failure:
nlmsg_free(reply);
- err = -EMSGSIZE;
return err;
}
@@ -204,7 +343,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a)
return ERR_PTR(-EINVAL);
/* Allocate and set up the meter before locking anything. */
- meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL);
+ meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT);
if (!meter)
return ERR_PTR(-ENOMEM);
@@ -252,8 +391,8 @@ static struct dp_meter *dp_meter_create(struct nlattr **a)
*
* Start with a full bucket.
*/
- band->bucket = (band->burst_size + band->rate) * 1000;
- band_max_delta_t = band->bucket / band->rate;
+ band->bucket = band->burst_size * 1000ULL;
+ band_max_delta_t = div_u64(band->bucket, band->rate);
if (band_max_delta_t > meter->max_delta_t)
meter->max_delta_t = band_max_delta_t;
band++;
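Editor's note: with this change the token bucket of a band starts at burst_size * 1000 instead of (burst_size + rate) * 1000, and the longest idle period that can be credited is bucket / rate. A small standalone calculation, with the rate and burst values assumed for illustration (a kbps band):

#include <stdio.h>

int main(void)
{
	unsigned long long rate = 1000;			/* assumed: kbps */
	unsigned long long burst_size = 500;		/* assumed: kbit */
	unsigned long long bucket = burst_size * 1000ULL;	/* initial token bucket */
	unsigned long long max_delta_t = bucket / rate;	/* ms of idle credit */

	printf("bucket = %llu, max_delta_t = %llu ms\n", bucket, max_delta_t);
	return 0;
}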
@@ -273,17 +412,17 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct ovs_header *ovs_reply_header;
struct ovs_header *ovs_header = info->userhdr;
+ struct dp_meter_table *meter_tbl;
struct datapath *dp;
int err;
u32 meter_id;
bool failed;
- if (!a[OVS_METER_ATTR_ID]) {
- return -ENODEV;
- }
+ if (!a[OVS_METER_ATTR_ID])
+ return -EINVAL;
meter = dp_meter_create(a);
- if (IS_ERR_OR_NULL(meter))
+ if (IS_ERR(meter))
return PTR_ERR(meter);
reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET,
@@ -300,12 +439,18 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock;
}
+ meter_tbl = &dp->meter_tbl;
meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
- /* Cannot fail after this. */
- old_meter = lookup_meter(dp, meter_id);
- detach_meter(old_meter);
- attach_meter(dp, meter);
+ old_meter = lookup_meter(meter_tbl, meter_id);
+ err = detach_meter(meter_tbl, old_meter);
+ if (err)
+ goto exit_unlock;
+
+ err = attach_meter(meter_tbl, meter);
+ if (err)
+ goto exit_unlock;
+
ovs_unlock();
/* Build response with the meter_id and stats from
@@ -337,14 +482,14 @@ exit_free_meter:
static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr **a = info->attrs;
- u32 meter_id;
struct ovs_header *ovs_header = info->userhdr;
struct ovs_header *ovs_reply_header;
+ struct nlattr **a = info->attrs;
+ struct dp_meter *meter;
+ struct sk_buff *reply;
struct datapath *dp;
+ u32 meter_id;
int err;
- struct sk_buff *reply;
- struct dp_meter *meter;
if (!a[OVS_METER_ATTR_ID])
return -EINVAL;
@@ -365,7 +510,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info)
}
/* Locate meter, copy stats. */
- meter = lookup_meter(dp, meter_id);
+ meter = lookup_meter(&dp->meter_tbl, meter_id);
if (!meter) {
err = -ENOENT;
goto exit_unlock;
@@ -390,18 +535,17 @@ exit_unlock:
static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr **a = info->attrs;
- u32 meter_id;
struct ovs_header *ovs_header = info->userhdr;
struct ovs_header *ovs_reply_header;
+ struct nlattr **a = info->attrs;
+ struct dp_meter *old_meter;
+ struct sk_buff *reply;
struct datapath *dp;
+ u32 meter_id;
int err;
- struct sk_buff *reply;
- struct dp_meter *old_meter;
if (!a[OVS_METER_ATTR_ID])
return -EINVAL;
- meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL,
&ovs_reply_header);
@@ -416,14 +560,19 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock;
}
- old_meter = lookup_meter(dp, meter_id);
+ meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
+ old_meter = lookup_meter(&dp->meter_tbl, meter_id);
if (old_meter) {
spin_lock_bh(&old_meter->lock);
err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter);
WARN_ON(err);
spin_unlock_bh(&old_meter->lock);
- detach_meter(old_meter);
+
+ err = detach_meter(&dp->meter_tbl, old_meter);
+ if (err)
+ goto exit_unlock;
}
+
ovs_unlock();
ovs_meter_free(old_meter);
genlmsg_end(reply, ovs_reply_header);
@@ -443,16 +592,16 @@ exit_unlock:
bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, u32 meter_id)
{
- struct dp_meter *meter;
- struct dp_meter_band *band;
long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000);
long long int long_delta_ms;
- u32 delta_ms;
- u32 cost;
+ struct dp_meter_band *band;
+ struct dp_meter *meter;
int i, band_exceeded_max = -1;
u32 band_exceeded_rate = 0;
+ u32 delta_ms;
+ u32 cost;
- meter = lookup_meter(dp, meter_id);
+ meter = lookup_meter(&dp->meter_tbl, meter_id);
/* Do not drop the packet when there is no meter. */
if (!meter)
return false;
@@ -461,6 +610,14 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb,
spin_lock(&meter->lock);
long_delta_ms = (now_ms - meter->used); /* ms */
+ if (long_delta_ms < 0) {
+ /* This condition means that several threads are fighting for
+ * the meter lock, and the one that received its packets a
+ * bit later wins. Assume all racing threads received their
+ * packets at the same time to avoid a negative delta.
+ */
+ long_delta_ms = 0;
+ }
/* Make sure delta_ms will not be too large, so that bucket will not
* wrap around below.
@@ -491,7 +648,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb,
long long int max_bucket_size;
band = &meter->bands[i];
- max_bucket_size = (band->burst_size + band->rate) * 1000LL;
+ max_bucket_size = band->burst_size * 1000LL;
band->bucket += delta_ms * band->rate;
if (band->bucket > max_bucket_size)
@@ -522,7 +679,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb,
return false;
}
-static struct genl_ops dp_meter_genl_ops[] = {
+static const struct genl_small_ops dp_meter_genl_ops[] = {
{ .cmd = OVS_METER_CMD_FEATURES,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = 0, /* OK for unprivileged users. */
@@ -530,9 +687,9 @@ static struct genl_ops dp_meter_genl_ops[] = {
},
{ .cmd = OVS_METER_CMD_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
- * privilege.
- */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege.
+ */
.doit = ovs_meter_cmd_set,
},
{ .cmd = OVS_METER_CMD_GET,
@@ -542,9 +699,9 @@ static struct genl_ops dp_meter_genl_ops[] = {
},
{ .cmd = OVS_METER_CMD_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
- * privilege.
- */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege.
+ */
.doit = ovs_meter_cmd_del
},
};
@@ -561,8 +718,9 @@ struct genl_family dp_meter_genl_family __ro_after_init = {
.policy = meter_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = dp_meter_genl_ops,
- .n_ops = ARRAY_SIZE(dp_meter_genl_ops),
+ .small_ops = dp_meter_genl_ops,
+ .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops),
+ .resv_start_op = OVS_METER_CMD_GET + 1,
.mcgrps = &ovs_meter_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
@@ -570,32 +728,39 @@ struct genl_family dp_meter_genl_family __ro_after_init = {
int ovs_meters_init(struct datapath *dp)
{
- int i;
-
- dp->meters = kmalloc_array(METER_HASH_BUCKETS,
- sizeof(struct hlist_head), GFP_KERNEL);
+ struct dp_meter_table *tbl = &dp->meter_tbl;
+ struct dp_meter_instance *ti;
+ unsigned long free_mem_bytes;
- if (!dp->meters)
+ ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN);
+ if (!ti)
return -ENOMEM;
- for (i = 0; i < METER_HASH_BUCKETS; i++)
- INIT_HLIST_HEAD(&dp->meters[i]);
+ /* Allow meters in a datapath to use ~3.12% of physical memory. */
+ free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5);
+ tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter),
+ DP_METER_NUM_MAX);
+ if (!tbl->max_meters_allowed)
+ goto out_err;
+
+ rcu_assign_pointer(tbl->ti, ti);
+ tbl->count = 0;
return 0;
+
+out_err:
+ dp_meter_instance_free(ti);
+ return -ENOMEM;
}
void ovs_meters_exit(struct datapath *dp)
{
+ struct dp_meter_table *tbl = &dp->meter_tbl;
+ struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti);
int i;
- for (i = 0; i < METER_HASH_BUCKETS; i++) {
- struct hlist_head *head = &dp->meters[i];
- struct dp_meter *meter;
- struct hlist_node *n;
-
- hlist_for_each_entry_safe(meter, n, head, dp_hash_node)
- kfree(meter);
- }
+ for (i = 0; i < ti->n_meters; i++)
+ ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i]));
- kfree(dp->meters);
+ dp_meter_instance_free(ti);
}
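Editor's note: the new ovs_meters_init() above budgets roughly 1/32 (~3.12%) of the free buffer-page memory for meters: each page contributes PAGE_SIZE >> 5 bytes, and the result is divided by the meter size and capped at DP_METER_NUM_MAX. A worked standalone example, not part of the patch, with assumed figures (4 KiB pages, 1 GiB of free buffer pages, a 64-byte struct dp_meter):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long free_pages = 262144;	/* assumed: 1 GiB of buffer pages */
	unsigned long meter_size = 64;		/* assumed sizeof(struct dp_meter) */
	unsigned long num_max = 200000UL;	/* DP_METER_NUM_MAX */
	unsigned long budget = free_pages * (page_size >> 5);
	unsigned long max_meters = budget / meter_size;

	if (max_meters > num_max)
		max_meters = num_max;
	printf("budget = %lu bytes, max meters allowed = %lu\n",
	       budget, max_meters);	/* 33554432 bytes, capped at 200000 */
	return 0;
}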
diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h
index f645913870bd..0c33889a8515 100644
--- a/net/openvswitch/meter.h
+++ b/net/openvswitch/meter.h
@@ -13,26 +13,26 @@
#include <linux/openvswitch.h>
#include <linux/genetlink.h>
#include <linux/skbuff.h>
+#include <linux/bits.h>
#include "flow.h"
struct datapath;
#define DP_MAX_BANDS 1
+#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10)
+#define DP_METER_NUM_MAX (200000UL)
struct dp_meter_band {
u32 type;
u32 rate;
u32 burst_size;
- u32 bucket; /* 1/1000 packets, or in bits */
+ u64 bucket; /* 1/1000 packets, or in bits */
struct ovs_flow_stats stats;
};
struct dp_meter {
spinlock_t lock; /* Per meter lock */
struct rcu_head rcu;
- struct hlist_node dp_hash_node; /*Element in datapath->meters
- * hash table.
- */
u32 id;
u16 kbps:1, keep_stats:1;
u16 n_bands;
@@ -42,6 +42,18 @@ struct dp_meter {
struct dp_meter_band bands[];
};
+struct dp_meter_instance {
+ struct rcu_head rcu;
+ u32 n_meters;
+ struct dp_meter __rcu *dp_meters[];
+};
+
+struct dp_meter_table {
+ struct dp_meter_instance __rcu *ti;
+ u32 count;
+ u32 max_meters_allowed;
+};
+
extern struct genl_family dp_meter_genl_family;
int ovs_meters_init(struct datapath *dp);
void ovs_meters_exit(struct datapath *dp);
diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c
new file mode 100644
index 000000000000..62c5f7d6f023
--- /dev/null
+++ b/net/openvswitch/openvswitch_trace.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/* bug in tracepoint.h, it should include this */
+#include <linux/module.h>
+
+/* sparse isn't too happy with all macros... */
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "openvswitch_trace.h"
+
+#endif
diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h
new file mode 100644
index 000000000000..3eb35d9eb700
--- /dev/null
+++ b/net/openvswitch/openvswitch_trace.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM openvswitch
+
+#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OPENVSWITCH_H
+
+#include <linux/tracepoint.h>
+
+#include "datapath.h"
+
+TRACE_EVENT(ovs_do_execute_action,
+
+ TP_PROTO(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, const struct nlattr *a, int rem),
+
+ TP_ARGS(dp, skb, key, a, rem),
+
+ TP_STRUCT__entry(
+ __field( void *, dpaddr )
+ __string( dp_name, ovs_dp_name(dp) )
+ __string( dev_name, skb->dev->name )
+ __field( void *, skbaddr )
+ __field( unsigned int, len )
+ __field( unsigned int, data_len )
+ __field( unsigned int, truesize )
+ __field( u8, nr_frags )
+ __field( u16, gso_size )
+ __field( u16, gso_type )
+ __field( u32, ovs_flow_hash )
+ __field( u32, recirc_id )
+ __field( void *, keyaddr )
+ __field( u16, key_eth_type )
+ __field( u8, key_ct_state )
+ __field( u8, key_ct_orig_proto )
+ __field( u16, key_ct_zone )
+ __field( unsigned int, flow_key_valid )
+ __field( u8, action_type )
+ __field( unsigned int, action_len )
+ __field( void *, action_data )
+ __field( u8, is_last )
+ ),
+
+ TP_fast_assign(
+ __entry->dpaddr = dp;
+ __assign_str(dp_name, ovs_dp_name(dp));
+ __assign_str(dev_name, skb->dev->name);
+ __entry->skbaddr = skb;
+ __entry->len = skb->len;
+ __entry->data_len = skb->data_len;
+ __entry->truesize = skb->truesize;
+ __entry->nr_frags = skb_shinfo(skb)->nr_frags;
+ __entry->gso_size = skb_shinfo(skb)->gso_size;
+ __entry->gso_type = skb_shinfo(skb)->gso_type;
+ __entry->ovs_flow_hash = key->ovs_flow_hash;
+ __entry->recirc_id = key->recirc_id;
+ __entry->keyaddr = key;
+ __entry->key_eth_type = key->eth.type;
+ __entry->key_ct_state = key->ct_state;
+ __entry->key_ct_orig_proto = key->ct_orig_proto;
+ __entry->key_ct_zone = key->ct_zone;
+ __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID);
+ __entry->action_type = nla_type(a);
+ __entry->action_len = nla_len(a);
+ __entry->action_data = nla_data(a);
+ __entry->is_last = nla_is_last(a, rem);
+ ),
+
+ TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_Zone=%04x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d",
+ __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
+ __entry->skbaddr, __entry->len, __entry->data_len,
+ __entry->truesize, __entry->nr_frags, __entry->gso_size,
+ __entry->gso_type, __entry->ovs_flow_hash,
+ __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
+ __entry->key_ct_state, __entry->key_ct_orig_proto,
+ __entry->key_ct_zone,
+ __entry->flow_key_valid,
+ __entry->action_type, __entry->action_len,
+ __entry->action_data, __entry->is_last)
+);
+
+TRACE_EVENT(ovs_dp_upcall,
+
+ TP_PROTO(struct datapath *dp, struct sk_buff *skb,
+ const struct sw_flow_key *key,
+ const struct dp_upcall_info *upcall_info),
+
+ TP_ARGS(dp, skb, key, upcall_info),
+
+ TP_STRUCT__entry(
+ __field( void *, dpaddr )
+ __string( dp_name, ovs_dp_name(dp) )
+ __string( dev_name, skb->dev->name )
+ __field( void *, skbaddr )
+ __field( unsigned int, len )
+ __field( unsigned int, data_len )
+ __field( unsigned int, truesize )
+ __field( u8, nr_frags )
+ __field( u16, gso_size )
+ __field( u16, gso_type )
+ __field( u32, ovs_flow_hash )
+ __field( u32, recirc_id )
+ __field( const void *, keyaddr )
+ __field( u16, key_eth_type )
+ __field( u8, key_ct_state )
+ __field( u8, key_ct_orig_proto )
+ __field( u16, key_ct_zone )
+ __field( unsigned int, flow_key_valid )
+ __field( u8, upcall_cmd )
+ __field( u32, upcall_port )
+ __field( u16, upcall_mru )
+ ),
+
+ TP_fast_assign(
+ __entry->dpaddr = dp;
+ __assign_str(dp_name, ovs_dp_name(dp));
+ __assign_str(dev_name, skb->dev->name);
+ __entry->skbaddr = skb;
+ __entry->len = skb->len;
+ __entry->data_len = skb->data_len;
+ __entry->truesize = skb->truesize;
+ __entry->nr_frags = skb_shinfo(skb)->nr_frags;
+ __entry->gso_size = skb_shinfo(skb)->gso_size;
+ __entry->gso_type = skb_shinfo(skb)->gso_type;
+ __entry->ovs_flow_hash = key->ovs_flow_hash;
+ __entry->recirc_id = key->recirc_id;
+ __entry->keyaddr = key;
+ __entry->key_eth_type = key->eth.type;
+ __entry->key_ct_state = key->ct_state;
+ __entry->key_ct_orig_proto = key->ct_orig_proto;
+ __entry->key_ct_zone = key->ct_zone;
+ __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID);
+ __entry->upcall_cmd = upcall_info->cmd;
+ __entry->upcall_port = upcall_info->portid;
+ __entry->upcall_mru = upcall_info->mru;
+ ),
+
+ TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_zone=%04x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u",
+ __entry->dpaddr, __get_str(dp_name), __get_str(dev_name),
+ __entry->skbaddr, __entry->len, __entry->data_len,
+ __entry->truesize, __entry->nr_frags, __entry->gso_size,
+ __entry->gso_type, __entry->ovs_flow_hash,
+ __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type,
+ __entry->key_ct_state, __entry->key_ct_orig_proto,
+ __entry->key_ct_zone,
+ __entry->flow_key_valid,
+ __entry->upcall_cmd, __entry->upcall_port,
+ __entry->upcall_mru)
+);
+
+#endif /* _TRACE_OPENVSWITCH_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE openvswitch_trace
+#include <trace/define_trace.h>
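
A note on the boilerplate: TRACE_INCLUDE_PATH "." lets the trace header live next to the .c files, and CREATE_TRACE_POINTS (defined in exactly one translation unit, openvswitch_trace.c above) instantiates the tracepoints. Each TRACE_EVENT(name, ...) generates a trace_name() call and a trace_name_enabled() test. The actual call sites live in datapath.c and actions.c, outside this hunk; a sketch of how the new upcall tracepoint would be invoked there:

	/* Sketch of a call site (the real hooks are added in datapath.c and
	 * actions.c, not shown here): guard with the _enabled() helper so the
	 * argument evaluation is skipped while the tracepoint is off.
	 */
	#include "openvswitch_trace.h"

	static void example_upcall_hook(struct datapath *dp, struct sk_buff *skb,
					const struct sw_flow_key *key,
					const struct dp_upcall_info *upcall_info)
	{
		if (trace_ovs_dp_upcall_enabled())
			trace_ovs_dp_upcall(dp, skb, key, upcall_info);
	}
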
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 58a7b8312c28..74c88a6baa43 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -35,21 +35,18 @@ internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
int len, err;
+ /* store len value because skb can be freed inside ovs_vport_receive() */
len = skb->len;
+
rcu_read_lock();
err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
rcu_read_unlock();
- if (likely(!err)) {
- struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->tx_bytes += len;
- tstats->tx_packets++;
- u64_stats_update_end(&tstats->syncp);
- } else {
+ if (likely(!err))
+ dev_sw_netstats_tx_add(netdev, 1, len);
+ else
netdev->stats.tx_errors++;
- }
+
return NETDEV_TX_OK;
}
@@ -68,7 +65,7 @@ static int internal_dev_stop(struct net_device *netdev)
static void internal_dev_getinfo(struct net_device *netdev,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, "openvswitch", sizeof(info->driver));
+ strscpy(info->driver, "openvswitch", sizeof(info->driver));
}
static const struct ethtool_ops internal_dev_ethtool_ops = {
@@ -83,42 +80,12 @@ static void internal_dev_destructor(struct net_device *dev)
ovs_vport_free(vport);
}
-static void
-internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
-{
- int i;
-
- memset(stats, 0, sizeof(*stats));
- stats->rx_errors = dev->stats.rx_errors;
- stats->tx_errors = dev->stats.tx_errors;
- stats->tx_dropped = dev->stats.tx_dropped;
- stats->rx_dropped = dev->stats.rx_dropped;
-
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *percpu_stats;
- struct pcpu_sw_netstats local_stats;
- unsigned int start;
-
- percpu_stats = per_cpu_ptr(dev->tstats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
- local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
-
- stats->rx_bytes += local_stats.rx_bytes;
- stats->rx_packets += local_stats.rx_packets;
- stats->tx_bytes += local_stats.tx_bytes;
- stats->tx_packets += local_stats.tx_packets;
- }
-}
-
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
- .ndo_get_stats64 = internal_get_stats,
+ .ndo_get_stats64 = dev_get_tstats64,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -180,6 +147,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
}
dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
+ dev->ifindex = parms->desired_ifindex;
internal_dev = internal_dev_priv(vport->dev);
internal_dev->vport = vport;
@@ -222,10 +190,9 @@ static void internal_dev_destroy(struct vport *vport)
rtnl_unlock();
}
-static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
+static int internal_dev_recv(struct sk_buff *skb)
{
struct net_device *netdev = skb->dev;
- struct pcpu_sw_netstats *stats;
if (unlikely(!(netdev->flags & IFF_UP))) {
kfree_skb(skb);
@@ -240,12 +207,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, netdev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
-
- stats = this_cpu_ptr(netdev->tstats);
- u64_stats_update_begin(&stats->syncp);
- stats->rx_packets++;
- stats->rx_bytes += skb->len;
- u64_stats_update_end(&stats->syncp);
+ dev_sw_netstats_rx_add(netdev, skb->len);
netif_rx(skb);
return NETDEV_TX_OK;
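
dev_sw_netstats_tx_add(), dev_sw_netstats_rx_add() and dev_get_tstats64() are core helpers that replace the per-CPU tstats code removed above: the open-coded updates in internal_dev_xmit()/internal_dev_recv() and the aggregation loop in internal_get_stats(). Roughly, the rx helper does the following (recent kernels use u64_stats_t accessors; this is the simplified shape):

	/* Roughly what the new helper does; it centralizes the per-CPU tstats
	 * update that the internal device used to open-code.
	 */
	static inline void dev_sw_netstats_rx_add(struct net_device *dev,
						  unsigned int len)
	{
		struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_bytes += len;
		tstats->rx_packets++;
		u64_stats_update_end(&tstats->syncp);
	}
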
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 57d6436e6f6a..2f61d5bdce1a 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -44,10 +44,9 @@ static void netdev_port_receive(struct sk_buff *skb)
if (unlikely(!skb))
return;
- if (skb->dev->type == ARPHRD_ETHER) {
- skb_push(skb, ETH_HLEN);
- skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
- }
+ if (skb->dev->type == ARPHRD_ETHER)
+ skb_push_rcsum(skb, ETH_HLEN);
+
ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
return;
error:
@@ -83,7 +82,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
err = -ENODEV;
goto error_free_vport;
}
-
+ netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL);
if (vport->dev->flags & IFF_LOOPBACK ||
(vport->dev->type != ARPHRD_ETHER &&
vport->dev->type != ARPHRD_NONE) ||
@@ -116,7 +115,7 @@ error_master_upper_dev_unlink:
error_unlock:
rtnl_unlock();
error_put:
- dev_put(vport->dev);
+ netdev_put(vport->dev, &vport->dev_tracker);
error_free_vport:
ovs_vport_free(vport);
return ERR_PTR(err);
@@ -138,8 +137,7 @@ static void vport_netdev_free(struct rcu_head *rcu)
{
struct vport *vport = container_of(rcu, struct vport, rcu);
- if (vport->dev)
- dev_put(vport->dev);
+ netdev_put(vport->dev, &vport->dev_tracker);
ovs_vport_free(vport);
}
@@ -175,7 +173,7 @@ void ovs_netdev_tunnel_destroy(struct vport *vport)
*/
if (vport->dev->reg_state == NETREG_REGISTERED)
rtnl_delete_link(vport->dev);
- dev_put(vport->dev);
+ netdev_put(vport->dev, &vport->dev_tracker);
vport->dev = NULL;
rtnl_unlock();
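
The dev_tracker changes pair the reference held on vport->dev with a netdevice_tracker, so a leaked or double-dropped reference can be attributed to its owner; once a tracker is registered, the reference must be released with netdev_put() using the same tracker, not with dev_put(). A minimal sketch of the acquire/release pairing (the example_* helpers are illustrative, not from the patch):

	static int example_grab_dev(struct vport *vport, struct net *net,
				    const char *name)
	{
		vport->dev = dev_get_by_name(net, name);	/* takes a reference */
		if (!vport->dev)
			return -ENODEV;

		/* Register the tracker for the reference we just took. */
		netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL);
		return 0;
	}

	static void example_release_dev(struct vport *vport)
	{
		/* Drop the reference together with its tracker. */
		netdev_put(vport->dev, &vport->dev_tracker);
		vport->dev = NULL;
	}
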
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 47febb4504f0..82a74f998966 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -87,6 +87,7 @@ EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister);
/**
* ovs_vport_locate - find a port that has already been created
*
+ * @net: network namespace
* @name: name of port to find
*
* Must be called with ovs or RCU read lock.
@@ -97,7 +98,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node,
- lockdep_ovsl_is_held())
+ lockdep_ovsl_is_held())
if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -110,14 +111,16 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
*
* @priv_size: Size of private data area to allocate.
* @ops: vport device ops
+ * @parms: information about new vport.
*
* Allocate and initialize a new vport defined by @ops. The vport will contain
* a private data area of size @priv_size that can be accessed using
- * vport_priv(). vports that are no longer needed should be released with
+ * vport_priv(). Some parameters of the vport will be initialized from @parms.
+ * vports that are no longer needed should be released with
* vport_free().
*/
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
- const struct vport_parms *parms)
+ const struct vport_parms *parms)
{
struct vport *vport;
size_t alloc_size;
@@ -396,7 +399,8 @@ int ovs_vport_get_upcall_portids(const struct vport *vport,
*
* Returns the portid of the target socket. Must be called with rcu_read_lock.
*/
-u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
+u32 ovs_vport_find_upcall_portid(const struct vport *vport,
+ struct sk_buff *skb)
{
struct vport_portids *ids;
u32 ids_index;
@@ -418,7 +422,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
*
* @vport: vport that received the packet
* @skb: skb that was received
- * @tun_key: tunnel (if any) that carried packet
+ * @tun_info: tunnel (if any) that carried packet
*
* Must be called with rcu_read_lock. The packet cannot be shared and
* skb->data should point to the Ethernet header.
@@ -493,14 +497,17 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
if (unlikely(packet_length(skb, vport->dev) > mtu &&
!skb_is_gso(skb))) {
- net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
- vport->dev->name,
- packet_length(skb, vport->dev), mtu);
vport->dev->stats.tx_errors++;
+ if (vport->dev->flags & IFF_UP)
+ net_warn_ratelimited("%s: dropped over-mtu packet: "
+ "%d > %d\n", vport->dev->name,
+ packet_length(skb, vport->dev),
+ mtu);
goto drop;
}
skb->dev = vport->dev;
+ skb_clear_tstamp(skb);
vport->ops->send(skb);
return;
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 1eb7495ac5b4..6ff45e8a0868 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -20,7 +20,7 @@
struct vport;
struct vport_parms;
-/* The following definitions are for users of the vport subsytem: */
+/* The following definitions are for users of the vport subsystem: */
int ovs_vport_init(void);
void ovs_vport_exit(void);
@@ -58,6 +58,7 @@ struct vport_portids {
/**
* struct vport - one port within a datapath
* @dev: Pointer to net_device.
+ * @dev_tracker: refcount tracker for @dev reference
* @dp: Datapath to which this port belongs.
* @upcall_portids: RCU protected 'struct vport_portids'.
* @port_no: Index into @dp's @ports array.
@@ -69,6 +70,7 @@ struct vport_portids {
*/
struct vport {
struct net_device *dev;
+ netdevice_tracker dev_tracker;
struct datapath *dp;
struct vport_portids __rcu *upcall_portids;
u16 port_no;
@@ -88,12 +90,14 @@ struct vport {
* @type: New vport's type.
* @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
* none was supplied.
+ * @desired_ifindex: New vport's ifindex.
* @dp: New vport's datapath.
* @port_no: New vport's port number.
*/
struct vport_parms {
const char *name;
enum ovs_vport_type type;
+ int desired_ifindex;
struct nlattr *options;
/* For ovs_vport_alloc(). */
@@ -128,7 +132,7 @@ struct vport_ops {
int (*set_options)(struct vport *, struct nlattr *);
int (*get_options)(const struct vport *, struct sk_buff *);
- netdev_tx_t (*send) (struct sk_buff *skb);
+ int (*send)(struct sk_buff *skb);
struct module *owner;
struct list_head list;
};
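
With the last hunk, a vport's send op returns a plain int instead of netdev_tx_t, matching the reworked internal_dev_recv() above (which still returns NETDEV_TX_OK, i.e. 0, on its success path). A minimal sketch of an op table wired to the new signature; the example_* names are illustrative and not part of the patch:

	static int example_vport_send(struct sk_buff *skb)
	{
		/* ...transmit or otherwise consume the skb... */
		consume_skb(skb);
		return 0;	/* as with internal_dev_recv(), NETDEV_TX_OK is 0 */
	}

	static struct vport_ops example_vport_ops = {
		.type	= OVS_VPORT_TYPE_INTERNAL,
		.send	= example_vport_send,
		.owner	= THIS_MODULE,
	};
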