diff options
Diffstat (limited to 'net/openvswitch')
-rw-r--r-- | net/openvswitch/Kconfig | 8 | ||||
-rw-r--r-- | net/openvswitch/Makefile | 3 | ||||
-rw-r--r-- | net/openvswitch/actions.c | 193 | ||||
-rw-r--r-- | net/openvswitch/conntrack.c | 270 | ||||
-rw-r--r-- | net/openvswitch/conntrack.h | 6 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 289 | ||||
-rw-r--r-- | net/openvswitch/datapath.h | 33 | ||||
-rw-r--r-- | net/openvswitch/flow.c | 187 | ||||
-rw-r--r-- | net/openvswitch/flow.h | 14 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.c | 266 | ||||
-rw-r--r-- | net/openvswitch/flow_table.c | 412 | ||||
-rw-r--r-- | net/openvswitch/flow_table.h | 32 | ||||
-rw-r--r-- | net/openvswitch/meter.c | 337 | ||||
-rw-r--r-- | net/openvswitch/meter.h | 20 | ||||
-rw-r--r-- | net/openvswitch/openvswitch_trace.c | 10 | ||||
-rw-r--r-- | net/openvswitch/openvswitch_trace.h | 158 | ||||
-rw-r--r-- | net/openvswitch/vport-internal_dev.c | 60 | ||||
-rw-r--r-- | net/openvswitch/vport-netdev.c | 16 | ||||
-rw-r--r-- | net/openvswitch/vport.c | 23 | ||||
-rw-r--r-- | net/openvswitch/vport.h | 8 |
20 files changed, 1843 insertions, 502 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 22d7d5604b4c..15bd287f5cbd 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -15,7 +15,7 @@ config OPENVSWITCH select NET_MPLS_GSO select DST_CACHE select NET_NSH - ---help--- + help Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features expected in a traditional hardware switch, it enables fine-grained @@ -43,7 +43,7 @@ config OPENVSWITCH_GRE depends on OPENVSWITCH depends on NET_IPGRE default OPENVSWITCH - ---help--- + help If you say Y here, then the Open vSwitch will be able create GRE vport. @@ -56,7 +56,7 @@ config OPENVSWITCH_VXLAN depends on OPENVSWITCH depends on VXLAN default OPENVSWITCH - ---help--- + help If you say Y here, then the Open vSwitch will be able create vxlan vport. Say N to exclude this support and reduce the binary size. @@ -68,7 +68,7 @@ config OPENVSWITCH_GENEVE depends on OPENVSWITCH depends on GENEVE default OPENVSWITCH - ---help--- + help If you say Y here, then the Open vSwitch will be able create geneve vport. Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 41109c326f3a..28982630bef3 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -13,6 +13,7 @@ openvswitch-y := \ flow_netlink.o \ flow_table.o \ meter.o \ + openvswitch_trace.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o @@ -24,3 +25,5 @@ endif obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o + +CFLAGS_openvswitch_trace.o = -I$(src) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 7fbfe2adfffa..ca3ebfdb3023 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -9,7 +9,6 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/openvswitch.h> -#include <linux/netfilter_ipv6.h> #include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> @@ -31,6 +30,7 @@ #include "conntrack.h" #include "vport.h" #include "flow_netlink.h" +#include "openvswitch_trace.h" struct deferred_action { struct sk_buff *skb; @@ -200,6 +200,9 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, __be32 lse; int err; + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + stack = mpls_hdr(skb); lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); err = skb_mpls_update_lse(skb, lse); @@ -278,9 +281,11 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, */ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) { - skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); + int err; + + err = skb_eth_pop(skb); + if (err) + return err; /* safe right before invalidate_flow_key */ key->mac_proto = MAC_PROTO_NONE; @@ -291,22 +296,12 @@ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_eth *ethh) { - struct ethhdr *hdr; - - /* Add the new Ethernet header */ - if (skb_cow_head(skb, ETH_HLEN) < 0) - return -ENOMEM; - - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - hdr = eth_hdr(skb); - ether_addr_copy(hdr->h_source, ethh->addresses.eth_src); - ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst); - hdr->h_proto = skb->protocol; + int err; - skb_postpush_rcsum(skb, hdr, ETH_HLEN); + err = skb_eth_push(skb, ethh->addresses.eth_dst, + ethh->addresses.eth_src); + if (err) + return err; /* safe right before invalidate_flow_key */ key->mac_proto = MAC_PROTO_ETHERNET; @@ -378,6 +373,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, update_ip_l4_checksum(skb, nh, *addr, new_addr); csum_replace4(&nh->check, *addr, new_addr); skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); *addr = new_addr; } @@ -425,15 +421,47 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, update_ipv6_checksum(skb, l4_proto, addr, new_addr); skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask) +{ + u8 old_ipv6_tclass = ipv6_get_dsfield(nh); + + ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12), + (__force __wsum)(ipv6_tclass << 12)); + + ipv6_change_dsfield(nh, ~mask, ipv6_tclass); +} + +static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask) { + u32 ofl; + + ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2]; + fl = OVS_MASKED(ofl, fl, mask); + /* Bits 21-24 are always unmasked, so this retains their values. */ - OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + nh->flow_lbl[0] = (u8)(fl >> 16); + nh->flow_lbl[1] = (u8)(fl >> 8); + nh->flow_lbl[2] = (u8)fl; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl)); +} + +static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask) +{ + new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8), + (__force __wsum)(new_ttl << 8)); + nh->hop_limit = new_ttl; } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, @@ -551,18 +579,17 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv6_tclass) { - ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass); flow_key->ip.tos = ipv6_get_dsfield(nh); } if (mask->ipv6_label) { - set_ipv6_fl(nh, ntohl(key->ipv6_label), + set_ipv6_fl(skb, nh, ntohl(key->ipv6_label), ntohl(mask->ipv6_label)); flow_key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, - mask->ipv6_hlimit); + set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; @@ -635,6 +662,7 @@ static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) { + ovs_ct_clear(skb, NULL); inet_proto_csum_replace2(check, skb, *port, new_port, false); *port = new_port; } @@ -674,6 +702,7 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, uh->dest = dst; flow_key->tp.src = src; flow_key->tp.dst = dst; + ovs_ct_clear(skb, NULL); } skb_clear_hash(skb); @@ -736,13 +765,16 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, sh->checksum = old_csum ^ old_correct_csum ^ new_csum; skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); + flow_key->tp.src = sh->source; flow_key->tp.dst = sh->dest; return 0; } -static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ovs_vport_output(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); struct vport *vport = data->vport; @@ -832,29 +864,25 @@ static void ovs_fragment(struct net *net, struct vport *vport, } if (key->eth.type == htons(ETH_P_IP)) { - struct dst_entry ovs_dst; + struct rtable ovs_rt = { 0 }; unsigned long orig_dst; prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); - dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOCOUNT); - ovs_dst.dev = vport->dev; + ovs_rt.dst.dev = vport->dev; orig_dst = skb->_skb_refdst; - skb_dst_set_noref(skb, &ovs_dst); + skb_dst_set_noref(skb, &ovs_rt.dst); IPCB(skb)->frag_max_size = mru; ip_do_fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); } else if (key->eth.type == htons(ETH_P_IPV6)) { - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); unsigned long orig_dst; struct rt6_info ovs_rt; - if (!v6ops) - goto err; - prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); memset(&ovs_rt, 0, sizeof(ovs_rt)); @@ -866,7 +894,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, skb_dst_set_noref(skb, &ovs_rt.dst); IP6CB(skb)->frag_max_size = mru; - v6ops->fragment(net, skb->sk, skb, ovs_vport_output); + ipv6_stub->ipv6_fragment(net, skb->sk, skb, ovs_vport_output); refdst_drop(orig_dst); } else { WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", @@ -925,14 +953,20 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { + a = nla_next(a, &rem)) { switch (nla_type(a)) { case OVS_USERSPACE_ATTR_USERDATA: upcall.userdata = a; break; case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); + if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, + smp_processor_id()); + else + upcall.portid = nla_get_u32(a); break; case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { @@ -964,6 +998,21 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } +static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + /* The first attribute is always 'OVS_DEC_TTL_ATTR_ACTION'. */ + struct nlattr *actions = nla_data(attr); + + if (nla_len(actions)) + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), true, false); + + consume_skb(skb); + return 0; +} + /* When 'last' is true, sample() should always consume the 'skb'. * Otherwise, sample() should keep 'skb' intact regardless what * actions are executed within sample(). @@ -984,7 +1033,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, actions = nla_next(sample_arg, &rem); if ((arg->probability != U32_MAX) && - (!arg->probability || prandom_u32() > arg->probability)) { + (!arg->probability || get_random_u32() > arg->probability)) { if (last) consume_skb(skb); return 0; @@ -1008,7 +1057,7 @@ static int clone(struct datapath *dp, struct sk_buff *skb, int rem = nla_len(attr); bool dont_clone_flow_key; - /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + /* The first action is always 'OVS_CLONE_ATTR_EXEC'. */ clone_arg = nla_data(attr); dont_clone_flow_key = nla_get_u32(clone_arg); actions = nla_next(clone_arg, &rem); @@ -1150,9 +1199,10 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, bool last) { + struct ovs_skb_cb *ovs_cb = OVS_CB(skb); const struct nlattr *actions, *cpl_arg; + int len, max_len, rem = nla_len(attr); const struct check_pkt_len_arg *arg; - int rem = nla_len(attr); bool clone_flow_key; /* The first netlink attribute in 'attr' is always @@ -1161,7 +1211,11 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, cpl_arg = nla_data(attr); arg = nla_data(cpl_arg); - if (skb->len <= arg->pkt_len) { + len = ovs_cb->mru ? ovs_cb->mru + skb->mac_len : skb->len; + max_len = arg->pkt_len; + + if ((skb_is_gso(skb) && skb_gso_validate_mac_len(skb, max_len)) || + len <= max_len) { /* Second netlink attribute in 'attr' is always * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. */ @@ -1180,6 +1234,45 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, nla_len(actions), last, clone_flow_key); } +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + if (skb->protocol == htons(ETH_P_IPV6)) { + struct ipv6hdr *nh; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + + if (nh->hop_limit <= 1) + return -EHOSTUNREACH; + + key->ip.ttl = --nh->hop_limit; + } else if (skb->protocol == htons(ETH_P_IP)) { + struct iphdr *nh; + u8 old_ttl; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + if (nh->ttl <= 1) + return -EHOSTUNREACH; + + old_ttl = nh->ttl--; + csum_replace2(&nh->check, htons(old_ttl << 8), + htons(nh->ttl << 8)); + key->ip.ttl = nh->ttl; + } + return 0; +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -1192,6 +1285,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; + if (trace_ovs_do_execute_action_enabled()) + trace_ovs_do_execute_action(dp, skb, key, a, rem); + switch (nla_type(a)) { case OVS_ACTION_ATTR_OUTPUT: { int port = nla_get_u32(a); @@ -1365,6 +1461,13 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; } + + case OVS_ACTION_ATTR_DEC_TTL: + err = execute_dec_ttl(skb, key); + if (err == -EHOSTUNREACH) + return dec_ttl_exception_handler(dp, skb, + key, a); + break; } if (unlikely(err)) { @@ -1442,8 +1545,8 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, pr_warn("%s: deferred action limit reached, drop sample action\n", ovs_dp_name(dp)); } else { /* Recirc action */ - pr_warn("%s: deferred action limit reached, drop recirc action\n", - ovs_dp_name(dp)); + pr_warn("%s: deferred action limit reached, drop recirc action (recirc_id=%#x)\n", + ovs_dp_name(dp), recirc_id); } } } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e726159cfcfa..c7b10234cf7c 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -25,6 +25,8 @@ #include <net/netfilter/nf_nat.h> #endif +#include <net/netfilter/nf_conntrack_act_ct.h> + #include "datapath.h" #include "conntrack.h" #include "flow.h" @@ -271,15 +273,13 @@ static void ovs_ct_update_key(const struct sk_buff *skb, /* This is called to initialize CT key fields possibly coming in from the local * stack. */ -void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key, + bool post_ct) { - ovs_ct_update_key(skb, NULL, key, false, false); + ovs_ct_update_key(skb, NULL, key, post_ct, false); } -#define IN6_ADDR_INITIALIZER(ADDR) \ - { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ - (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } - int ovs_ct_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, struct sk_buff *skb) { @@ -301,24 +301,30 @@ int ovs_ct_put_key(const struct sw_flow_key *swkey, if (swkey->ct_orig_proto) { if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ct_tuple_ipv4 orig = { - output->ipv4.ct_orig.src, - output->ipv4.ct_orig.dst, - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; + struct ovs_key_ct_tuple_ipv4 orig; + + memset(&orig, 0, sizeof(orig)); + orig.ipv4_src = output->ipv4.ct_orig.src; + orig.ipv4_dst = output->ipv4.ct_orig.dst; + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv4_proto = output->ct_orig_proto; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, sizeof(orig), &orig)) return -EMSGSIZE; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ct_tuple_ipv6 orig = { - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; + struct ovs_key_ct_tuple_ipv6 orig; + + memset(&orig, 0, sizeof(orig)); + memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32, + sizeof(orig.ipv6_src)); + memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32, + sizeof(orig.ipv6_dst)); + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv6_proto = output->ct_orig_proto; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, sizeof(orig), &orig)) return -EMSGSIZE; @@ -570,7 +576,7 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); nf_ct_delete(ct, 0, 0); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); } } @@ -719,7 +725,7 @@ static bool skb_nfct_cached(struct net *net, if (nf_ct_is_confirmed(ct)) nf_ct_delete(ct, 0, 0); - nf_conntrack_put(&ct->ct_general); + nf_ct_put(ct); nf_ct_set(skb, NULL, 0); return false; } @@ -728,6 +734,57 @@ static bool skb_nfct_cached(struct net *net, } #if IS_ENABLED(CONFIG_NF_NAT) +static void ovs_nat_update_key(struct sw_flow_key *key, + const struct sk_buff *skb, + enum nf_nat_manip_type maniptype) +{ + if (maniptype == NF_NAT_MANIP_SRC) { + __be16 src; + + key->ct_state |= OVS_CS_F_SRC_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.src = ip_hdr(skb)->saddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, + sizeof(key->ipv6.addr.src)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + src = udp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_TCP) + src = tcp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_SCTP) + src = sctp_hdr(skb)->source; + else + return; + + key->tp.src = src; + } else { + __be16 dst; + + key->ct_state |= OVS_CS_F_DST_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.dst = ip_hdr(skb)->daddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, + sizeof(key->ipv6.addr.dst)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + dst = udp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_TCP) + dst = tcp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_SCTP) + dst = sctp_hdr(skb)->dest; + else + return; + + key->tp.dst = dst; + } +} + /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. * Returns either NF_ACCEPT or NF_DROP. @@ -735,7 +792,7 @@ static bool skb_nfct_cached(struct net *net, static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype) + enum nf_nat_manip_type maniptype, struct sw_flow_key *key) { int hooknum, nh_off, err = NF_ACCEPT; @@ -776,7 +833,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } } /* Non-ICMP, fall thru to initialize if needed. */ - /* fall through */ + fallthrough; case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. @@ -805,61 +862,13 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, err = nf_nat_packet(ct, ctinfo, hooknum, skb); push: - skb_push(skb, nh_off); - skb_postpush_rcsum(skb, skb->data, nh_off); - - return err; -} + skb_push_rcsum(skb, nh_off); -static void ovs_nat_update_key(struct sw_flow_key *key, - const struct sk_buff *skb, - enum nf_nat_manip_type maniptype) -{ - if (maniptype == NF_NAT_MANIP_SRC) { - __be16 src; - - key->ct_state |= OVS_CS_F_SRC_NAT; - if (key->eth.type == htons(ETH_P_IP)) - key->ipv4.addr.src = ip_hdr(skb)->saddr; - else if (key->eth.type == htons(ETH_P_IPV6)) - memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, - sizeof(key->ipv6.addr.src)); - else - return; - - if (key->ip.proto == IPPROTO_UDP) - src = udp_hdr(skb)->source; - else if (key->ip.proto == IPPROTO_TCP) - src = tcp_hdr(skb)->source; - else if (key->ip.proto == IPPROTO_SCTP) - src = sctp_hdr(skb)->source; - else - return; - - key->tp.src = src; - } else { - __be16 dst; - - key->ct_state |= OVS_CS_F_DST_NAT; - if (key->eth.type == htons(ETH_P_IP)) - key->ipv4.addr.dst = ip_hdr(skb)->daddr; - else if (key->eth.type == htons(ETH_P_IPV6)) - memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, - sizeof(key->ipv6.addr.dst)); - else - return; - - if (key->ip.proto == IPPROTO_UDP) - dst = udp_hdr(skb)->dest; - else if (key->ip.proto == IPPROTO_TCP) - dst = tcp_hdr(skb)->dest; - else if (key->ip.proto == IPPROTO_SCTP) - dst = sctp_hdr(skb)->dest; - else - return; + /* Update the flow key if NAT successful. */ + if (err == NF_ACCEPT) + ovs_nat_update_key(key, skb, maniptype); - key->tp.dst = dst; - } + return err; } /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ @@ -901,23 +910,23 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, } else { return NF_ACCEPT; /* Connection is not NATed. */ } - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype, key); - if (err == NF_ACCEPT && - ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, - maniptype); + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, + maniptype, key); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL, + NF_NAT_MANIP_SRC, key); + } } - /* Mark NAT done if successful and update the flow key. */ - if (err == NF_ACCEPT) - ovs_nat_update_key(key, skb, maniptype); - return err; } #else /* !CONFIG_NF_NAT */ @@ -960,8 +969,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - if (skb_nfct(skb)) - nf_conntrack_put(skb_nfct(skb)); + ct = nf_ct_get(skb, &ctinfo); + nf_ct_put(ct); nf_conntrack_get(&tmpl->ct_general); nf_ct_set(skb, tmpl, IP_CT_NEW); } @@ -1006,7 +1015,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * connections which we will commit, we may need to attach * the helper here. */ - if (info->commit && info->helper && !nfct_help(ct)) { + if (!nf_ct_is_confirmed(ct) && info->commit && + info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) @@ -1031,6 +1041,16 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, ovs_ct_helper(skb, info->family) != NF_ACCEPT) { return -EINVAL; } + + if (nf_ct_protonum(ct) == IPPROTO_TCP && + nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) { + /* Be liberal for tcp packets so that out-of-window + * packets are not marked invalid. + */ + nf_ct_set_tcp_be_liberal(ct); + } + + nf_conn_act_ct_ext_fill(skb, ct, ctinfo); } return 0; @@ -1231,6 +1251,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &info->labels.mask); if (err) return err; + + nf_conn_act_ct_ext_add(ct); } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(ct, key, &info->labels.value, @@ -1306,8 +1328,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, else err = ovs_ct_lookup(net, key, info, skb); - skb_push(skb, nh_ofs); - skb_postpush_rcsum(skb, skb->data, nh_ofs); + skb_push_rcsum(skb, nh_ofs); if (err) kfree_skb(skb); return err; @@ -1315,11 +1336,16 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) { - if (skb_nfct(skb)) { - nf_conntrack_put(skb_nfct(skb)); - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - ovs_ct_fill_key(skb, key); - } + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + + nf_ct_put(ct); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + if (key) + ovs_ct_fill_key(skb, key, false); return 0; } @@ -1538,7 +1564,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, switch (type) { case OVS_CT_ATTR_FORCE_COMMIT: info->force = true; - /* fall through. */ + fallthrough; case OVS_CT_ATTR_COMMIT: info->commit = true; break; @@ -1705,7 +1731,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, goto err_free_ct; __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); @@ -1895,11 +1920,12 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) struct hlist_head *head = &info->limits[i]; struct ovs_ct_limit *ct_limit; - hlist_for_each_entry_rcu(ct_limit, head, hlist_node) + hlist_for_each_entry_rcu(ct_limit, head, hlist_node, + lockdep_ovsl_is_held()) kfree_rcu(ct_limit, rcu); } - kfree(ovs_net->ct_limit_info->limits); - kfree(ovs_net->ct_limit_info); + kfree(info->limits); + kfree(info); } static struct sk_buff * @@ -1957,7 +1983,8 @@ static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit, } else { struct ovs_ct_limit *ct_limit; - ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL); + ct_limit = kmalloc(sizeof(*ct_limit), + GFP_KERNEL_ACCOUNT); if (!ct_limit) return -ENOMEM; @@ -2017,16 +2044,12 @@ static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit, static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info, struct sk_buff *reply) { - struct ovs_zone_limit zone_limit; - int err; - - zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE; - zone_limit.limit = info->default_limit; - err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); - if (err) - return err; + struct ovs_zone_limit zone_limit = { + .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, + .limit = info->default_limit, + }; - return 0; + return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); } static int __ovs_ct_limit_get_zone_limit(struct net *net, @@ -2228,17 +2251,19 @@ exit_err: return err; } -static struct genl_ops ct_limit_genl_ops[] = { +static const struct genl_small_ops ct_limit_genl_ops[] = { { .cmd = OVS_CT_LIMIT_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_ct_limit_cmd_set, }, { .cmd = OVS_CT_LIMIT_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_ct_limit_cmd_del, }, { .cmd = OVS_CT_LIMIT_CMD_GET, @@ -2260,8 +2285,9 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = { .policy = ct_limit_policy, .netnsok = true, .parallel_ops = true, - .ops = ct_limit_genl_ops, - .n_ops = ARRAY_SIZE(ct_limit_genl_ops), + .small_ops = ct_limit_genl_ops, + .n_small_ops = ARRAY_SIZE(ct_limit_genl_ops), + .resv_start_op = OVS_CT_LIMIT_CMD_GET + 1, .mcgrps = &ovs_ct_limit_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 59dc32761b91..317e525c8a11 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -25,7 +25,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); -void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key, + bool post_ct); int ovs_ct_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, struct sk_buff *skb); void ovs_ct_free_action(const struct nlattr *a); @@ -74,7 +75,8 @@ static inline int ovs_ct_clear(struct sk_buff *skb, } static inline void ovs_ct_fill_key(const struct sk_buff *skb, - struct sw_flow_key *key) + struct sw_flow_key *key, + bool post_ct) { key->ct_state = 0; key->ct_zone = 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 07a7dd185995..8b84869eb2ac 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -37,12 +37,14 @@ #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/pkt_cls.h> #include "datapath.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" #include "meter.h" +#include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -130,6 +132,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); +static void ovs_dp_masks_rebalance(struct work_struct *work); + +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -163,6 +169,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); + kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } @@ -180,7 +187,7 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) head = vport_hash_bucket(dp, port_no); hlist_for_each_entry_rcu(vport, head, dp_hash_node, - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } @@ -223,25 +230,39 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); - if (unlikely(error)) - kfree_skb(skb); - else + switch (error) { + case 0: + case -EAGAIN: + case -ERESTARTSYS: + case -EINTR: consume_skb(skb); + break; + default: + kfree_skb(skb); + break; + } stats_counter = &stats->n_missed; goto out; } @@ -251,7 +272,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", - ovs_dp_name(dp), error); + ovs_dp_name(dp), error); stats_counter = &stats->n_hit; @@ -260,6 +281,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -271,6 +293,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, struct dp_stats_percpu *stats; int err; + if (trace_ovs_dp_upcall_enabled()) + trace_ovs_dp_upcall(dp, skb, key, upcall_info); + if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; @@ -298,14 +323,14 @@ err: static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, - uint32_t cutlen) + uint32_t cutlen) { unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; - BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); @@ -533,8 +558,9 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, out: if (err) skb_tx_error(skb); - kfree_skb(user_skb); - kfree_skb(nskb); + consume_skb(user_skb); + consume_skb(nskb); + return err; } @@ -648,7 +674,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, }; -static const struct genl_ops dp_packet_genl_ops[] = { +static const struct genl_small_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -664,8 +690,9 @@ static struct genl_family dp_packet_genl_family __ro_after_init = { .policy = packet_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_packet_genl_ops, - .n_ops = ARRAY_SIZE(dp_packet_genl_ops), + .small_ops = dp_packet_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops), + .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1, .module = THIS_MODULE, }; @@ -697,6 +724,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } @@ -1075,11 +1103,12 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ -static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net, - const struct nlattr *a, - const struct sw_flow_key *key, - const struct sw_flow_mask *mask, - bool log) +static noinline_for_stack +struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask, + bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; @@ -1378,7 +1407,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_flow_genl_family, reply, info); } else { - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, + PTR_ERR(reply)); } } @@ -1446,7 +1476,7 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, }; -static const struct genl_ops dp_flow_genl_ops[] = { +static const struct genl_small_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -1478,8 +1508,9 @@ static struct genl_family dp_flow_genl_family __ro_after_init = { .policy = flow_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_flow_genl_ops, - .n_ops = ARRAY_SIZE(dp_flow_genl_ops), + .small_ops = dp_flow_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops), + .resv_start_op = OVS_FLOW_CMD_SET + 1, .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -1493,6 +1524,8 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ + msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */ return msgsize; } @@ -1504,10 +1537,11 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; struct ovs_dp_megaflow_stats dp_megaflow_stats; - int err; + struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids); + int err, pids_len; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, - flags, cmd); + flags, cmd); if (!ovs_header) goto error; @@ -1530,6 +1564,16 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { + pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32); + if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids)) + goto nla_put_failure; + } + genlmsg_end(skb, ovs_header); return 0; @@ -1562,30 +1606,85 @@ static struct datapath *lookup_datapath(struct net *net, return dp ? dp : ERR_PTR(-ENODEV); } -static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +static void ovs_dp_reset_user_features(struct sk_buff *skb, + struct genl_info *info) { struct datapath *dp; - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, + info->attrs); if (IS_ERR(dp)) return; - WARN(dp->user_features, "Dropping previously announced user features\n"); + pr_warn("%s: Dropping previously announced user features\n", + ovs_dp_name(dp)); dp->user_features = 0; } -DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { - u32 user_features = 0; + u32 user_features = 0, old_features = dp->user_features; + int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | - OVS_DP_F_TC_RECIRC_SHARING)) + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) @@ -1594,12 +1693,33 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) #endif } + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + dp->user_features = user_features; - if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) - static_branch_enable(&tc_recirc_sharing_support); - else - static_branch_disable(&tc_recirc_sharing_support); + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + + if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && + !(old_features & OVS_DP_F_TC_RECIRC_SHARING)) + tc_skb_ext_tc_enable(); + else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && + (old_features & OVS_DP_F_TC_RECIRC_SHARING)) + tc_skb_ext_tc_disable(); return 0; } @@ -1678,14 +1798,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - - err = ovs_dp_change(dp, a); - if (err) - goto err_destroy_meters; + parms.desired_ifindex = a[OVS_DP_ATTR_IFINDEX] + ? nla_get_u32(a[OVS_DP_ATTR_IFINDEX]) : 0; /* So far only local changes have been made, now need the lock. */ ovs_lock(); + err = ovs_dp_change(dp, a); + if (err) + goto err_unlock_and_destroy_meters; + vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); @@ -1701,8 +1823,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - ovs_unlock(); - goto err_destroy_meters; + goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1717,7 +1838,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_meters: +err_destroy_portids: + kfree(rcu_dereference_raw(dp->upcall_portids)); +err_unlock_and_destroy_meters: + ovs_unlock(); ovs_meters_exit(dp); err_destroy_ports: kfree(dp->ports); @@ -1736,8 +1860,12 @@ err: /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { + struct flow_table *table = &dp->table; int i; + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) + tc_skb_ext_tc_disable(); + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; @@ -1754,7 +1882,14 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); - /* RCU destroy the flow table */ + /* Flush sw_flow in the tables. RCU cb only releases resource + * such as dp, ports and tables. That may avoid some issues + * such as RCU usage warning. + */ + table_instance_flow_flush(table, ovsl_dereference(table->ti), + ovsl_dereference(table->ufid_ti)); + + /* RCU destroy the ports, meters and flow tables. */ call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1882,9 +2017,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), + [OVS_DP_ATTR_IFINDEX] = {.type = NLA_U32 }, }; -static const struct genl_ops dp_datapath_genl_ops[] = { +static const struct genl_small_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -1916,8 +2054,9 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { .policy = datapath_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_datapath_genl_ops, - .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .small_ops = dp_datapath_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .resv_start_op = OVS_DP_CMD_SET + 1, .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -2045,7 +2184,7 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp) for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) @@ -2063,10 +2202,11 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) int i; dp->max_headroom = new_headroom; - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, - lockdep_ovsl_is_held()) + lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); + } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) @@ -2084,7 +2224,10 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; - if (a[OVS_VPORT_ATTR_IFINDEX]) + + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + + if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; port_no = a[OVS_VPORT_ATTR_PORT_NO] @@ -2121,11 +2264,12 @@ restart: } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); - parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; + parms.desired_ifindex = a[OVS_VPORT_ATTR_IFINDEX] + ? nla_get_u32(a[OVS_VPORT_ATTR_IFINDEX]) : 0; vport = new_vport(&parms); err = PTR_ERR(vport); @@ -2338,6 +2482,23 @@ out: return skb->len; } +static void ovs_dp_masks_rebalance(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, + masks_rebalance.work); + struct datapath *dp; + + ovs_lock(); + + list_for_each_entry(dp, &ovs_net->dps, list_node) + ovs_flow_masks_rebalance(&dp->table); + + ovs_unlock(); + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); +} + static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, @@ -2349,7 +2510,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, }; -static const struct genl_ops dp_vport_genl_ops[] = { +static const struct genl_small_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ @@ -2381,8 +2542,9 @@ struct genl_family dp_vport_genl_family __ro_after_init = { .policy = vport_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_vport_genl_ops, - .n_ops = ARRAY_SIZE(dp_vport_genl_ops), + .small_ops = dp_vport_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), + .resv_start_op = OVS_VPORT_CMD_SET + 1, .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -2429,10 +2591,19 @@ error: static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); - return ovs_ct_init(net); + INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); + + err = ovs_ct_init(net); + if (err) + return err; + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); + return 0; } static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, @@ -2466,8 +2637,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); - ovs_ct_exit(dnet); ovs_lock(); + + ovs_ct_exit(dnet); + list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); @@ -2484,6 +2657,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) ovs_unlock(); + cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } @@ -2498,7 +2672,8 @@ static int __init dp_init(void) { int err; - BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > + sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e239a46c2f94..0cd29971a907 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -20,8 +20,9 @@ #include "meter.h" #include "vport-internal_dev.h" -#define DP_MAX_PORTS USHRT_MAX -#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MASKS_REBALANCE_INTERVAL 4000 /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given @@ -37,16 +38,34 @@ * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; + u64 n_cache_hit; struct u64_stats_sync syncp; }; /** + * struct dp_nlsk_pids - array of netlink portids of for a datapath. + * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU + * is enabled and must be protected by rcu. + * @rcu: RCU callback head for deferred destruction. + * @n_pids: Size of @pids array. + * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets + * that miss the flow table. + */ +struct dp_nlsk_pids { + struct rcu_head rcu; + u32 n_pids; + u32 pids[]; +}; + +/** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. @@ -57,6 +76,7 @@ struct dp_stats_percpu { * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. + * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -82,7 +102,9 @@ struct datapath { u32 max_headroom; /* Switch meters. */ - struct hlist_head *meters; + struct dp_meter_table meter_tbl; + + struct dp_nlsk_pids __rcu *upcall_portids; }; /** @@ -131,6 +153,7 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; + struct delayed_work masks_rebalance; #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif @@ -230,14 +253,14 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support); - void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); + const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9d375e74b607..e20d1a973417 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -34,6 +34,8 @@ #include <net/mpls.h> #include <net/ndisc.h> #include <net/nsh.h> +#include <net/pkt_cls.h> +#include <net/netfilter/nf_conntrack_zones.h> #include "conntrack.h" #include "datapath.h" @@ -239,6 +241,144 @@ static bool icmphdr_ok(struct sk_buff *skb) sizeof(struct icmphdr)); } +/** + * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags. + * + * @skb: buffer where extension header data starts in packet + * @nh: ipv6 header + * @ext_hdrs: flags are stored here + * + * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header + * is unexpectedly encountered. (Two destination options headers may be + * expected and would not cause this bit to be set.) + * + * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order + * preferred (but not required) by RFC 2460: + * + * When more than one extension header is used in the same packet, it is + * recommended that those headers appear in the following order: + * IPv6 header + * Hop-by-Hop Options header + * Destination Options header + * Routing header + * Fragment header + * Authentication header + * Encapsulating Security Payload header + * Destination Options header + * upper-layer header + */ +static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh, + u16 *ext_hdrs) +{ + u8 next_type = nh->nexthdr; + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + int dest_options_header_count = 0; + + *ext_hdrs = 0; + + while (ipv6_ext_hdr(next_type)) { + struct ipv6_opt_hdr _hdr, *hp; + + switch (next_type) { + case IPPROTO_NONE: + *ext_hdrs |= OFPIEH12_NONEXT; + /* stop parsing */ + return; + + case IPPROTO_ESP: + if (*ext_hdrs & OFPIEH12_ESP) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | IPPROTO_FRAGMENT | + OFPIEH12_AUTH | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ESP; + break; + + case IPPROTO_AH: + if (*ext_hdrs & OFPIEH12_AUTH) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER | + IPPROTO_FRAGMENT | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_AUTH; + break; + + case IPPROTO_DSTOPTS: + if (dest_options_header_count == 0) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_UNREP)) + *ext_hdrs |= OFPIEH12_UNSEQ; + *ext_hdrs |= OFPIEH12_DEST; + } else if (dest_options_header_count == 1) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | OFPIEH12_FRAG | + OFPIEH12_AUTH | OFPIEH12_ESP | + OFPIEH12_UNREP)) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + } else { + *ext_hdrs |= OFPIEH12_UNREP; + } + dest_options_header_count++; + break; + + case IPPROTO_FRAGMENT: + if (*ext_hdrs & OFPIEH12_FRAG) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_ROUTER | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_FRAG; + break; + + case IPPROTO_ROUTING: + if (*ext_hdrs & OFPIEH12_ROUTER) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ROUTER; + break; + + case IPPROTO_HOPOPTS: + if (*ext_hdrs & OFPIEH12_HOP) + *ext_hdrs |= OFPIEH12_UNREP; + /* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6 + * extension header is present as the first + * extension header in the packet. + */ + if (*ext_hdrs == 0) + *ext_hdrs |= OFPIEH12_HOP; + else + *ext_hdrs |= OFPIEH12_UNSEQ; + break; + + default: + return; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (!hp) + break; + next_type = hp->nexthdr; + start += ipv6_optlen(hp); + } +} + static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned short frag_off; @@ -254,6 +394,8 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) nh = ipv6_hdr(skb); + get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs); + key->ip.proto = NEXTHDR_NONE; key->ip.tos = ipv6_get_dsfield(nh); key->ip.ttl = nh->hop_limit; @@ -265,7 +407,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) if (flags & IP6_FH_F_FRAG) { if (frag_off) { key->ip.frag = OVS_FRAG_TYPE_LATER; - key->ip.proto = nexthdr; + key->ip.proto = NEXTHDR_FRAGMENT; return 0; } key->ip.frag = OVS_FRAG_TYPE_FIRST; @@ -293,10 +435,14 @@ static bool icmp6hdr_ok(struct sk_buff *skb) } /** - * Parse vlan tag from vlan header. - * Returns ERROR on memory error. - * Returns 0 if it encounters a non-vlan or incomplete packet. - * Returns 1 after successfully parsing vlan tag. + * parse_vlan_tag - Parse vlan tag from vlan header. + * @skb: skb containing frame to parse + * @key_vh: pointer to parsed vlan tag + * @untag_vlan: should the vlan header be removed from the frame + * + * Return: ERROR on memory error. + * %0 if it encounters a non-vlan or incomplete packet. + * %1 after successfully parsing vlan tag. */ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, bool untag_vlan) @@ -528,6 +674,7 @@ static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) * L3 header * @key: output flow key * + * Return: %0 if successful, otherwise a negative errno value. */ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) { @@ -675,7 +822,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) case -EINVAL: memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); - /* fall-through */ + fallthrough; case -EPROTO: skb->transport_header = skb->network_header; error = 0; @@ -744,8 +891,6 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) * * The caller must ensure that skb->len >= ETH_HLEN. * - * Returns 0 if successful, otherwise a negative errno value. - * * Initializes @skb header fields as follows: * * - skb->mac_header: the L2 header. @@ -760,6 +905,8 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) * * - skb->protocol: the type of the data starting at skb->network_header. * Equals to key->eth.type. + * + * Return: %0 if successful, otherwise a negative errno value. */ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { @@ -853,7 +1000,9 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) struct tc_skb_ext *tc_ext; #endif + bool post_ct = false, post_ct_snat = false, post_ct_dnat = false; int res, err; + u16 zone = 0; /* Extract metadata from packet. */ if (tun_info) { @@ -887,9 +1036,14 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->mac_proto = res; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - if (static_branch_unlikely(&tc_recirc_sharing_support)) { + if (tc_skb_ext_tc_enabled()) { tc_ext = skb_ext_find(skb, TC_SKB_EXT); key->recirc_id = tc_ext ? tc_ext->chain : 0; + OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0; + post_ct = tc_ext ? tc_ext->post_ct : false; + post_ct_snat = post_ct ? tc_ext->post_ct_snat : false; + post_ct_dnat = post_ct ? tc_ext->post_ct_dnat : false; + zone = post_ct ? tc_ext->zone : 0; } else { key->recirc_id = 0; } @@ -898,8 +1052,19 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, #endif err = key_extract(skb, key); - if (!err) - ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */ + if (!err) { + ovs_ct_fill_key(skb, key, post_ct); /* Must be after key_extract(). */ + if (post_ct) { + if (!skb_get_nfct(skb)) { + key->ct_zone = zone; + } else { + if (!post_ct_dnat) + key->ct_state &= ~OVS_CS_F_DST_NAT; + if (!post_ct_snat) + key->ct_state &= ~OVS_CS_F_SRC_NAT; + } + } + } return err; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 758a8c77f736..073ab73ffeaa 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,6 +32,19 @@ enum sw_flow_mac_proto { #define SW_FLOW_KEY_INVALID 0x80 #define MPLS_LABEL_DEPTH 3 +/* Bit definitions for IPv6 Extension Header pseudo-field. */ +enum ofp12_ipv6exthdr_flags { + OFPIEH12_NONEXT = 1 << 0, /* "No next header" encountered. */ + OFPIEH12_ESP = 1 << 1, /* Encrypted Sec Payload header present. */ + OFPIEH12_AUTH = 1 << 2, /* Authentication header present. */ + OFPIEH12_DEST = 1 << 3, /* 1 or 2 dest headers present. */ + OFPIEH12_FRAG = 1 << 4, /* Fragment header present. */ + OFPIEH12_ROUTER = 1 << 5, /* Router header present. */ + OFPIEH12_HOP = 1 << 6, /* Hop-by-hop header present. */ + OFPIEH12_UNREP = 1 << 7, /* Unexpected repeats encountered. */ + OFPIEH12_UNSEQ = 1 << 8 /* Unexpected sequencing encountered. */ +}; + /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -121,6 +134,7 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ + u16 exthdrs; /* IPv6 extension header flags */ union { struct { struct in6_addr src; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 288122eec7c8..4a07ab094a84 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -80,6 +80,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_METER: case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: default: return true; } @@ -345,7 +346,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); + BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 32); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -368,7 +369,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ + + nla_total_size(28) /* OVS_KEY_ATTR_ND */ + + nla_total_size(2); /* OVS_KEY_ATTR_IPV6_EXTHDRS */ } static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { @@ -436,6 +438,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, .next = ovs_nsh_key_attr_lens, }, + [OVS_KEY_ATTR_IPV6_EXTHDRS] = { + .len = sizeof(struct ovs_key_ipv6_exthdrs) }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -478,7 +482,14 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, return -EINVAL; } - if (attrs & (1 << type)) { + if (type == OVS_KEY_ATTR_PACKET_TYPE || + type == OVS_KEY_ATTR_ND_EXTENSIONS || + type == OVS_KEY_ATTR_TUNNEL_INFO) { + OVS_NLERR(log, "Key type %d is not supported", type); + return -EINVAL; + } + + if (attrs & (1ULL << type)) { OVS_NLERR(log, "Duplicate key (type %d).", type); return -EINVAL; } @@ -491,7 +502,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { - attrs |= 1 << type; + attrs |= 1ULL << type; a[type] = nla; } } @@ -1596,6 +1607,17 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_IPV6); } + if (attrs & (1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS)) { + const struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; + + ipv6_exthdrs_key = nla_data(a[OVS_KEY_ATTR_IPV6_EXTHDRS]); + + SW_FLOW_KEY_PUT(match, ipv6.exthdrs, + ipv6_exthdrs_key->hdrs, is_mask); + + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS); + } + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { const struct ovs_key_arp *arp_key; @@ -1762,11 +1784,11 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * does not include any don't care bit. * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. - * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * @nla_key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation * of this flow. - * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink - * attribute specifies the mask field of the wildcarded flow. + * @nla_mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* + * Netlink attribute specifies the mask field of the wildcarded flow. * @log: Boolean to allow kernel error logging. Normally true, but when * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. @@ -2098,6 +2120,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv4_key->ipv4_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; + struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); if (!nla) @@ -2112,6 +2135,13 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6_EXTHDRS, + sizeof(*ipv6_exthdrs_key)); + if (!nla) + goto nla_put_failure; + ipv6_exthdrs_key = nla_data(nla); + ipv6_exthdrs_key->hdrs = output->ipv6.exthdrs; } else if (swkey->eth.type == htons(ETH_P_NSH)) { if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) goto nla_put_failure; @@ -2200,8 +2230,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, icmpv6_key->icmpv6_type = ntohs(output->tp.src); icmpv6_key->icmpv6_code = ntohs(output->tp.dst); - if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + if (swkey->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + swkey->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { struct ovs_key_nd *nd_key; nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); @@ -2287,6 +2317,62 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size) return sfa; } +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len); + +static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action) +{ + const struct nlattr *a; + int rem; + + nla_for_each_nested(a, action, rem) { + switch (nla_type(a)) { + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL: + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } + } +} + +static void ovs_nla_free_clone_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_CLONE_ATTR_EXEC: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + +static void ovs_nla_free_dec_ttl_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } +} + +static void ovs_nla_free_sample_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_ARG: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + static void ovs_nla_free_set_action(const struct nlattr *a) { const struct nlattr *ovs_key = nla_data(a); @@ -2300,25 +2386,54 @@ static void ovs_nla_free_set_action(const struct nlattr *a) } } -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len) { const struct nlattr *a; int rem; - if (!sf_acts) + /* Whenever new actions are added, the need to update this + * function should be considered. + */ + BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23); + + if (!actions) return; - nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + nla_for_each_attr(a, actions, len, rem) { switch (nla_type(a)) { - case OVS_ACTION_ATTR_SET: - ovs_nla_free_set_action(a); + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + ovs_nla_free_check_pkt_len_action(a); break; + + case OVS_ACTION_ATTR_CLONE: + ovs_nla_free_clone_action(a); + break; + case OVS_ACTION_ATTR_CT: ovs_ct_free_action(a); break; + + case OVS_ACTION_ATTR_DEC_TTL: + ovs_nla_free_dec_ttl_action(a); + break; + + case OVS_ACTION_ATTR_SAMPLE: + ovs_nla_free_sample_action(a); + break; + + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; } } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + if (!sf_acts) + return; + ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len); kfree(sf_acts); } @@ -2350,7 +2465,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { OVS_NLERR(log, "Flow action size exceeds max %u", MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); @@ -2495,6 +2610,63 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return 0; } +static int validate_and_copy_dec_ttl(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) +{ + const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; + int start, action_start, err, rem; + const struct nlattr *a, *actions; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + /* Ignore unknown attributes to be future proof. */ + if (type > OVS_DEC_TTL_ATTR_MAX) + continue; + + if (!type || attrs[type]) { + OVS_NLERR(log, "Duplicate or invalid key (type %d).", + type); + return -EINVAL; + } + + attrs[type] = a; + } + + if (rem) { + OVS_NLERR(log, "Message has %d unknown bytes.", rem); + return -EINVAL; + } + + actions = attrs[OVS_DEC_TTL_ATTR_ACTION]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) { + OVS_NLERR(log, "Missing valid actions attribute."); + return -EINVAL; + } + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); + if (start < 0) + return start; + + action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); + if (action_start < 0) + return action_start; + + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, + vlan_tci, mpls_label_count, log); + if (err) + return err; + + add_nested_action_end(*sfa, action_start); + add_nested_action_end(*sfa, start); + return 0; +} + static int validate_and_copy_clone(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, @@ -3009,6 +3181,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_CLONE] = (u32)-1, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), + [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3131,7 +3304,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Disallow subsequent L2.5+ set actions and mpls_pop * actions once the last MPLS label in the packet is - * is popped as there is no check here to ensure that + * popped as there is no check here to ensure that * the new eth type is valid and thus set actions could * write off the end of the packet or otherwise corrupt * it. @@ -3269,6 +3442,15 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, break; } + case OVS_ACTION_ATTR_DEC_TTL: + err = validate_and_copy_dec_ttl(net, a, key, sfa, + eth_type, vlan_tci, + mpls_label_count, log); + if (err) + return err; + skip_copy = true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3361,7 +3543,9 @@ static int clone_action_to_attr(const struct nlattr *attr, if (!start) return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(attr), rem, skb); + /* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */ + attr = nla_next(nla_data(attr), &rem); + err = ovs_nla_put_actions(attr, rem, skb); if (err) nla_nest_cancel(skb, start); @@ -3440,6 +3624,48 @@ out: return err; } +static int dec_ttl_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start, *action_start; + const struct nlattr *a; + int err = 0, rem; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); + if (!start) + return -EMSGSIZE; + + nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + + action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION); + if (!action_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + goto out; + + nla_nest_end(skb, action_start); + break; + + default: + /* Ignore all other option to be future compatible */ + break; + } + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3540,6 +3766,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_DEC_TTL: + err = dec_ttl_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index fd8a01ca7a2d..d4a2db0b2299 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -29,6 +29,7 @@ #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/rculist.h> +#include <linux/sort.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -37,8 +38,8 @@ #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; @@ -110,12 +111,16 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) - ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); + ovs_nla_free_flow_actions((struct sw_flow_actions __force *) + flow->sf_acts); /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) + for (cpu = 0; cpu < nr_cpu_ids; + cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); + } + kmem_cache_free(flow_cache, flow); } @@ -163,22 +168,76 @@ static struct table_instance *table_instance_alloc(int new_size) ti->n_buckets = new_size; ti->node_ver = 0; - ti->keep_flows = false; get_random_bytes(&ti->hash_seed, sizeof(u32)); return ti; } +static void __mask_array_destroy(struct mask_array *ma) +{ + free_percpu(ma->masks_usage_stats); + kfree(ma); +} + +static void mask_array_rcu_cb(struct rcu_head *rcu) +{ + struct mask_array *ma = container_of(rcu, struct mask_array, rcu); + + __mask_array_destroy(ma); +} + +static void tbl_mask_array_reset_counters(struct mask_array *ma) +{ + int i, cpu; + + /* As the per CPU counters are not atomic we can not go ahead and + * reset them from another CPU. To be able to still have an approximate + * zero based counter we store the value at reset, and subtract it + * later when processing. + */ + for (i = 0; i < ma->max; i++) { + ma->masks_usage_zero_cntr[i] = 0; + + for_each_possible_cpu(cpu) { + struct mask_array_stats *stats; + unsigned int start; + u64 counter; + + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + ma->masks_usage_zero_cntr[i] += counter; + } + } +} + static struct mask_array *tbl_mask_array_alloc(int size) { struct mask_array *new; size = max(MASK_ARRAY_SIZE_MIN, size); new = kzalloc(sizeof(struct mask_array) + - sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + sizeof(struct sw_flow_mask *) * size + + sizeof(u64) * size, GFP_KERNEL); if (!new) return NULL; + new->masks_usage_zero_cntr = (u64 *)((u8 *)new + + sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * + size); + + new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) + + sizeof(u64) * size, + __alignof__(u64)); + if (!new->masks_usage_stats) { + kfree(new); + return NULL; + } + new->count = 0; new->max = size; @@ -202,10 +261,10 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size) if (ovsl_dereference(old->masks[i])) new->masks[new->count++] = old->masks[i]; } + call_rcu(&old->rcu, mask_array_rcu_cb); } rcu_assign_pointer(tbl->mask_array, new); - kfree_rcu(old, rcu); return 0; } @@ -218,17 +277,22 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl, if (ma_count >= ma->max) { err = tbl_mask_array_realloc(tbl, ma->max + - MASK_ARRAY_SIZE_MIN); + MASK_ARRAY_SIZE_MIN); if (err) return err; ma = ovsl_dereference(tbl->mask_array); + } else { + /* On every add or delete we need to reset the counters so + * every new mask gets a fair chance of being prioritized. + */ + tbl_mask_array_reset_counters(ma); } BUG_ON(ovsl_dereference(ma->masks[ma_count])); rcu_assign_pointer(ma->masks[ma_count], new); - WRITE_ONCE(ma->count, ma_count +1); + WRITE_ONCE(ma->count, ma_count + 1); return 0; } @@ -249,10 +313,10 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl, return; found: - WRITE_ONCE(ma->count, ma_count -1); + WRITE_ONCE(ma->count, ma_count - 1); - rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); - RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]); + RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL); kfree_rcu(mask, rcu); @@ -260,6 +324,9 @@ found: if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && ma_count <= (ma->max / 3)) tbl_mask_array_realloc(tbl, ma->max / 2); + else + tbl_mask_array_reset_counters(ma); + } /* Remove 'mask' from the mask list, if it is not needed any more. */ @@ -278,15 +345,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) } } +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. + */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; struct mask_array *ma; - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, - __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -304,6 +435,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -312,77 +444,71 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); free_mask_array: - kfree(ma); + __mask_array_destroy(ma); free_mask_cache: - free_percpu(table->mask_cache); + __mask_cache_destroy(mc); return -ENOMEM; } static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { - struct table_instance *ti = container_of(rcu, struct table_instance, rcu); + struct table_instance *ti; + ti = container_of(rcu, struct table_instance, rcu); __table_instance_destroy(ti); } static void table_instance_flow_free(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - struct sw_flow *flow, - bool count) + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow) { hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - if (count) - table->count--; + table->count--; if (ovs_identifier_is_ufid(&flow->id)) { hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - - if (count) - table->ufid_count--; + table->ufid_count--; } flow_mask_remove(table, flow->mask); } -static void table_instance_destroy(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - bool deferred) +/* Must be called with OVS mutex held. */ +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti) { int i; - if (!ti) - return; - - BUG_ON(!ufid_ti); - if (ti->keep_flows) - goto skip_flows; - for (i = 0; i < ti->n_buckets; i++) { - struct sw_flow *flow; struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; + struct sw_flow *flow; hlist_for_each_entry_safe(flow, n, head, flow_table.node[ti->node_ver]) { table_instance_flow_free(table, ti, ufid_ti, - flow, false); - ovs_flow_free(flow, deferred); + flow); + ovs_flow_free(flow, true); } } -skip_flows: - if (deferred) { - call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); - } else { - __table_instance_destroy(ti); - __table_instance_destroy(ufid_ti); + if (WARN_ON(table->count != 0 || + table->ufid_count != 0)) { + table->count = 0; + table->ufid_count = 0; } } +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti) +{ + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); +} + /* No need for locking this function is called from RCU callback or * error path. */ @@ -390,10 +516,12 @@ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); + struct mask_array *ma = rcu_dereference_raw(table->mask_array); - free_percpu(table->mask_cache); - kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); - table_instance_destroy(table, ti, ufid_ti, false); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); + table_instance_destroy(ti, ufid_ti); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -462,16 +590,16 @@ static void flow_table_copy_flows(struct table_instance *old, struct hlist_head *head = &old->buckets[i]; if (ufid) - hlist_for_each_entry(flow, head, - ufid_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + ufid_table.node[old_ver], + lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else - hlist_for_each_entry(flow, head, - flow_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + flow_table.node[old_ver], + lockdep_ovsl_is_held()) table_instance_insert(new, flow); } - - old->keep_flows = true; } static struct table_instance *table_instance_rehash(struct table_instance *ti, @@ -506,10 +634,9 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) rcu_assign_pointer(flow_table->ti, new_ti); rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); flow_table->last_rehash = jiffies; - flow_table->count = 0; - flow_table->ufid_count = 0; - table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); + table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); + table_instance_destroy(old_ti, old_ufid_ti); return 0; err_free_ti: @@ -534,7 +661,7 @@ static int flow_key_start(const struct sw_flow_key *key) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), - sizeof(long)); + sizeof(long)); } static bool cmp_key(const struct sw_flow_key *key1, @@ -543,13 +670,13 @@ static bool cmp_key(const struct sw_flow_key *key1, { const long *cp1 = (const long *)((const u8 *)key1 + key_start); const long *cp2 = (const long *)((const u8 *)key2 + key_start); - long diffs = 0; int i; - for (i = key_start; i < key_end; i += sizeof(long)) - diffs |= *cp1++ ^ *cp2++; + for (i = key_start; i < key_end; i += sizeof(long)) + if (*cp1++ ^ *cp2++) + return false; - return diffs == 0; + return true; } static bool flow_cmp_masked_key(const struct sw_flow *flow, @@ -586,7 +713,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, (*n_mask_hit)++; hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; @@ -596,14 +723,18 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, /* Flow lookup does full lookup on flow table. It starts with * mask from index passed in *index. + * This function MUST be called with BH disabled due to the use + * of CPU specific variables. */ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, + u32 *n_cache_hit, u32 *index) { + struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); struct sw_flow *flow; struct sw_flow_mask *mask; int i; @@ -612,8 +743,13 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, mask = rcu_dereference_ovsl(ma->masks[*index]); if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); - if (flow) + if (flow) { + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); + (*n_cache_hit)++; return flow; + } } } @@ -629,6 +765,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); return flow; } } @@ -646,8 +785,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, - u32 *n_mask_hit) + u32 *n_mask_hit, + u32 *n_cache_hit) { + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; @@ -656,10 +797,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, int seg; *n_mask_hit = 0; - if (unlikely(!skb_hash)) { + *n_cache_hit = 0; + if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; + u32 cache = 0; - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash @@ -670,17 +814,17 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, ce = NULL; hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); + entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); + int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); + n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; @@ -693,10 +837,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, } /* Cache miss, do full lookup. */ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); if (flow) ce->skb_hash = skb_hash; + *n_cache_hit = 0; return flow; } @@ -706,9 +852,18 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; + struct sw_flow *flow; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); + /* This function gets called trough the netlink interface and therefore + * is preemptible. However, flow_lookup() function needs to be called + * with BH disabled due to CPU specific variables. + */ + local_bh_disable(); + flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); + local_bh_enable(); + return flow; } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -752,7 +907,8 @@ static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); } -bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) +bool ovs_flow_cmp(const struct sw_flow *flow, + const struct sw_flow_match *match) { if (ovs_identifier_is_ufid(&flow->id)) return flow_cmp_masked_key(flow, match->key, &match->range); @@ -771,7 +927,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, hash = ufid_hash(ufid); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], - lockdep_ovsl_is_held()) { + lockdep_ovsl_is_held()) { if (flow->ufid_table.hash == hash && ovs_flow_cmp_ufid(flow, ufid)) return flow; @@ -785,6 +941,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return READ_ONCE(ma->count); } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { @@ -798,7 +961,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); - table_instance_flow_free(table, ti, ufid_ti, flow, true); + table_instance_flow_free(table, ti, ufid_ti, flow); } static struct sw_flow_mask *mask_alloc(void) @@ -932,6 +1095,99 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, return 0; } +static int compare_mask_and_count(const void *a, const void *b) +{ + const struct mask_count *mc_a = a; + const struct mask_count *mc_b = b; + + return (s64)mc_b->counter - (s64)mc_a->counter; +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_masks_rebalance(struct flow_table *table) +{ + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + struct mask_count *masks_and_count; + struct mask_array *new; + int masks_entries = 0; + int i; + + /* Build array of all current entries with use counters. */ + masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), + GFP_KERNEL); + if (!masks_and_count) + return; + + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *mask; + int cpu; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + masks_and_count[i].index = i; + masks_and_count[i].counter = 0; + + for_each_possible_cpu(cpu) { + struct mask_array_stats *stats; + unsigned int start; + u64 counter; + + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry_irq(&stats->syncp, + start)); + + masks_and_count[i].counter += counter; + } + + /* Subtract the zero count value. */ + masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i]; + + /* Rather than calling tbl_mask_array_reset_counters() + * below when no change is needed, do it inline here. + */ + ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter; + } + + if (i == 0) + goto free_mask_entries; + + /* Sort the entries */ + masks_entries = i; + sort(masks_and_count, masks_entries, sizeof(*masks_and_count), + compare_mask_and_count, NULL); + + /* If the order is the same, nothing to do... */ + for (i = 0; i < masks_entries; i++) { + if (i != masks_and_count[i].index) + break; + } + if (i == masks_entries) + goto free_mask_entries; + + /* Rebuilt the new list in order of usage. */ + new = tbl_mask_array_alloc(ma->max); + if (!new) + goto free_mask_entries; + + for (i = 0; i < masks_entries; i++) { + int index = masks_and_count[i].index; + + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; + } + + rcu_assign_pointer(table->mask_array, new); + call_rcu(&ma->rcu, mask_array_rcu_cb); + +free_mask_entries: + kfree(masks_and_count); +} + /* Initializes the flow module. * Returns zero if successful or a negative error code. */ int ovs_flow_init(void) diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 8a5cea6ae111..9e659db78c05 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -27,9 +27,27 @@ struct mask_cache_entry { u32 mask_index; }; +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. */ + struct mask_cache_entry __percpu *mask_cache; +}; + +struct mask_count { + int index; + u64 counter; +}; + +struct mask_array_stats { + struct u64_stats_sync syncp; + u64 usage_cntrs[]; +}; + struct mask_array { struct rcu_head rcu; int count, max; + struct mask_array_stats __percpu *masks_usage_stats; + u64 *masks_usage_zero_cntr; struct sw_flow_mask __rcu *masks[]; }; @@ -39,13 +57,12 @@ struct table_instance { struct rcu_head rcu; int node_ver; u32 hash_seed; - bool keep_flows; }; struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; + struct mask_cache __rcu *mask_cache; struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; @@ -69,12 +86,15 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 skb_hash, - u32 *n_mask_hit); + u32 *n_mask_hit, + u32 *n_cache_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -86,4 +106,10 @@ bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask); + +void ovs_flow_masks_rebalance(struct flow_table *table); +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti); + #endif /* flow_table.h */ diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 5010d1ddd4bd..6e38f68f88c2 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -19,8 +19,6 @@ #include "datapath.h" #include "meter.h" -#define METER_HASH_BUCKETS 1024 - static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + static void ovs_meter_free(struct dp_meter *meter) { if (!meter) @@ -47,40 +50,162 @@ static void ovs_meter_free(struct dp_meter *meter) kfree_rcu(meter, rcu); } -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - /* Call with ovs_mutex or RCU read lock. */ -static struct dp_meter *lookup_meter(const struct datapath *dp, +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; - struct hlist_head *head; - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node, - lockdep_ovsl_is_held()) { - if (meter->id == meter_id) - return meter; - } + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + return NULL; } -static void attach_meter(struct datapath *dp, struct dp_meter *meter) +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(sizeof(*ti) + + sizeof(struct dp_meter *) * size, + GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) +{ + struct dp_meter_instance *ti; + + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); +} + +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + new_ti->dp_meters[i] = ti->dp_meters[i]; + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) { - struct hlist_head *head = meter_hash_bucket(dp, meter->id); + u32 hash; + + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); +} + +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + int err; + + /* In generally, slots selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EBUSY; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + tbl->count++; + if (tbl->count >= tbl->max_meters_allowed) { + err = -EFBIG; + goto attach_err; + } + + if (tbl->count >= ti->n_meters && + dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { + err = -ENOMEM; + goto attach_err; + } + + return 0; - hlist_add_head_rcu(&meter->dp_hash_node, head); +attach_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return err; } -static void detach_meter(struct dp_meter *meter) +static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { + struct dp_meter_instance *ti; + ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); + if (!meter) + return 0; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + /* Avoid hash collision, don't move slots to other place. + * Make sure there are no references of meters in array + * which will be released. + */ + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + goto out; + + if (dp_meter_instance_realloc(tbl, half_size)) + goto shrink_err; + } + +out: + return 0; + +shrink_err: + dp_meter_instance_insert(ti, meter); + tbl->count++; + return -ENOMEM; } static struct sk_buff * @@ -116,12 +241,11 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; - if (!meter) - return 0; - if (nla_put(reply, OVS_METER_ATTR_STATS, - sizeof(struct ovs_flow_stats), &meter->stats) || - nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + sizeof(struct ovs_flow_stats), &meter->stats)) + goto error; + + if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; @@ -150,18 +274,32 @@ error: static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct sk_buff *reply; + struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; - int err; + struct sk_buff *reply; + struct datapath *dp; + int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); - if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || - nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, + dp->meter_tbl.max_meters_allowed)) + goto exit_unlock; + + ovs_unlock(); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); @@ -180,9 +318,10 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_unlock: + ovs_unlock(); nla_put_failure: nlmsg_free(reply); - err = -EMSGSIZE; return err; } @@ -204,7 +343,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ - meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL); + meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); if (!meter) return ERR_PTR(-ENOMEM); @@ -252,8 +391,8 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000; - band_max_delta_t = band->bucket / band->rate; + band->bucket = band->burst_size * 1000ULL; + band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; @@ -273,17 +412,17 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = info->userhdr; + struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; bool failed; - if (!a[OVS_METER_ATTR_ID]) { - return -ENODEV; - } + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; meter = dp_meter_create(a); - if (IS_ERR_OR_NULL(meter)) + if (IS_ERR(meter)) return PTR_ERR(meter); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, @@ -300,12 +439,18 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } + meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); + old_meter = lookup_meter(meter_tbl, meter_id); + err = detach_meter(meter_tbl, old_meter); + if (err) + goto exit_unlock; + + err = attach_meter(meter_tbl, meter); + if (err) + goto exit_unlock; + ovs_unlock(); /* Build response with the meter_id and stats from @@ -337,14 +482,14 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; @@ -365,7 +510,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) } /* Locate meter, copy stats. */ - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; @@ -390,18 +535,17 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *old_meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *old_meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); @@ -416,14 +560,19 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - old_meter = lookup_meter(dp, meter_id); + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); + + err = detach_meter(&dp->meter_tbl, old_meter); + if (err) + goto exit_unlock; } + ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); @@ -443,16 +592,16 @@ exit_unlock: bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { - struct dp_meter *meter; - struct dp_meter_band *band; long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; - u32 delta_ms; - u32 cost; + struct dp_meter_band *band; + struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; + u32 delta_ms; + u32 cost; - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; @@ -461,6 +610,14 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, spin_lock(&meter->lock); long_delta_ms = (now_ms - meter->used); /* ms */ + if (long_delta_ms < 0) { + /* This condition means that we have several threads fighting + * for a meter lock, and the one who received the packets a + * bit later wins. Assuming that all racing threads received + * packets at the same time to avoid overflow. + */ + long_delta_ms = 0; + } /* Make sure delta_ms will not be too large, so that bucket will not * wrap around below. @@ -491,7 +648,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, long long int max_bucket_size; band = &meter->bands[i]; - max_bucket_size = (band->burst_size + band->rate) * 1000LL; + max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) @@ -522,7 +679,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, return false; } -static struct genl_ops dp_meter_genl_ops[] = { +static const struct genl_small_ops dp_meter_genl_ops[] = { { .cmd = OVS_METER_CMD_FEATURES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ @@ -530,9 +687,9 @@ static struct genl_ops dp_meter_genl_ops[] = { }, { .cmd = OVS_METER_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. - */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, @@ -542,9 +699,9 @@ static struct genl_ops dp_meter_genl_ops[] = { }, { .cmd = OVS_METER_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. - */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_meter_cmd_del }, }; @@ -561,8 +718,9 @@ struct genl_family dp_meter_genl_family __ro_after_init = { .policy = meter_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_meter_genl_ops, - .n_ops = ARRAY_SIZE(dp_meter_genl_ops), + .small_ops = dp_meter_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops), + .resv_start_op = OVS_METER_CMD_GET + 1, .mcgrps = &ovs_meter_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -570,32 +728,39 @@ struct genl_family dp_meter_genl_family __ro_after_init = { int ovs_meters_init(struct datapath *dp) { - int i; - - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti; + unsigned long free_mem_bytes; - if (!dp->meters) + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) return -ENOMEM; - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); + /* Allow meters in a datapath to use ~3.12% of physical memory. */ + free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); + tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), + DP_METER_NUM_MAX); + if (!tbl->max_meters_allowed) + goto out_err; + + rcu_assign_pointer(tbl->ti, ti); + tbl->count = 0; return 0; + +out_err: + dp_meter_instance_free(ti); + return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) { + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); - kfree(dp->meters); + dp_meter_instance_free(ti); } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f645913870bd..0c33889a8515 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -13,26 +13,26 @@ #include <linux/openvswitch.h> #include <linux/genetlink.h> #include <linux/skbuff.h> +#include <linux/bits.h> #include "flow.h" struct datapath; #define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) +#define DP_METER_NUM_MAX (200000UL) struct dp_meter_band { u32 type; u32 rate; u32 burst_size; - u32 bucket; /* 1/1000 packets, or in bits */ + u64 bucket; /* 1/1000 packets, or in bits */ struct ovs_flow_stats stats; }; struct dp_meter { spinlock_t lock; /* Per meter lock */ struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ u32 id; u16 kbps:1, keep_stats:1; u16 n_bands; @@ -42,6 +42,18 @@ struct dp_meter { struct dp_meter_band bands[]; }; +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[]; +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; + u32 max_meters_allowed; +}; + extern struct genl_family dp_meter_genl_family; int ovs_meters_init(struct datapath *dp); void ovs_meters_exit(struct datapath *dp); diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c new file mode 100644 index 000000000000..62c5f7d6f023 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* bug in tracepoint.h, it should include this */ +#include <linux/module.h> + +/* sparse isn't too happy with all macros... */ +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "openvswitch_trace.h" + +#endif diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h new file mode 100644 index 000000000000..3eb35d9eb700 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM openvswitch + +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OPENVSWITCH_H + +#include <linux/tracepoint.h> + +#include "datapath.h" + +TRACE_EVENT(ovs_do_execute_action, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *a, int rem), + + TP_ARGS(dp, skb, key, a, rem), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, action_type ) + __field( unsigned int, action_len ) + __field( void *, action_data ) + __field( u8, is_last ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name, ovs_dp_name(dp)); + __assign_str(dev_name, skb->dev->name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->action_type = nla_type(a); + __entry->action_len = nla_len(a); + __entry->action_data = nla_data(a); + __entry->is_last = nla_is_last(a, rem); + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_Zone=%04x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->action_type, __entry->action_len, + __entry->action_data, __entry->is_last) +); + +TRACE_EVENT(ovs_dp_upcall, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info), + + TP_ARGS(dp, skb, key, upcall_info), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( const void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, upcall_cmd ) + __field( u32, upcall_port ) + __field( u16, upcall_mru ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name, ovs_dp_name(dp)); + __assign_str(dev_name, skb->dev->name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->upcall_cmd = upcall_info->cmd; + __entry->upcall_port = upcall_info->portid; + __entry->upcall_mru = upcall_info->mru; + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_zone=%04x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->upcall_cmd, __entry->upcall_port, + __entry->upcall_mru) +); + +#endif /* _TRACE_OPENVSWITCH_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE openvswitch_trace +#include <trace/define_trace.h> diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 58a7b8312c28..74c88a6baa43 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -35,21 +35,18 @@ internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { int len, err; + /* store len value because skb can be freed inside ovs_vport_receive() */ len = skb->len; + rcu_read_lock(); err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); - if (likely(!err)) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - } else { + if (likely(!err)) + dev_sw_netstats_tx_add(netdev, 1, len); + else netdev->stats.tx_errors++; - } + return NETDEV_TX_OK; } @@ -68,7 +65,7 @@ static int internal_dev_stop(struct net_device *netdev) static void internal_dev_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, "openvswitch", sizeof(info->driver)); + strscpy(info->driver, "openvswitch", sizeof(info->driver)); } static const struct ethtool_ops internal_dev_ethtool_ops = { @@ -83,42 +80,12 @@ static void internal_dev_destructor(struct net_device *dev) ovs_vport_free(vport); } -static void -internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) -{ - int i; - - memset(stats, 0, sizeof(*stats)); - stats->rx_errors = dev->stats.rx_errors; - stats->tx_errors = dev->stats.tx_errors; - stats->tx_dropped = dev->stats.tx_dropped; - stats->rx_dropped = dev->stats.rx_dropped; - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } -} - static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, - .ndo_get_stats64 = internal_get_stats, + .ndo_get_stats64 = dev_get_tstats64, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -180,6 +147,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) } dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + dev->ifindex = parms->desired_ifindex; internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; @@ -222,10 +190,9 @@ static void internal_dev_destroy(struct vport *vport) rtnl_unlock(); } -static netdev_tx_t internal_dev_recv(struct sk_buff *skb) +static int internal_dev_recv(struct sk_buff *skb) { struct net_device *netdev = skb->dev; - struct pcpu_sw_netstats *stats; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); @@ -240,12 +207,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - stats = this_cpu_ptr(netdev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); + dev_sw_netstats_rx_add(netdev, skb->len); netif_rx(skb); return NETDEV_TX_OK; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 57d6436e6f6a..2f61d5bdce1a 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -44,10 +44,9 @@ static void netdev_port_receive(struct sk_buff *skb) if (unlikely(!skb)) return; - if (skb->dev->type == ARPHRD_ETHER) { - skb_push(skb, ETH_HLEN); - skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - } + if (skb->dev->type == ARPHRD_ETHER) + skb_push_rcsum(skb, ETH_HLEN); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: @@ -83,7 +82,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) err = -ENODEV; goto error_free_vport; } - + netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || @@ -116,7 +115,7 @@ error_master_upper_dev_unlink: error_unlock: rtnl_unlock(); error_put: - dev_put(vport->dev); + netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); @@ -138,8 +137,7 @@ static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); - if (vport->dev) - dev_put(vport->dev); + netdev_put(vport->dev, &vport->dev_tracker); ovs_vport_free(vport); } @@ -175,7 +173,7 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev); - dev_put(vport->dev); + netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 47febb4504f0..82a74f998966 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -87,6 +87,7 @@ EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * + * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. @@ -97,7 +98,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node, - lockdep_ovsl_is_held()) + lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -110,14 +111,16 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) * * @priv_size: Size of private data area to allocate. * @ops: vport device ops + * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using - * vport_priv(). vports that are no longer needed should be released with + * vport_priv(). Some parameters of the vport will be initialized from @parms. + * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, - const struct vport_parms *parms) + const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; @@ -396,7 +399,8 @@ int ovs_vport_get_upcall_portids(const struct vport *vport, * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ -u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) +u32 ovs_vport_find_upcall_portid(const struct vport *vport, + struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; @@ -418,7 +422,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * * @vport: vport that received the packet * @skb: skb that was received - * @tun_key: tunnel (if any) that carried packet + * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. @@ -493,14 +497,17 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) if (unlikely(packet_length(skb, vport->dev) > mtu && !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - vport->dev->name, - packet_length(skb, vport->dev), mtu); vport->dev->stats.tx_errors++; + if (vport->dev->flags & IFF_UP) + net_warn_ratelimited("%s: dropped over-mtu packet: " + "%d > %d\n", vport->dev->name, + packet_length(skb, vport->dev), + mtu); goto drop; } skb->dev = vport->dev; + skb_clear_tstamp(skb); vport->ops->send(skb); return; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1eb7495ac5b4..6ff45e8a0868 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -20,7 +20,7 @@ struct vport; struct vport_parms; -/* The following definitions are for users of the vport subsytem: */ +/* The following definitions are for users of the vport subsystem: */ int ovs_vport_init(void); void ovs_vport_exit(void); @@ -58,6 +58,7 @@ struct vport_portids { /** * struct vport - one port within a datapath * @dev: Pointer to net_device. + * @dev_tracker: refcount tracker for @dev reference * @dp: Datapath to which this port belongs. * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. @@ -69,6 +70,7 @@ struct vport_portids { */ struct vport { struct net_device *dev; + netdevice_tracker dev_tracker; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -88,12 +90,14 @@ struct vport { * @type: New vport's type. * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if * none was supplied. + * @desired_ifindex: New vport's ifindex. * @dp: New vport's datapath. * @port_no: New vport's port number. */ struct vport_parms { const char *name; enum ovs_vport_type type; + int desired_ifindex; struct nlattr *options; /* For ovs_vport_alloc(). */ @@ -128,7 +132,7 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - netdev_tx_t (*send) (struct sk_buff *skb); + int (*send)(struct sk_buff *skb); struct module *owner; struct list_head list; }; |