From e7f133290660d976da8cb20e9bc7310d0cd19341 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 17 Sep 2013 09:38:23 -0700 Subject: openvswitch: Move flow table rehashing to flow install. Rehashing in ovs-workqueue can cause ovs-mutex lock contention in case of heavy flow setups, where both need ovs-mutex. So by moving rehashing to flow-setup we can eliminate contention. This also simplifies ovs locking and reduces dependence on workqueue. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 50 ++++++++++------------------------------------ net/openvswitch/datapath.h | 2 ++ 2 files changed, 13 insertions(+), 39 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 2aa13bd7f2b2..2e1a9c24e380 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -60,8 +60,6 @@ #define REHASH_FLOW_INTERVAL (10 * 60 * HZ) -static void rehash_flow_table(struct work_struct *work); -static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table); int ovs_net_id __read_mostly; @@ -1289,22 +1287,25 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* Check if this is a duplicate flow */ flow = ovs_flow_lookup(table, &key); if (!flow) { + struct flow_table *new_table = NULL; struct sw_flow_mask *mask_p; + /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) goto err_unlock_ovs; /* Expand table, if necessary, to make room. 
*/ - if (ovs_flow_tbl_need_to_expand(table)) { - struct flow_table *new_table; - + if (ovs_flow_tbl_need_to_expand(table)) new_table = ovs_flow_tbl_expand(table); - if (!IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_destroy(table, true); - table = ovsl_dereference(dp->table); - } + else if (time_after(jiffies, dp->last_rehash + REHASH_FLOW_INTERVAL)) + new_table = ovs_flow_tbl_rehash(table); + + if (new_table && !IS_ERR(new_table)) { + rcu_assign_pointer(dp->table, new_table); + ovs_flow_tbl_destroy(table, true); + table = ovsl_dereference(dp->table); + dp->last_rehash = jiffies; } /* Allocate flow. */ @@ -2336,32 +2337,6 @@ error: return err; } -static void rehash_flow_table(struct work_struct *work) -{ - struct datapath *dp; - struct net *net; - - ovs_lock(); - rtnl_lock(); - for_each_net(net) { - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - list_for_each_entry(dp, &ovs_net->dps, list_node) { - struct flow_table *old_table = ovsl_dereference(dp->table); - struct flow_table *new_table; - - new_table = ovs_flow_tbl_rehash(old_table); - if (!IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_destroy(old_table, true); - } - } - } - rtnl_unlock(); - ovs_unlock(); - schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); -} - static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); @@ -2419,8 +2394,6 @@ static int __init dp_init(void) if (err < 0) goto error_unreg_notifier; - schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); - return 0; error_unreg_notifier: @@ -2437,7 +2410,6 @@ error: static void dp_cleanup(void) { - cancel_delayed_work_sync(&rehash_flow_wq); dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4d109c176ef3..2c15541f3b46 100644 --- 
a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -62,6 +62,7 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @last_rehash: Timestamp of last rehash. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -83,6 +84,7 @@ struct datapath { /* Network namespace ref. */ struct net *net; #endif + unsigned long last_rehash; }; /** -- cgit v1.2.3-59-g8ed1b From 9db5507947659065c5ffd76e4dd243f4a26a0bbb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 23 Sep 2013 21:55:22 +0800 Subject: openvswitch: remove duplicated include from vport-vxlan.c Remove duplicated include. Signed-off-by: Wei Yongjun Signed-off-by: Jesse Gross --- net/openvswitch/vport-vxlan.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index a481c03e2861..b0da39469ec8 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From f0627cfa24389cab25c67bb7ca902912216a8a2d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 23 Sep 2013 21:56:14 +0800 Subject: openvswitch: remove duplicated include from vport-gre.c Remove duplicated include. 
Signed-off-by: Wei Yongjun Signed-off-by: Jesse Gross --- net/openvswitch/vport-gre.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index c99dea543d64..a3d6951602db 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -24,8 +24,6 @@ #include #include #include -#include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From e64457191a259537bbbfaebeba9a8043786af96f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 3 Oct 2013 18:16:47 -0700 Subject: openvswitch: Restructure datapath.c and flow.c Over time, datapath.c and flow.c have become pretty large files. The following patch restructures their functionality into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/Makefile | 2 + net/openvswitch/datapath.c | 528 +------------ net/openvswitch/datapath.h | 1 + net/openvswitch/flow.c | 1605 +--------------------------------------- net/openvswitch/flow.h | 128 +--- net/openvswitch/flow_netlink.c | 1603 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/flow_netlink.h | 60 ++ net/openvswitch/flow_table.c | 517 +++++++++++++ net/openvswitch/flow_table.h | 91 +++ 9 files changed, 2354 insertions(+), 2181 deletions(-) create mode 100644 net/openvswitch/flow_netlink.c create mode 100644 net/openvswitch/flow_netlink.h create mode 100644 net/openvswitch/flow_table.c create mode 100644 net/openvswitch/flow_table.h (limited to 'net/openvswitch') diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index ea36e99089af..3591cb5dae91 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -9,6 +9,8 @@ openvswitch-y := \ datapath.o \ dp_notify.o \ flow.o \ + flow_netlink.o \ + flow_table.o \ vport.o \ 
vport-internal_dev.o \ vport-netdev.o diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 2e1a9c24e380..72e68743c643 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -55,10 +55,10 @@ #include "datapath.h" #include "flow.h" +#include "flow_netlink.h" #include "vport-internal_dev.h" #include "vport-netdev.h" - #define REHASH_FLOW_INTERVAL (10 * 60 * HZ) int ovs_net_id __read_mostly; @@ -235,7 +235,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } /* Look up flow. */ - flow = ovs_flow_lookup(rcu_dereference(dp->table), &key); + flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -433,7 +433,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb); + ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -470,381 +470,6 @@ static int flush_flows(struct datapath *dp) return 0; } -static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len) -{ - - struct sw_flow_actions *acts; - int new_acts_size; - int req_size = NLA_ALIGN(attr_len); - int next_offset = offsetof(struct sw_flow_actions, actions) + - (*sfa)->actions_len; - - if (req_size <= (ksize(*sfa) - next_offset)) - goto out; - - new_acts_size = ksize(*sfa) * 2; - - if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) - return ERR_PTR(-EMSGSIZE); - new_acts_size = MAX_ACTIONS_BUFSIZE; - } - - acts = ovs_flow_actions_alloc(new_acts_size); - if (IS_ERR(acts)) - return (void *)acts; - - memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); - acts->actions_len = (*sfa)->actions_len; - kfree(*sfa); - *sfa = acts; - -out: - (*sfa)->actions_len += req_size; - return (struct 
nlattr *) ((unsigned char *)(*sfa) + next_offset); -} - -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) -{ - struct nlattr *a; - - a = reserve_sfa_size(sfa, nla_attr_size(len)); - if (IS_ERR(a)) - return PTR_ERR(a); - - a->nla_type = attrtype; - a->nla_len = nla_attr_size(len); - - if (data) - memcpy(nla_data(a), data, len); - memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); - - return 0; -} - -static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype) -{ - int used = (*sfa)->actions_len; - int err; - - err = add_action(sfa, attrtype, NULL, 0); - if (err) - return err; - - return used; -} - -static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset) -{ - struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset); - - a->nla_len = sfa->actions_len - st_offset; -} - -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa); - -static int validate_and_copy_sample(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) -{ - const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; - const struct nlattr *probability, *actions; - const struct nlattr *a; - int rem, start, err, st_acts; - - memset(attrs, 0, sizeof(attrs)); - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) - return -EINVAL; - attrs[type] = a; - } - if (rem) - return -EINVAL; - - probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; - if (!probability || nla_len(probability) != sizeof(u32)) - return -EINVAL; - - actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; - if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) - return -EINVAL; - - /* validation done, copy sample action. 
*/ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); - if (start < 0) - return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32)); - if (err) - return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); - if (st_acts < 0) - return st_acts; - - err = validate_and_copy_actions(actions, key, depth + 1, sfa); - if (err) - return err; - - add_nested_action_end(*sfa, st_acts); - add_nested_action_end(*sfa, start); - - return 0; -} - -static int validate_tp_port(const struct sw_flow_key *flow_key) -{ - if (flow_key->eth.type == htons(ETH_P_IP)) { - if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) - return 0; - } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { - if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) - return 0; - } - - return -EINVAL; -} - -static int validate_and_copy_set_tun(const struct nlattr *attr, - struct sw_flow_actions **sfa) -{ - struct sw_flow_match match; - struct sw_flow_key key; - int err, start; - - ovs_match_init(&match, &key, NULL); - err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false); - if (err) - return err; - - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); - if (start < 0) - return start; - - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); - add_nested_action_end(*sfa, start); - - return err; -} - -static int validate_set(const struct nlattr *a, - const struct sw_flow_key *flow_key, - struct sw_flow_actions **sfa, - bool *set_tun) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - - /* There can be only one key in a action */ - if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) - return -EINVAL; - - if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type] != nla_len(ovs_key) && - ovs_key_lens[key_type] != -1)) - return -EINVAL; - - switch (key_type) { - const struct ovs_key_ipv4 *ipv4_key; - const struct ovs_key_ipv6 *ipv6_key; - int err; - 
- case OVS_KEY_ATTR_PRIORITY: - case OVS_KEY_ATTR_SKB_MARK: - case OVS_KEY_ATTR_ETHERNET: - break; - - case OVS_KEY_ATTR_TUNNEL: - *set_tun = true; - err = validate_and_copy_set_tun(a, sfa); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) - return -EINVAL; - - if (!flow_key->ip.proto) - return -EINVAL; - - ipv4_key = nla_data(ovs_key); - if (ipv4_key->ipv4_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv4_key->ipv4_frag != flow_key->ip.frag) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) - return -EINVAL; - - if (!flow_key->ip.proto) - return -EINVAL; - - ipv6_key = nla_data(ovs_key); - if (ipv6_key->ipv6_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv6_key->ipv6_frag != flow_key->ip.frag) - return -EINVAL; - - if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_TCP: - if (flow_key->ip.proto != IPPROTO_TCP) - return -EINVAL; - - return validate_tp_port(flow_key); - - case OVS_KEY_ATTR_UDP: - if (flow_key->ip.proto != IPPROTO_UDP) - return -EINVAL; - - return validate_tp_port(flow_key); - - case OVS_KEY_ATTR_SCTP: - if (flow_key->ip.proto != IPPROTO_SCTP) - return -EINVAL; - - return validate_tp_port(flow_key); - - default: - return -EINVAL; - } - - return 0; -} - -static int validate_userspace(const struct nlattr *attr) -{ - static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { - [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, - [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, - }; - struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; - int error; - - error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, - attr, userspace_policy); - if (error) - return error; - - if (!a[OVS_USERSPACE_ATTR_PID] || - !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) - return -EINVAL; - - return 0; -} - -static int copy_action(const struct nlattr *from, - struct sw_flow_actions **sfa) -{ - 
int totlen = NLA_ALIGN(from->nla_len); - struct nlattr *to; - - to = reserve_sfa_size(sfa, from->nla_len); - if (IS_ERR(to)) - return PTR_ERR(to); - - memcpy(to, from, totlen); - return 0; -} - -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) -{ - const struct nlattr *a; - int rem, err; - - if (depth >= SAMPLE_ACTION_DEPTH) - return -EOVERFLOW; - - nla_for_each_nested(a, attr, rem) { - /* Expected argument lengths, (u32)-1 for variable length. */ - static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { - [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), - [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, - [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), - [OVS_ACTION_ATTR_POP_VLAN] = 0, - [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 - }; - const struct ovs_action_push_vlan *vlan; - int type = nla_type(a); - bool skip_copy; - - if (type > OVS_ACTION_ATTR_MAX || - (action_lens[type] != nla_len(a) && - action_lens[type] != (u32)-1)) - return -EINVAL; - - skip_copy = false; - switch (type) { - case OVS_ACTION_ATTR_UNSPEC: - return -EINVAL; - - case OVS_ACTION_ATTR_USERSPACE: - err = validate_userspace(a); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_OUTPUT: - if (nla_get_u32(a) >= DP_MAX_PORTS) - return -EINVAL; - break; - - - case OVS_ACTION_ATTR_POP_VLAN: - break; - - case OVS_ACTION_ATTR_PUSH_VLAN: - vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) - return -EINVAL; - if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) - return -EINVAL; - break; - - case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); - if (err) - return err; - skip_copy = true; - break; - - default: - return -EINVAL; - } - if (!skip_copy) { - err = copy_action(a, sfa); - if (err) - return err; - } - } - - if (rem > 
0) - return -EINVAL; - - return 0; -} - static void clear_stats(struct sw_flow *flow) { flow->used = 0; @@ -900,15 +525,16 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); + err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; - acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); + acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) goto err_flow_free; - err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); + err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + &flow->key, 0, &acts); rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; @@ -1003,100 +629,6 @@ static struct genl_multicast_group ovs_dp_flow_multicast_group = { .name = OVS_FLOW_MCGROUP }; -static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb); -static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) -{ - const struct nlattr *a; - struct nlattr *start; - int err = 0, rem; - - start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); - if (!start) - return -EMSGSIZE; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - struct nlattr *st_sample; - - switch (type) { - case OVS_SAMPLE_ATTR_PROBABILITY: - if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a))) - return -EMSGSIZE; - break; - case OVS_SAMPLE_ATTR_ACTIONS: - st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); - if (!st_sample) - return -EMSGSIZE; - err = actions_to_attr(nla_data(a), nla_len(a), skb); - if (err) - return err; - nla_nest_end(skb, st_sample); - break; - } - } - - nla_nest_end(skb, start); - return err; -} - -static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = 
nla_type(ovs_key); - struct nlattr *start; - int err; - - switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: - start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); - if (!start) - return -EMSGSIZE; - - err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); - if (err) - return err; - nla_nest_end(skb, start); - break; - default: - if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) - return -EMSGSIZE; - break; - } - - return 0; -} - -static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb) -{ - const struct nlattr *a; - int rem, err; - - nla_for_each_attr(a, attr, len, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_ACTION_ATTR_SET: - err = set_action_to_attr(a, skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: - err = sample_action_to_attr(a, skb); - if (err) - return err; - break; - default: - if (nla_put(skb, type, nla_len(a), nla_data(a))) - return -EMSGSIZE; - break; - } - } - - return 0; -} - static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) @@ -1133,8 +665,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->unmasked_key, - &flow->unmasked_key, skb); + err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); if (err) goto error; nla_nest_end(skb, nla); @@ -1143,7 +674,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb); + err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); if (err) goto error; @@ -1186,7 +717,8 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, sf_acts = rcu_dereference_check(flow->sf_acts, lockdep_ovsl_is_held()); - err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb); + err = 
ovs_nla_put_actions(sf_acts->actions, + sf_acts->actions_len, skb); if (!err) nla_nest_end(skb, start); else { @@ -1252,21 +784,21 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto error; ovs_match_init(&match, &key, &mask); - error = ovs_match_from_nlattrs(&match, - a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + error = ovs_nla_get_match(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); + acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); error = PTR_ERR(acts); if (IS_ERR(acts)) goto error; - ovs_flow_key_mask(&masked_key, &key, &mask); - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], - &masked_key, 0, &acts); + ovs_flow_mask_key(&masked_key, &key, &mask); + error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], + &masked_key, 0, &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); goto err_kfree; @@ -1285,7 +817,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) table = ovsl_dereference(dp->table); /* Check if this is a duplicate flow */ - flow = ovs_flow_lookup(table, &key); + flow = ovs_flow_tbl_lookup(table, &key); if (!flow) { struct flow_table *new_table = NULL; struct sw_flow_mask *mask_p; @@ -1336,7 +868,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. */ - ovs_flow_insert(table, flow); + ovs_flow_tbl_insert(table, flow); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, OVS_FLOW_CMD_NEW); @@ -1357,7 +889,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* The unmasked key has to be the same for flow updates. 
*/ error = -EINVAL; - if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) { + if (!ovs_flow_cmp_unmasked_key(flow, &match)) { OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); goto err_unlock_ovs; } @@ -1365,7 +897,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); - ovs_flow_deferred_free_acts(old_acts); + ovs_nla_free_flow_actions(old_acts); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, OVS_FLOW_CMD_NEW); @@ -1414,7 +946,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } ovs_match_init(&match, &key, NULL); - err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) return err; @@ -1426,8 +958,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } table = ovsl_dereference(dp->table); - flow = ovs_flow_lookup_unmasked_key(table, &match); - if (!flow) { + flow = ovs_flow_tbl_lookup(table, &key); + if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { err = -ENOENT; goto unlock; } @@ -1471,13 +1003,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } ovs_match_init(&match, &key, NULL); - err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) goto unlock; table = ovsl_dereference(dp->table); - flow = ovs_flow_lookup_unmasked_key(table, &match); - if (!flow) { + flow = ovs_flow_tbl_lookup(table, &key); + if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { err = -ENOENT; goto unlock; } @@ -1488,7 +1020,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - ovs_flow_remove(table, flow); + ovs_flow_tbl_remove(table, flow); err = ovs_flow_cmd_fill_info(flow, dp, reply, 
info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL); @@ -1524,7 +1056,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_dump_next(table, &bucket, &obj); + flow = ovs_flow_tbl_dump_next(table, &bucket, &obj); if (!flow) break; @@ -1700,7 +1232,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) } dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), - GFP_KERNEL); + GFP_KERNEL); if (!dp->ports) { err = -ENOMEM; goto err_destroy_percpu; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 2c15541f3b46..a6982ef84f20 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -27,6 +27,7 @@ #include #include "flow.h" +#include "flow_table.h" #include "vport.h" #define DP_MAX_PORTS USHRT_MAX diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 410db90db73d..617810f1a21e 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -45,202 +45,40 @@ #include #include -static struct kmem_cache *flow_cache; - -static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, - struct sw_flow_key_range *range, u8 val); - -static void update_range__(struct sw_flow_match *match, - size_t offset, size_t size, bool is_mask) +u64 ovs_flow_used_time(unsigned long flow_jiffies) { - struct sw_flow_key_range *range = NULL; - size_t start = rounddown(offset, sizeof(long)); - size_t end = roundup(offset + size, sizeof(long)); - - if (!is_mask) - range = &match->range; - else if (match->mask) - range = &match->mask->range; - - if (!range) - return; - - if (range->start == range->end) { - range->start = start; - range->end = end; - return; - } - - if (range->start > start) - range->start = start; + struct timespec cur_ts; + u64 cur_ms, idle_ms; - if (range->end < end) - range->end = end; -} + ktime_get_ts(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)cur_ts.tv_sec * 
MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; -#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - (match)->mask->key.field = value; \ - } else { \ - (match)->key->field = value; \ - } \ - } while (0) - -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ - } while (0) - -static u16 range_n_bytes(const struct sw_flow_key_range *range) -{ - return range->end - range->start; + return cur_ms - idle_ms; } -void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, - struct sw_flow_mask *mask) -{ - memset(match, 0, sizeof(*match)); - match->key = key; - match->mask = mask; - - memset(key, 0, sizeof(*key)); - - if (mask) { - memset(&mask->key, 0, sizeof(mask->key)); - mask->range.start = mask->range.end = 0; - } -} +#define TCP_FLAGS_OFFSET 13 +#define TCP_FLAG_MASK 0x3f -static bool ovs_match_validate(const struct sw_flow_match *match, - u64 key_attrs, u64 mask_attrs) +void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) { - u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; - u64 mask_allowed = key_attrs; /* At most allow all key attributes */ - - /* The following mask attributes allowed only if they - * pass the validation tests. */ - mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) - | (1 << OVS_KEY_ATTR_IPV6) - | (1 << OVS_KEY_ATTR_TCP) - | (1 << OVS_KEY_ATTR_UDP) - | (1 << OVS_KEY_ATTR_SCTP) - | (1 << OVS_KEY_ATTR_ICMP) - | (1 << OVS_KEY_ATTR_ICMPV6) - | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); - - /* Always allowed mask fields. 
*/ - mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) - | (1 << OVS_KEY_ATTR_IN_PORT) - | (1 << OVS_KEY_ATTR_ETHERTYPE)); - - /* Check key attributes. */ - if (match->key->eth.type == htons(ETH_P_ARP) - || match->key->eth.type == htons(ETH_P_RARP)) { - key_expected |= 1 << OVS_KEY_ATTR_ARP; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_ARP; - } - - if (match->key->eth.type == htons(ETH_P_IP)) { - key_expected |= 1 << OVS_KEY_ATTR_IPV4; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; - - if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { - if (match->key->ip.proto == IPPROTO_UDP) { - key_expected |= 1 << OVS_KEY_ATTR_UDP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_UDP; - } - - if (match->key->ip.proto == IPPROTO_SCTP) { - key_expected |= 1 << OVS_KEY_ATTR_SCTP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; - } - - if (match->key->ip.proto == IPPROTO_TCP) { - key_expected |= 1 << OVS_KEY_ATTR_TCP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_TCP; - } - - if (match->key->ip.proto == IPPROTO_ICMP) { - key_expected |= 1 << OVS_KEY_ATTR_ICMP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; - } - } - } - - if (match->key->eth.type == htons(ETH_P_IPV6)) { - key_expected |= 1 << OVS_KEY_ATTR_IPV6; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; - - if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { - if (match->key->ip.proto == IPPROTO_UDP) { - key_expected |= 1 << OVS_KEY_ATTR_UDP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_UDP; - } - - if (match->key->ip.proto == IPPROTO_SCTP) { - key_expected |= 1 << OVS_KEY_ATTR_SCTP; - if (match->mask && 
(match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; - } - - if (match->key->ip.proto == IPPROTO_TCP) { - key_expected |= 1 << OVS_KEY_ATTR_TCP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_TCP; - } - - if (match->key->ip.proto == IPPROTO_ICMPV6) { - key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; - - if (match->key->ipv6.tp.src == - htons(NDISC_NEIGHBOUR_SOLICITATION) || - match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - key_expected |= 1 << OVS_KEY_ATTR_ND; - if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) - mask_allowed |= 1 << OVS_KEY_ATTR_ND; - } - } - } - } - - if ((key_attrs & key_expected) != key_expected) { - /* Key attributes check failed. */ - OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", - key_attrs, key_expected); - return false; - } + u8 tcp_flags = 0; - if ((mask_attrs & mask_allowed) != mask_attrs) { - /* Mask attributes check failed. 
*/ - OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", - mask_attrs, mask_allowed); - return false; + if ((flow->key.eth.type == htons(ETH_P_IP) || + flow->key.eth.type == htons(ETH_P_IPV6)) && + flow->key.ip.proto == IPPROTO_TCP && + likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { + u8 *tcp = (u8 *)tcp_hdr(skb); + tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; } - return true; + spin_lock(&flow->lock); + flow->used = jiffies; + flow->packet_count++; + flow->byte_count += skb->len; + flow->tcp_flags |= tcp_flags; + spin_unlock(&flow->lock); } static int check_header(struct sk_buff *skb, int len) @@ -311,19 +149,6 @@ static bool icmphdr_ok(struct sk_buff *skb) sizeof(struct icmphdr)); } -u64 ovs_flow_used_time(unsigned long flow_jiffies) -{ - struct timespec cur_ts; - u64 cur_ms, idle_ms; - - ktime_get_ts(&cur_ts); - idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); - cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + - cur_ts.tv_nsec / NSEC_PER_MSEC; - - return cur_ms - idle_ms; -} - static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned int nh_ofs = skb_network_offset(skb); @@ -372,311 +197,6 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask) -{ - const long *m = (long *)((u8 *)&mask->key + mask->range.start); - const long *s = (long *)((u8 *)src + mask->range.start); - long *d = (long *)((u8 *)dst + mask->range.start); - int i; - - /* The memory outside of the 'mask->range' are not set since - * further operations on 'dst' only uses contents within - * 'mask->range'. 
- */ - for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) - *d++ = *s++ & *m++; -} - -#define TCP_FLAGS_OFFSET 13 -#define TCP_FLAG_MASK 0x3f - -void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) -{ - u8 tcp_flags = 0; - - if ((flow->key.eth.type == htons(ETH_P_IP) || - flow->key.eth.type == htons(ETH_P_IPV6)) && - flow->key.ip.proto == IPPROTO_TCP && - likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { - u8 *tcp = (u8 *)tcp_hdr(skb); - tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; - } - - spin_lock(&flow->lock); - flow->used = jiffies; - flow->packet_count++; - flow->byte_count += skb->len; - flow->tcp_flags |= tcp_flags; - spin_unlock(&flow->lock); -} - -struct sw_flow_actions *ovs_flow_actions_alloc(int size) -{ - struct sw_flow_actions *sfa; - - if (size > MAX_ACTIONS_BUFSIZE) - return ERR_PTR(-EINVAL); - - sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); - if (!sfa) - return ERR_PTR(-ENOMEM); - - sfa->actions_len = 0; - return sfa; -} - -struct sw_flow *ovs_flow_alloc(void) -{ - struct sw_flow *flow; - - flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); - if (!flow) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&flow->lock); - flow->sf_acts = NULL; - flow->mask = NULL; - - return flow; -} - -static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) -{ - hash = jhash_1word(hash, table->hash_seed); - return flex_array_get(table->buckets, - (hash & (table->n_buckets - 1))); -} - -static struct flex_array *alloc_buckets(unsigned int n_buckets) -{ - struct flex_array *buckets; - int i, err; - - buckets = flex_array_alloc(sizeof(struct hlist_head), - n_buckets, GFP_KERNEL); - if (!buckets) - return NULL; - - err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); - if (err) { - flex_array_free(buckets); - return NULL; - } - - for (i = 0; i < n_buckets; i++) - INIT_HLIST_HEAD((struct hlist_head *) - flex_array_get(buckets, i)); - - return buckets; -} - -static void free_buckets(struct 
flex_array *buckets) -{ - flex_array_free(buckets); -} - -static struct flow_table *__flow_tbl_alloc(int new_size) -{ - struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); - - if (!table) - return NULL; - - table->buckets = alloc_buckets(new_size); - - if (!table->buckets) { - kfree(table); - return NULL; - } - table->n_buckets = new_size; - table->count = 0; - table->node_ver = 0; - table->keep_flows = false; - get_random_bytes(&table->hash_seed, sizeof(u32)); - table->mask_list = NULL; - - return table; -} - -static void __flow_tbl_destroy(struct flow_table *table) -{ - int i; - - if (table->keep_flows) - goto skip_flows; - - for (i = 0; i < table->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = flex_array_get(table->buckets, i); - struct hlist_node *n; - int ver = table->node_ver; - - hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del(&flow->hash_node[ver]); - ovs_flow_free(flow, false); - } - } - - BUG_ON(!list_empty(table->mask_list)); - kfree(table->mask_list); - -skip_flows: - free_buckets(table->buckets); - kfree(table); -} - -struct flow_table *ovs_flow_tbl_alloc(int new_size) -{ - struct flow_table *table = __flow_tbl_alloc(new_size); - - if (!table) - return NULL; - - table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); - if (!table->mask_list) { - table->keep_flows = true; - __flow_tbl_destroy(table); - return NULL; - } - INIT_LIST_HEAD(table->mask_list); - - return table; -} - -static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) -{ - struct flow_table *table = container_of(rcu, struct flow_table, rcu); - - __flow_tbl_destroy(table); -} - -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) -{ - if (!table) - return; - - if (deferred) - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); - else - __flow_tbl_destroy(table); -} - -struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last) -{ - struct sw_flow *flow; - struct hlist_head *head; - int 
ver; - int i; - - ver = table->node_ver; - while (*bucket < table->n_buckets) { - i = 0; - head = flex_array_get(table->buckets, *bucket); - hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { - if (i < *last) { - i++; - continue; - } - *last = i + 1; - return flow; - } - (*bucket)++; - *last = 0; - } - - return NULL; -} - -static void __tbl_insert(struct flow_table *table, struct sw_flow *flow) -{ - struct hlist_head *head; - - head = find_bucket(table, flow->hash); - hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); - - table->count++; -} - -static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new) -{ - int old_ver; - int i; - - old_ver = old->node_ver; - new->node_ver = !old_ver; - - /* Insert in new table. */ - for (i = 0; i < old->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head; - - head = flex_array_get(old->buckets, i); - - hlist_for_each_entry(flow, head, hash_node[old_ver]) - __tbl_insert(new, flow); - } - - new->mask_list = old->mask_list; - old->keep_flows = true; -} - -static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets) -{ - struct flow_table *new_table; - - new_table = __flow_tbl_alloc(n_buckets); - if (!new_table) - return ERR_PTR(-ENOMEM); - - flow_table_copy_flows(table, new_table); - - return new_table; -} - -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets); -} - -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets * 2); -} - -static void __flow_free(struct sw_flow *flow) -{ - kfree((struct sf_flow_acts __force *)flow->sf_acts); - kmem_cache_free(flow_cache, flow); -} - -static void rcu_free_flow_callback(struct rcu_head *rcu) -{ - struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - - __flow_free(flow); -} - -void ovs_flow_free(struct sw_flow *flow, bool deferred) -{ - if (!flow) - return; - - 
ovs_sw_flow_mask_del_ref(flow->mask, deferred); - - if (deferred) - call_rcu(&flow->rcu, rcu_free_flow_callback); - else - __flow_free(flow); -} - -/* Schedules 'sf_acts' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts) -{ - kfree_rcu(sf_acts, rcu); -} - static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) { struct qtag_prefix { @@ -1002,1080 +522,3 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) return 0; } - -static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, - int key_end) -{ - u32 *hash_key = (u32 *)((u8 *)key + key_start); - int hash_u32s = (key_end - key_start) >> 2; - - /* Make sure number of hash bytes are multiple of u32. */ - BUILD_BUG_ON(sizeof(long) % sizeof(u32)); - - return jhash2(hash_key, hash_u32s, 0); -} - -static int flow_key_start(const struct sw_flow_key *key) -{ - if (key->tun_key.ipv4_dst) - return 0; - else - return rounddown(offsetof(struct sw_flow_key, phy), - sizeof(long)); -} - -static bool __cmp_key(const struct sw_flow_key *key1, - const struct sw_flow_key *key2, int key_start, int key_end) -{ - const long *cp1 = (long *)((u8 *)key1 + key_start); - const long *cp2 = (long *)((u8 *)key2 + key_start); - long diffs = 0; - int i; - - for (i = key_start; i < key_end; i += sizeof(long)) - diffs |= *cp1++ ^ *cp2++; - - return diffs == 0; -} - -static bool __flow_cmp_masked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_start, int key_end) -{ - return __cmp_key(&flow->key, key, key_start, key_end); -} - -static bool __flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_start, int key_end) -{ - return __cmp_key(&flow->unmasked_key, key, key_start, key_end); -} - -bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_end) -{ - int 
key_start; - key_start = flow_key_start(key); - - return __flow_cmp_unmasked_key(flow, key, key_start, key_end); - -} - -struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, - struct sw_flow_match *match) -{ - struct sw_flow_key *unmasked = match->key; - int key_end = match->range.end; - struct sw_flow *flow; - - flow = ovs_flow_lookup(table, unmasked); - if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_end))) - flow = NULL; - - return flow; -} - -static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table, - const struct sw_flow_key *unmasked, - struct sw_flow_mask *mask) -{ - struct sw_flow *flow; - struct hlist_head *head; - int key_start = mask->range.start; - int key_end = mask->range.end; - u32 hash; - struct sw_flow_key masked_key; - - ovs_flow_key_mask(&masked_key, unmasked, mask); - hash = ovs_flow_hash(&masked_key, key_start, key_end); - head = find_bucket(table, hash); - hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { - if (flow->mask == mask && - __flow_cmp_masked_key(flow, &masked_key, - key_start, key_end)) - return flow; - } - return NULL; -} - -struct sw_flow *ovs_flow_lookup(struct flow_table *tbl, - const struct sw_flow_key *key) -{ - struct sw_flow *flow = NULL; - struct sw_flow_mask *mask; - - list_for_each_entry_rcu(mask, tbl->mask_list, list) { - flow = ovs_masked_flow_lookup(tbl, key, mask); - if (flow) /* Found */ - break; - } - - return flow; -} - - -void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow) -{ - flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start, - flow->mask->range.end); - __tbl_insert(table, flow); -} - -void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow) -{ - BUG_ON(table->count == 0); - hlist_del_rcu(&flow->hash_node[table->node_ver]); - table->count--; -} - -/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. 
*/ -const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { - [OVS_KEY_ATTR_ENCAP] = -1, - [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), - [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), - [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), - [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), - [OVS_KEY_ATTR_VLAN] = sizeof(__be16), - [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), - [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), - [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), - [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), - [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), - [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), - [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), - [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), - [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), - [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), - [OVS_KEY_ATTR_TUNNEL] = -1, -}; - -static bool is_all_zero(const u8 *fp, size_t size) -{ - int i; - - if (!fp) - return false; - - for (i = 0; i < size; i++) - if (fp[i]) - return false; - - return true; -} - -static int __parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], - u64 *attrsp, bool nz) -{ - const struct nlattr *nla; - u32 attrs; - int rem; - - attrs = *attrsp; - nla_for_each_nested(nla, attr, rem) { - u16 type = nla_type(nla); - int expected_len; - - if (type > OVS_KEY_ATTR_MAX) { - OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", - type, OVS_KEY_ATTR_MAX); - return -EINVAL; - } - - if (attrs & (1 << type)) { - OVS_NLERR("Duplicate key attribute (type %d).\n", type); - return -EINVAL; - } - - expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) { - OVS_NLERR("Key attribute has unexpected length (type=%d" - ", length=%d, expected=%d).\n", type, - nla_len(nla), expected_len); - return -EINVAL; - } - - if (!nz || !is_all_zero(nla_data(nla), expected_len)) { - attrs |= 1 << type; - a[type] = nla; - } - } - if (rem) { - OVS_NLERR("Message has %d unknown bytes.\n", rem); - return 
-EINVAL; - } - - *attrsp = attrs; - return 0; -} - -static int parse_flow_mask_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) -{ - return __parse_flow_nlattrs(attr, a, attrsp, true); -} - -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) -{ - return __parse_flow_nlattrs(attr, a, attrsp, false); -} - -int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask) -{ - struct nlattr *a; - int rem; - bool ttl = false; - __be16 tun_flags = 0; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { - [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), - [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_TOS] = 1, - [OVS_TUNNEL_KEY_ATTR_TTL] = 1, - [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, - [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, - }; - - if (type > OVS_TUNNEL_KEY_ATTR_MAX) { - OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", - type, OVS_TUNNEL_KEY_ATTR_MAX); - return -EINVAL; - } - - if (ovs_tunnel_key_lens[type] != nla_len(a)) { - OVS_NLERR("IPv4 tunnel attribute type has unexpected " - " length (type=%d, length=%d, expected=%d).\n", - type, nla_len(a), ovs_tunnel_key_lens[type]); - return -EINVAL; - } - - switch (type) { - case OVS_TUNNEL_KEY_ATTR_ID: - SW_FLOW_KEY_PUT(match, tun_key.tun_id, - nla_get_be64(a), is_mask); - tun_flags |= TUNNEL_KEY; - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, - nla_get_be32(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, - nla_get_be32(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, - nla_get_u8(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, - nla_get_u8(a), is_mask); - ttl = true; - 
break; - case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_flags |= TUNNEL_DONT_FRAGMENT; - break; - case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_flags |= TUNNEL_CSUM; - break; - default: - return -EINVAL; - } - } - - SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); - - if (rem > 0) { - OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); - return -EINVAL; - } - - if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { - OVS_NLERR("IPv4 tunnel destination address is zero.\n"); - return -EINVAL; - } - - if (!ttl) { - OVS_NLERR("IPv4 tunnel TTL not specified.\n"); - return -EINVAL; - } - } - - return 0; -} - -int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) -{ - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - - if (output->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) - return -EMSGSIZE; - if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) - return -EMSGSIZE; - if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) - return -EMSGSIZE; - if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) - return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) - return -EMSGSIZE; - - nla_nest_end(skb, nla); - return 0; -} - -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask) -{ - if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { - SW_FLOW_KEY_PUT(match, phy.priority, - nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); - *attrs &= ~(1 << 
OVS_KEY_ATTR_PRIORITY); - } - - if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - - if (is_mask) - in_port = 0xffffffff; /* Always exact match in_port. */ - else if (in_port >= DP_MAX_PORTS) - return -EINVAL; - - SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); - *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); - } - - if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { - uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - - SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); - *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); - } - if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask)) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); - } - return 0; -} - -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask) -{ - int err; - u64 orig_attrs = attrs; - - err = metadata_from_nlattrs(match, &attrs, a, is_mask); - if (err) - return err; - - if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { - const struct ovs_key_ethernet *eth_key; - - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - SW_FLOW_KEY_MEMCPY(match, eth.src, - eth_key->eth_src, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, eth.dst, - eth_key->eth_dst, ETH_ALEN, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); - } - - if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - __be16 tci; - - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - if (is_mask) - OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); - else - OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); - - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_VLAN); - } else if (!is_mask) - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); - - if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { 
- __be16 eth_type; - - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (is_mask) { - /* Always exact match EtherType. */ - eth_type = htons(0xffff); - } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { - OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", - ntohs(eth_type), ETH_P_802_3_MIN); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); - } - - if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { - const struct ovs_key_ipv4 *ipv4_key; - - ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", - ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); - return -EINVAL; - } - SW_FLOW_KEY_PUT(match, ip.proto, - ipv4_key->ipv4_proto, is_mask); - SW_FLOW_KEY_PUT(match, ip.tos, - ipv4_key->ipv4_tos, is_mask); - SW_FLOW_KEY_PUT(match, ip.ttl, - ipv4_key->ipv4_ttl, is_mask); - SW_FLOW_KEY_PUT(match, ip.frag, - ipv4_key->ipv4_frag, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.src, - ipv4_key->ipv4_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.dst, - ipv4_key->ipv4_dst, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - } - - if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; - - ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", - ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); - return -EINVAL; - } - SW_FLOW_KEY_PUT(match, ipv6.label, - ipv6_key->ipv6_label, is_mask); - SW_FLOW_KEY_PUT(match, ip.proto, - ipv6_key->ipv6_proto, is_mask); - SW_FLOW_KEY_PUT(match, ip.tos, - ipv6_key->ipv6_tclass, is_mask); - SW_FLOW_KEY_PUT(match, ip.ttl, - ipv6_key->ipv6_hlimit, is_mask); - SW_FLOW_KEY_PUT(match, ip.frag, - ipv6_key->ipv6_frag, is_mask); - SW_FLOW_KEY_MEMCPY(match, 
ipv6.addr.src, - ipv6_key->ipv6_src, - sizeof(match->key->ipv6.addr.src), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, - ipv6_key->ipv6_dst, - sizeof(match->key->ipv6.addr.dst), - is_mask); - - attrs &= ~(1 << OVS_KEY_ATTR_IPV6); - } - - if (attrs & (1 << OVS_KEY_ATTR_ARP)) { - const struct ovs_key_arp *arp_key; - - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - if (!is_mask && (arp_key->arp_op & htons(0xff00))) { - OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", - arp_key->arp_op); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, ipv4.addr.src, - arp_key->arp_sip, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.dst, - arp_key->arp_tip, is_mask); - SW_FLOW_KEY_PUT(match, ip.proto, - ntohs(arp_key->arp_op), is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, - arp_key->arp_sha, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, - arp_key->arp_tha, ETH_ALEN, is_mask); - - attrs &= ~(1 << OVS_KEY_ATTR_ARP); - } - - if (attrs & (1 << OVS_KEY_ATTR_TCP)) { - const struct ovs_key_tcp *tcp_key; - - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - tcp_key->tcp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - tcp_key->tcp_dst, is_mask); - } - attrs &= ~(1 << OVS_KEY_ATTR_TCP); - } - - if (attrs & (1 << OVS_KEY_ATTR_UDP)) { - const struct ovs_key_udp *udp_key; - - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - udp_key->udp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - udp_key->udp_dst, is_mask); - } - attrs &= ~(1 << OVS_KEY_ATTR_UDP); - } - - if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { - const struct 
ovs_key_sctp *sctp_key; - - sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - sctp_key->sctp_dst, is_mask); - } else { - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - sctp_key->sctp_dst, is_mask); - } - attrs &= ~(1 << OVS_KEY_ATTR_SCTP); - } - - if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { - const struct ovs_key_icmp *icmp_key; - - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - SW_FLOW_KEY_PUT(match, ipv4.tp.src, - htons(icmp_key->icmp_type), is_mask); - SW_FLOW_KEY_PUT(match, ipv4.tp.dst, - htons(icmp_key->icmp_code), is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ICMP); - } - - if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { - const struct ovs_key_icmpv6 *icmpv6_key; - - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - SW_FLOW_KEY_PUT(match, ipv6.tp.src, - htons(icmpv6_key->icmpv6_type), is_mask); - SW_FLOW_KEY_PUT(match, ipv6.tp.dst, - htons(icmpv6_key->icmpv6_code), is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); - } - - if (attrs & (1 << OVS_KEY_ATTR_ND)) { - const struct ovs_key_nd *nd_key; - - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, - nd_key->nd_target, - sizeof(match->key->ipv6.nd.target), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, - nd_key->nd_sll, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, - nd_key->nd_tll, ETH_ALEN, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_ND); - } - - if (attrs != 0) - return -EINVAL; - - return 0; -} - -/** - * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and - * mask. In case the 'mask' is NULL, the flow is treated as exact match - * flow. Otherwise, it is treated as a wildcarded flow, except the mask - * does not include any don't care bit. - * @match: receives the extracted flow match information. 
- * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. The fields should of the packet that triggered the creation - * of this flow. - * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink - * attribute specifies the mask field of the wildcarded flow. - */ -int ovs_match_from_nlattrs(struct sw_flow_match *match, - const struct nlattr *key, - const struct nlattr *mask) -{ - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct nlattr *encap; - u64 key_attrs = 0; - u64 mask_attrs = 0; - bool encap_valid = false; - int err; - - err = parse_flow_nlattrs(key, a, &key_attrs); - if (err) - return err; - - if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { - __be16 tci; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR("Invalid Vlan frame.\n"); - return -EINVAL; - } - - key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - encap = a[OVS_KEY_ATTR_ENCAP]; - key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - encap_valid = true; - - if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. 
*/ - if (nla_len(encap)) { - OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); - return -EINVAL; - } - } else { - OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); - return -EINVAL; - } - } - - err = ovs_key_from_nlattrs(match, key_attrs, a, false); - if (err) - return err; - - if (mask) { - err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); - if (err) - return err; - - if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) { - __be16 eth_type = 0; - __be16 tci = 0; - - if (!encap_valid) { - OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); - return -EINVAL; - } - - mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - if (a[OVS_KEY_ATTR_ETHERTYPE]) - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (eth_type == htons(0xffff)) { - mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); - } else { - OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", - ntohs(eth_type)); - return -EINVAL; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); - return -EINVAL; - } - } - - err = ovs_key_from_nlattrs(match, mask_attrs, a, true); - if (err) - return err; - } else { - /* Populate exact match flow's key mask. */ - if (match->mask) - ovs_sw_flow_mask_set(match->mask, &match->range, 0xff); - } - - if (!ovs_match_validate(match, key_attrs, mask_attrs)) - return -EINVAL; - - return 0; -} - -/** - * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. - * @flow: Receives extracted in_port, priority, tun_key and skb_mark. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. 
- * - * This parses a series of Netlink attributes that form a flow key, which must - * take the same form accepted by flow_from_nlattrs(), but only enough of it to - * get the metadata, that is, the parts of the flow key that cannot be - * extracted from the packet itself. - */ - -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, - const struct nlattr *attr) -{ - struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - u64 attrs = 0; - int err; - struct sw_flow_match match; - - flow->key.phy.in_port = DP_MAX_PORTS; - flow->key.phy.priority = 0; - flow->key.phy.skb_mark = 0; - memset(tun_key, 0, sizeof(flow->key.tun_key)); - - err = parse_flow_nlattrs(attr, a, &attrs); - if (err) - return -EINVAL; - - memset(&match, 0, sizeof(match)); - match.key = &flow->key; - - err = metadata_from_nlattrs(&match, &attrs, a, false); - if (err) - return err; - - return 0; -} - -int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, struct sk_buff *skb) -{ - struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; - bool is_mask = (swkey != output); - - if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) - goto nla_put_failure; - - if ((swkey->tun_key.ipv4_dst || is_mask) && - ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) - goto nla_put_failure; - - if (swkey->phy.in_port == DP_MAX_PORTS) { - if (is_mask && (output->phy.in_port == 0xffff)) - if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) - goto nla_put_failure; - } else { - u16 upper_u16; - upper_u16 = !is_mask ? 
0 : 0xffff; - - if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, - (upper_u16 << 16) | output->phy.in_port)) - goto nla_put_failure; - } - - if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) - goto nla_put_failure; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); - if (!nla) - goto nla_put_failure; - - eth_key = nla_data(nla); - memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); - - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - __be16 eth_type; - eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) - goto nla_put_failure; - encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) - goto unencap; - } else - encap = NULL; - - if (swkey->eth.type == htons(ETH_P_802_2)) { - /* - * Ethertype 802.2 is represented in the netlink with omitted - * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and - * 0xffff in the mask attribute. Ethertype can also - * be wildcarded. 
- */ - if (is_mask && output->eth.type) - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, - output->eth.type)) - goto nla_put_failure; - goto unencap; - } - - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) - goto nla_put_failure; - - if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ipv4 *ipv4_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); - if (!nla) - goto nla_put_failure; - ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = output->ipv4.addr.src; - ipv4_key->ipv4_dst = output->ipv4.addr.dst; - ipv4_key->ipv4_proto = output->ip.proto; - ipv4_key->ipv4_tos = output->ip.tos; - ipv4_key->ipv4_ttl = output->ip.ttl; - ipv4_key->ipv4_frag = output->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ipv6 *ipv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); - if (!nla) - goto nla_put_failure; - ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, - sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, - sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = output->ipv6.label; - ipv6_key->ipv6_proto = output->ip.proto; - ipv6_key->ipv6_tclass = output->ip.tos; - ipv6_key->ipv6_hlimit = output->ip.ttl; - ipv6_key->ipv6_frag = output->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { - struct ovs_key_arp *arp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); - if (!nla) - goto nla_put_failure; - arp_key = nla_data(nla); - memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = output->ipv4.addr.src; - arp_key->arp_tip = output->ipv4.addr.dst; - arp_key->arp_op = htons(output->ip.proto); - memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); - } - - if ((swkey->eth.type == htons(ETH_P_IP) || - swkey->eth.type == htons(ETH_P_IPV6)) && - swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - 
- if (swkey->ip.proto == IPPROTO_TCP) { - struct ovs_key_tcp *tcp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); - if (!nla) - goto nla_put_failure; - tcp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = output->ipv4.tp.src; - tcp_key->tcp_dst = output->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = output->ipv6.tp.src; - tcp_key->tcp_dst = output->ipv6.tp.dst; - } - } else if (swkey->ip.proto == IPPROTO_UDP) { - struct ovs_key_udp *udp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); - if (!nla) - goto nla_put_failure; - udp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = output->ipv4.tp.src; - udp_key->udp_dst = output->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = output->ipv6.tp.src; - udp_key->udp_dst = output->ipv6.tp.dst; - } - } else if (swkey->ip.proto == IPPROTO_SCTP) { - struct ovs_key_sctp *sctp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); - if (!nla) - goto nla_put_failure; - sctp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - sctp_key->sctp_src = swkey->ipv4.tp.src; - sctp_key->sctp_dst = swkey->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - sctp_key->sctp_src = swkey->ipv6.tp.src; - sctp_key->sctp_dst = swkey->ipv6.tp.dst; - } - } else if (swkey->eth.type == htons(ETH_P_IP) && - swkey->ip.proto == IPPROTO_ICMP) { - struct ovs_key_icmp *icmp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); - if (!nla) - goto nla_put_failure; - icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(output->ipv4.tp.src); - icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); - } else if (swkey->eth.type == htons(ETH_P_IPV6) && - swkey->ip.proto == IPPROTO_ICMPV6) { - struct ovs_key_icmpv6 *icmpv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, - sizeof(*icmpv6_key)); - if (!nla) - goto 
nla_put_failure; - icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); - - if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { - struct ovs_key_nd *nd_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); - if (!nla) - goto nla_put_failure; - nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &output->ipv6.nd.target, - sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); - } - } - } - -unencap: - if (encap) - nla_nest_end(skb, encap); - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -/* Initializes the flow module. - * Returns zero if successful or a negative error code. */ -int ovs_flow_init(void) -{ - BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); - BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); - - flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, - 0, NULL); - if (flow_cache == NULL) - return -ENOMEM; - - return 0; -} - -/* Uninitializes the flow module. 
*/ -void ovs_flow_exit(void) -{ - kmem_cache_destroy(flow_cache); -} - -struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) -{ - struct sw_flow_mask *mask; - - mask = kmalloc(sizeof(*mask), GFP_KERNEL); - if (mask) - mask->ref_count = 0; - - return mask; -} - -void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) -{ - mask->ref_count++; -} - -void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) -{ - if (!mask) - return; - - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - if (deferred) - kfree_rcu(mask, rcu); - else - kfree(mask); - } -} - -static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a, - const struct sw_flow_mask *b) -{ - u8 *a_ = (u8 *)&a->key + a->range.start; - u8 *b_ = (u8 *)&b->key + b->range.start; - - return (a->range.end == b->range.end) - && (a->range.start == b->range.start) - && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); -} - -struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, - const struct sw_flow_mask *mask) -{ - struct list_head *ml; - - list_for_each(ml, tbl->mask_list) { - struct sw_flow_mask *m; - m = container_of(ml, struct sw_flow_mask, list); - if (ovs_sw_flow_mask_equal(mask, m)) - return m; - } - - return NULL; -} - -/** - * add a new mask into the mask list. - * The caller needs to make sure that 'mask' is not the same - * as any masks that are already on the list. - */ -void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - list_add_rcu(&mask->list, tbl->mask_list); -} - -/** - * Set 'range' fields in the mask to the value of 'val'. 
- */ -static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, - struct sw_flow_key_range *range, u8 val) -{ - u8 *m = (u8 *)&mask->key + range->start; - - mask->range = *range; - memset(m, val, range_n_bytes(range)); -} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 212fbf7510c4..098fd1db6a23 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -33,14 +33,6 @@ #include struct sk_buff; -struct sw_flow_mask; -struct flow_table; - -struct sw_flow_actions { - struct rcu_head rcu; - u32 actions_len; - struct nlattr actions[]; -}; /* Used to memset ovs_key_ipv4_tunnel padding. */ #define OVS_TUNNEL_KEY_SIZE \ @@ -127,6 +119,31 @@ struct sw_flow_key { }; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ +struct sw_flow_key_range { + size_t start; + size_t end; +}; + +struct sw_flow_mask { + int ref_count; + struct rcu_head rcu; + struct list_head list; + struct sw_flow_key_range range; + struct sw_flow_key key; +}; + +struct sw_flow_match { + struct sw_flow_key *key; + struct sw_flow_key_range range; + struct sw_flow_mask *mask; +}; + +struct sw_flow_actions { + struct rcu_head rcu; + u32 actions_len; + struct nlattr actions[]; +}; + struct sw_flow { struct rcu_head rcu; struct hlist_node hash_node[2]; @@ -144,20 +161,6 @@ struct sw_flow { u8 tcp_flags; /* Union of seen TCP flags. 
*/ }; -struct sw_flow_key_range { - size_t start; - size_t end; -}; - -struct sw_flow_match { - struct sw_flow_key *key; - struct sw_flow_key_range range; - struct sw_flow_mask *mask; -}; - -void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, struct sw_flow_mask *mask); - struct arp_eth_header { __be16 ar_hrd; /* format of hardware address */ __be16 ar_pro; /* format of protocol address */ @@ -172,88 +175,9 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; -int ovs_flow_init(void); -void ovs_flow_exit(void); - -struct sw_flow *ovs_flow_alloc(void); -void ovs_flow_deferred_free(struct sw_flow *); -void ovs_flow_free(struct sw_flow *, bool deferred); - -struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len); -void ovs_flow_deferred_free_acts(struct sw_flow_actions *); - -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); void ovs_flow_used(struct sw_flow *, struct sk_buff *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_to_nlattrs(const struct sw_flow_key *, - const struct sw_flow_key *, struct sk_buff *); -int ovs_match_from_nlattrs(struct sw_flow_match *match, - const struct nlattr *, - const struct nlattr *); -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, - const struct nlattr *attr); -#define MAX_ACTIONS_BUFSIZE (32 * 1024) -#define TBL_MIN_BUCKETS 1024 - -struct flow_table { - struct flex_array *buckets; - unsigned int count, n_buckets; - struct rcu_head rcu; - struct list_head *mask_list; - int node_ver; - u32 hash_seed; - bool keep_flows; -}; - -static inline int ovs_flow_tbl_count(struct flow_table *table) -{ - return table->count; -} - -static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) -{ - return (table->count > table->n_buckets); -} - -struct sw_flow *ovs_flow_lookup(struct flow_table *, - const struct sw_flow_key *); -struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, - struct 
sw_flow_match *match); - -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); -struct flow_table *ovs_flow_tbl_alloc(int new_size); -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); - -void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow); -void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow); - -struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx); -extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; -int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask); -int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output); - -bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, int key_end); - -struct sw_flow_mask { - int ref_count; - struct rcu_head rcu; - struct list_head list; - struct sw_flow_key_range range; - struct sw_flow_key key; -}; +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); -struct sw_flow_mask *ovs_sw_flow_mask_alloc(void); -void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *); -void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred); -void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *); -struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *, - const struct sw_flow_mask *); -void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c new file mode 100644 index 000000000000..e04649c56a96 --- /dev/null +++ b/net/openvswitch/flow_netlink.c @@ -0,0 +1,1603 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "flow_netlink.h" + +static void update_range__(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range = NULL; + size_t start = rounddown(offset, sizeof(long)); + size_t end = roundup(offset + size, sizeof(long)); + + if (!is_mask) + range = &match->range; + else if (match->mask) + range = &match->mask->range; + + if (!range) + return; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + (match)->mask->key.field = value; \ + } else { \ + (match)->key->field = value; \ + } \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + do { \ + update_range__(match, 
offsetof(struct sw_flow_key, field), \ + len, is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + memcpy(&(match)->mask->key.field, value_p, len);\ + } else { \ + memcpy(&(match)->key->field, value_p, len); \ + } \ + } while (0) + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +static bool match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. 
*/ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << 
OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->ipv6.tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", + key_attrs, key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", + mask_attrs, mask_allowed); + return false; + } + + return true; +} + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. 
*/ +static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = -1, + [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), + [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), + [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), + [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), + [OVS_KEY_ATTR_VLAN] = sizeof(__be16), + [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), + [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), + [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), + [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), + [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), + [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), + [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), + [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), + [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_TUNNEL] = -1, +}; + +static bool is_all_zero(const u8 *fp, size_t size) +{ + int i; + + if (!fp) + return false; + + for (i = 0; i < size; i++) + if (fp[i]) + return false; + + return true; +} + +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool nz) +{ + const struct nlattr *nla; + u64 attrs; + int rem; + + attrs = *attrsp; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + type, OVS_KEY_ATTR_MAX); + return -EINVAL; + } + + if (attrs & (1 << type)) { + OVS_NLERR("Duplicate key attribute (type %d).\n", type); + return -EINVAL; + } + + expected_len = ovs_key_lens[type]; + if (nla_len(nla) != expected_len && expected_len != -1) { + OVS_NLERR("Key attribute has unexpected length (type=%d" + ", length=%d, expected=%d).\n", type, + nla_len(nla), expected_len); + return -EINVAL; + } + + if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + attrs |= 1 << type; + a[type] = nla; + } + } + if (rem) { + OVS_NLERR("Message has %d unknown bytes.\n", rem); + 
return -EINVAL; + } + + *attrsp = attrs; + return 0; +} + +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, false); +} + +static int ipv4_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask) +{ + struct nlattr *a; + int rem; + bool ttl = false; + __be16 tun_flags = 0; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_TOS] = 1, + [OVS_TUNNEL_KEY_ATTR_TTL] = 1, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, + [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + }; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", + type, OVS_TUNNEL_KEY_ATTR_MAX); + return -EINVAL; + } + + if (ovs_tunnel_key_lens[type] != nla_len(a)) { + OVS_NLERR("IPv4 tunnel attribute type has unexpected " + " length (type=%d, length=%d, expected=%d).\n", + type, nla_len(a), ovs_tunnel_key_lens[type]); + return -EINVAL; + } + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + nla_get_be32(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + nla_get_be32(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + nla_get_u8(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + nla_get_u8(a), is_mask); + ttl = 
true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + tun_flags |= TUNNEL_DONT_FRAGMENT; + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + tun_flags |= TUNNEL_CSUM; + break; + default: + return -EINVAL; + } + } + + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + + if (rem > 0) { + OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); + return -EINVAL; + } + + if (!is_mask) { + if (!match->key->tun_key.ipv4_dst) { + OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + return -EINVAL; + } + } + + return 0; +} + +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output) +{ + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) + return -EMSGSIZE; + if (output->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + return -EMSGSIZE; + if (output->ipv4_dst && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + return -EMSGSIZE; + if (output->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + + +static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, + const struct nlattr **a, bool is_mask) +{ + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << 
OVS_KEY_ATTR_PRIORITY); + } + + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + + if (is_mask) + in_port = 0xffffffff; /* Always exact match in_port. */ + else if (in_port >= DP_MAX_PORTS) + return -EINVAL; + + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); + } + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask)) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + return 0; +} + +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask) +{ + int err; + u64 orig_attrs = attrs; + + err = metadata_from_nlattrs(match, &attrs, a, is_mask); + if (err) + return err; + + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } + + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { + __be16 tci; + + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + else + OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); + + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } else if (!is_mask) + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + 
__be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", + ntohs(eth_type), ETH_P_802_3_MIN); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { + const struct ovs_key_ipv4 *ipv4_key; + + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, 
ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { + const struct ovs_key_arp *arp_key; + + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + arp_key->arp_op); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + tcp_key->tcp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + tcp_key->tcp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + udp_key->udp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + udp_key->udp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct 
ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + sctp_key->sctp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + sctp_key->sctp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) + return -EINVAL; + + return 0; +} + +static void sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val) +{ + u8 *m = (u8 *)&mask->key + range->start; + + mask->range = *range; + memset(m, val, range_n_bytes(range)); +} + +/** + * ovs_nla_get_match - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. 
Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. + */ +int ovs_nla_get_match(struct sw_flow_match *match, + const struct nlattr *key, + const struct nlattr *mask) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(key, a, &key_attrs); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR("Invalid Vlan frame.\n"); + return -EINVAL; + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. 
*/ + if (nla_len(encap)) { + OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + return -EINVAL; + } + } else { + OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, key_attrs, a, false); + if (err) + return err; + + if (mask) { + err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + if (err) + return err; + + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + return -EINVAL; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + } else { + OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", + ntohs(eth_type)); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + if (err) + return err; + } else { + /* Populate exact match flow's key mask. */ + if (match->mask) + sw_flow_mask_set(match->mask, &match->range, 0xff); + } + + if (!match_validate(match, key_attrs, mask_attrs)) + return -EINVAL; + + return 0; +} + +/** + * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. + * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. 
+ * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + */ + +int ovs_nla_get_flow_metadata(struct sw_flow *flow, + const struct nlattr *attr) +{ + struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; + int err; + struct sw_flow_match match; + + flow->key.phy.in_port = DP_MAX_PORTS; + flow->key.phy.priority = 0; + flow->key.phy.skb_mark = 0; + memset(tun_key, 0, sizeof(flow->key.tun_key)); + + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) + return -EINVAL; + + memset(&match, 0, sizeof(match)); + match.key = &flow->key; + + err = metadata_from_nlattrs(&match, &attrs, a, false); + if (err) + return err; + + return 0; +} + +int ovs_nla_put_flow(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla, *encap; + bool is_mask = (swkey != output); + + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) + goto nla_put_failure; + + if ((swkey->tun_key.ipv4_dst || is_mask) && + ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) + goto nla_put_failure; + + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 
0 : 0xffff; + + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) + goto nla_put_failure; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + + eth_key = nla_data(nla); + memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); + memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); + + if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + goto nla_put_failure; + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.tci) + goto unencap; + } else + encap = NULL; + + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. 
+ */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; + goto unencap; + } + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) + goto nla_put_failure; + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); + memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + 
+ if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + tcp_key->tcp_src = output->ipv4.tp.src; + tcp_key->tcp_dst = output->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + tcp_key->tcp_src = output->ipv6.tp.src; + tcp_key->tcp_dst = output->ipv6.tp.dst; + } + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + udp_key->udp_src = output->ipv4.tp.src; + udp_key->udp_dst = output->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + udp_key->udp_src = output->ipv6.tp.src; + udp_key->udp_dst = output->ipv6.tp.dst; + } + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); /* ports are emitted from 'output' (as for TCP/UDP) so a mask dump reports the mask, not the key */ + if (swkey->eth.type == htons(ETH_P_IP)) { + sctp_key->sctp_src = output->ipv4.tp.src; + sctp_key->sctp_dst = output->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + sctp_key->sctp_src = output->ipv6.tp.src; + sctp_key->sctp_dst = output->ipv6.tp.dst; + } + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(output->ipv4.tp.src); + icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto 
nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); + icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); + + if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &output->ipv6.nd.target, + sizeof(nd_key->nd_target)); + memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); + memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); + } + } + } + +unencap: + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +#define MAX_ACTIONS_BUFSIZE (32 * 1024) + +struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) +{ + struct sw_flow_actions *sfa; + + if (size > MAX_ACTIONS_BUFSIZE) + return ERR_PTR(-EINVAL); + + sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = 0; + return sfa; +} + +/* RCU callback used by ovs_nla_free_flow_actions. */ +static void rcu_free_acts_callback(struct rcu_head *rcu) +{ + struct sw_flow_actions *sf_acts = container_of(rcu, + struct sw_flow_actions, rcu); + kfree(sf_acts); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. 
*/ +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + call_rcu(&sf_acts->rcu, rcu_free_acts_callback); +} + +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, + int attr_len) +{ + + struct sw_flow_actions *acts; + int new_acts_size; + int req_size = NLA_ALIGN(attr_len); + int next_offset = offsetof(struct sw_flow_actions, actions) + + (*sfa)->actions_len; + + if (req_size <= (ksize(*sfa) - next_offset)) + goto out; + + new_acts_size = ksize(*sfa) * 2; + + if (new_acts_size > MAX_ACTIONS_BUFSIZE) { + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + return ERR_PTR(-EMSGSIZE); + new_acts_size = MAX_ACTIONS_BUFSIZE; + } + + acts = ovs_nla_alloc_flow_actions(new_acts_size); + if (IS_ERR(acts)) + return (void *)acts; + + memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); + acts->actions_len = (*sfa)->actions_len; + kfree(*sfa); + *sfa = acts; + +out: + (*sfa)->actions_len += req_size; + return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +{ + struct nlattr *a; + + a = reserve_sfa_size(sfa, nla_attr_size(len)); + if (IS_ERR(a)) + return PTR_ERR(a); + + a->nla_type = attrtype; + a->nla_len = nla_attr_size(len); + + if (data) + memcpy(nla_data(a), data, len); + memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + + return 0; +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, + int attrtype) +{ + int used = (*sfa)->actions_len; + int err; + + err = add_action(sfa, attrtype, NULL, 0); + if (err) + return err; + + return used; +} + +static inline void add_nested_action_end(struct sw_flow_actions *sfa, + int st_offset) +{ + struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + + st_offset); + + a->nla_len = sfa->actions_len - st_offset; +} + +static int validate_and_copy_sample(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct 
sw_flow_actions **sfa) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem, start, err, st_acts; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy sample action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); + if (start < 0) + return start; + err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32)); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); + if (st_acts < 0) + return st_acts; + + err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; +} + +static int validate_tp_port(const struct sw_flow_key *flow_key) +{ + if (flow_key->eth.type == htons(ETH_P_IP)) { + if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) + return 0; + } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { + if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) + return 0; + } + + return -EINVAL; +} + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa) +{ + 
struct sw_flow_match match; + struct sw_flow_key key; + int err, start; + + ovs_match_init(&match, &key, NULL); + err = ipv4_tun_from_nlattr(nla_data(attr), &match, false); + if (err) + return err; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); + if (start < 0) + return start; + + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, + sizeof(match.key->tun_key)); + add_nested_action_end(*sfa, start); + + return err; +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, + bool *set_tun) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + + /* There can be only one key in a action */ + if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + if (key_type > OVS_KEY_ATTR_MAX || + (ovs_key_lens[key_type] != nla_len(ovs_key) && + ovs_key_lens[key_type] != -1)) + return -EINVAL; + + switch (key_type) { + const struct ovs_key_ipv4 *ipv4_key; + const struct ovs_key_ipv6 *ipv6_key; + int err; + + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_ETHERNET: + break; + + case OVS_KEY_ATTR_TUNNEL: + *set_tun = true; + err = validate_and_copy_set_tun(a, sfa); + if (err) + return err; + break; + + case OVS_KEY_ATTR_IPV4: + if (flow_key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_IPV6: + if (flow_key->eth.type != htons(ETH_P_IPV6)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + + case 
OVS_KEY_ATTR_TCP: + if (flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + return validate_tp_port(flow_key); + + case OVS_KEY_ATTR_UDP: + if (flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + return validate_tp_port(flow_key); + + case OVS_KEY_ATTR_SCTP: + if (flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + return validate_tp_port(flow_key); + + default: + return -EINVAL; + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, + attr, userspace_policy); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static int copy_action(const struct nlattr *from, + struct sw_flow_actions **sfa) +{ + int totlen = NLA_ALIGN(from->nla_len); + struct nlattr *to; + + to = reserve_sfa_size(sfa, from->nla_len); + if (IS_ERR(to)) + return PTR_ERR(to); + + memcpy(to, from, totlen); + return 0; +} + +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, + struct sw_flow_actions **sfa) +{ + const struct nlattr *a; + int rem, err; + + if (depth >= SAMPLE_ACTION_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. 
*/ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + bool skip_copy; + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + skip_copy = false; + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + + case OVS_ACTION_ATTR_POP_VLAN: + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + vlan = nla_data(a); + if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key, sfa, &skip_copy); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = validate_and_copy_sample(a, key, depth, sfa); + if (err) + return err; + skip_copy = true; + break; + + default: + return -EINVAL; + } + if (!skip_copy) { + err = copy_action(a, sfa); + if (err) + return err; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_sample; + + switch (type) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; 
+ case OVS_SAMPLE_ATTR_ACTIONS: + st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!st_sample) + return -EMSGSIZE; + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_sample); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_IPV4_TUNNEL: + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), + nla_data(ovs_key)); + if (err) + return err; + nla_nest_end(skb, start); + break; + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + +int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) +{ + const struct nlattr *a; + int rem, err; + + nla_for_each_attr(a, attr, len, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample_action_to_attr(a, skb); + if (err) + return err; + break; + default: + if (nla_put(skb, type, nla_len(a), nla_data(a))) + return -EMSGSIZE; + break; + } + } + + return 0; +} diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h new file mode 100644 index 000000000000..440151045d39 --- /dev/null +++ b/net/openvswitch/flow_netlink.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + + +#ifndef FLOW_NETLINK_H +#define FLOW_NETLINK_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "flow.h" + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, struct sw_flow_mask *mask); + +int ovs_nla_put_flow(const struct sw_flow_key *, + const struct sw_flow_key *, struct sk_buff *); +int ovs_nla_get_flow_metadata(struct sw_flow *flow, + const struct nlattr *attr); +int ovs_nla_get_match(struct sw_flow_match *match, + const struct nlattr *, + const struct nlattr *); + +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa); +int ovs_nla_put_actions(const struct nlattr *attr, + int len, struct sk_buff *skb); + +struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len); +void ovs_nla_free_flow_actions(struct sw_flow_actions *); + +#endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c new file mode 100644 index 000000000000..dcadb75bb173 --- /dev/null +++ b/net/openvswitch/flow_table.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *flow_cache; + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask) +{ + const long *m = (long *)((u8 *)&mask->key + mask->range.start); + const long *s = (long *)((u8 *)src + mask->range.start); + long *d = (long *)((u8 *)dst + mask->range.start); + int i; + + /* The memory outside of the 'mask->range' are not set since + * further operations on 'dst' only uses contents within + * 'mask->range'. 
+ */ + for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + *d++ = *s++ & *m++; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&flow->lock); + flow->sf_acts = NULL; + flow->mask = NULL; + + return flow; +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void flow_free(struct sw_flow *flow) +{ + kfree((struct sw_flow_actions __force *)flow->sf_acts); /* cast names the real actions type, struct sw_flow_actions */ + kmem_cache_free(flow_cache, flow); +} + +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + flow_free(flow); +} + +void ovs_flow_free(struct sw_flow *flow, bool deferred) +{ + if (!flow) + return; + + ovs_sw_flow_mask_del_ref(flow->mask, deferred); + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + flow_free(flow); +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + +static void __flow_tbl_destroy(struct flow_table *table) +{ + int i; + + if (table->keep_flows) + goto skip_flows; + + for (i = 0; i < table->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del(&flow->hash_node[ver]); + ovs_flow_free(flow, false); + } + } + + BUG_ON(!list_empty(table->mask_list)); + kfree(table->mask_list); + +skip_flows: + 
free_buckets(table->buckets); + kfree(table); +} + +static struct flow_table *__flow_tbl_alloc(int new_size) +{ + struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); + + if (!table) + return NULL; + + table->buckets = alloc_buckets(new_size); + + if (!table->buckets) { + kfree(table); + return NULL; + } + table->n_buckets = new_size; + table->count = 0; + table->node_ver = 0; + table->keep_flows = false; + get_random_bytes(&table->hash_seed, sizeof(u32)); + table->mask_list = NULL; + + return table; +} + +struct flow_table *ovs_flow_tbl_alloc(int new_size) +{ + struct flow_table *table = __flow_tbl_alloc(new_size); + + if (!table) + return NULL; + + table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!table->mask_list) { + table->keep_flows = true; + __flow_tbl_destroy(table); + return NULL; + } + INIT_LIST_HEAD(table->mask_list); + + return table; +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct flow_table *table = container_of(rcu, struct flow_table, rcu); + + __flow_tbl_destroy(table); +} + +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +{ + if (!table) + return; + + if (deferred) + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + else + __flow_tbl_destroy(table); +} + +struct sw_flow *ovs_flow_tbl_dump_next(struct flow_table *table, + u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = table->node_ver; + while (*bucket < table->n_buckets) { + i = 0; + head = flex_array_get(table->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) +{ + hash = jhash_1word(hash, table->hash_seed); + return flex_array_get(table->buckets, + (hash & (table->n_buckets - 1))); +} + +static void __tbl_insert(struct 
flow_table *table, struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + + table->count++; +} + +static void flow_table_copy_flows(struct flow_table *old, + struct flow_table *new) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + hlist_for_each_entry(flow, head, hash_node[old_ver]) + __tbl_insert(new, flow); + } + + new->mask_list = old->mask_list; + old->keep_flows = true; +} + +static struct flow_table *__flow_tbl_rehash(struct flow_table *table, + int n_buckets) +{ + struct flow_table *new_table; + + new_table = __flow_tbl_alloc(n_buckets); + if (!new_table) + return ERR_PTR(-ENOMEM); + + flow_table_copy_flows(table, new_table); + + return new_table; +} + +struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) +{ + return __flow_tbl_rehash(table, table->n_buckets); +} + +struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) +{ + return __flow_tbl_rehash(table, table->n_buckets * 2); +} + +static u32 flow_hash(const struct sw_flow_key *key, int key_start, + int key_end) +{ + u32 *hash_key = (u32 *)((u8 *)key + key_start); + int hash_u32s = (key_end - key_start) >> 2; + + /* Make sure number of hash bytes are multiple of u32. 
*/ + BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + + return jhash2(hash_key, hash_u32s, 0); +} + +static int flow_key_start(const struct sw_flow_key *key) +{ + if (key->tun_key.ipv4_dst) + return 0; + else + return rounddown(offsetof(struct sw_flow_key, phy), + sizeof(long)); +} + +static bool cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, + int key_start, int key_end) +{ + const long *cp1 = (long *)((u8 *)key1 + key_start); + const long *cp2 = (long *)((u8 *)key2 + key_start); + long diffs = 0; + int i; + + for (i = key_start; i < key_end; i += sizeof(long)) + diffs |= *cp1++ ^ *cp2++; + + return diffs == 0; +} + +static bool flow_cmp_masked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, + int key_start, int key_end) +{ + return cmp_key(&flow->key, key, key_start, key_end); +} + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + struct sw_flow_match *match) +{ + struct sw_flow_key *key = match->key; + int key_start = flow_key_start(key); + int key_end = match->range.end; + + return cmp_key(&flow->unmasked_key, key, key_start, key_end); +} + +static struct sw_flow *masked_flow_lookup(struct flow_table *table, + const struct sw_flow_key *unmasked, + struct sw_flow_mask *mask) +{ + struct sw_flow *flow; + struct hlist_head *head; + int key_start = mask->range.start; + int key_end = mask->range.end; + u32 hash; + struct sw_flow_key masked_key; + + ovs_flow_mask_key(&masked_key, unmasked, mask); + hash = flow_hash(&masked_key, key_start, key_end); + head = find_bucket(table, hash); + hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { + if (flow->mask == mask && + flow_cmp_masked_key(flow, &masked_key, + key_start, key_end)) + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + struct sw_flow *flow = NULL; + struct sw_flow_mask *mask; + + list_for_each_entry_rcu(mask, tbl->mask_list, list) { + flow = 
masked_flow_lookup(tbl, key, mask); + if (flow) /* Found */ + break; + } + + return flow; +} + +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +{ + flow->hash = flow_hash(&flow->key, flow->mask->range.start, + flow->mask->range.end); + __tbl_insert(table, flow); +} + +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + BUG_ON(table->count == 0); + hlist_del_rcu(&flow->hash_node[table->node_ver]); + table->count--; +} + +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 0; + + return mask; +} + +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) +{ + mask->ref_count++; +} + +static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu) +{ + struct sw_flow_mask *mask = container_of(rcu, struct sw_flow_mask, rcu); + + kfree(mask); +} + +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) +{ + if (!mask) + return; + + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + if (deferred) + call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb); + else + kfree(mask); + } +} + +static bool mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + u8 *a_ = (u8 *)&a->key + a->range.start; + u8 *b_ = (u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/** + * add a new mask into the mask list. + * The caller needs to make sure that 'mask' is not the same + * as any masks that are already on the list. 
+ */ +void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + list_add_rcu(&mask->list, tbl->mask_list); +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); + BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, + 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h new file mode 100644 index 000000000000..d7a114457cde --- /dev/null +++ b/net/openvswitch/flow_table.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "flow.h" + +#define TBL_MIN_BUCKETS 1024 + +struct flow_table { + struct flex_array *buckets; + unsigned int count, n_buckets; + struct rcu_head rcu; + struct list_head *mask_list; + int node_ver; + u32 hash_seed; + bool keep_flows; +}; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_free(struct sw_flow *, bool deferred); + +static inline int ovs_flow_tbl_count(struct flow_table *table) +{ + return table->count; +} + +static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) +{ + return (table->count > table->n_buckets); +} + +struct flow_table *ovs_flow_tbl_alloc(int new_size); +struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); +struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); + +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +struct sw_flow *ovs_flow_tbl_dump_next(struct flow_table *table, + u32 *bucket, u32 *idx); +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, + const struct sw_flow_key *); + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + struct sw_flow_match *match); + +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void); +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *); +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred); +void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *); +struct sw_flow_mask 
*ovs_sw_flow_mask_find(const struct flow_table *, + const struct sw_flow_mask *); +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask); + +#endif /* flow_table.h */ -- cgit v1.2.3-59-g8ed1b From b637e4988c2d689bb43f943a5af0e684a4981159 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 4 Oct 2013 00:14:23 -0700 Subject: openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch the mega-flow list. The following patch moves it into struct datapath, which avoids one extra indirection when accessing the mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 77 ++++------------ net/openvswitch/datapath.h | 6 +- net/openvswitch/flow_table.c | 205 ++++++++++++++++++++++++++----------------- net/openvswitch/flow_table.h | 32 +++---- 4 files changed, 155 insertions(+), 165 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 72e68743c643..60b9be3b9477 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -59,8 +59,6 @@ #include "vport-internal_dev.h" #include "vport-netdev.h" -#define REHASH_FLOW_INTERVAL (10 * 60 * HZ) - int ovs_net_id __read_mostly; static void ovs_notify(struct sk_buff *skb, struct genl_info *info, @@ -163,7 +161,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false); + ovs_flow_tbl_destroy(&dp->table, false); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -235,7 +233,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } /* Look up flow. 
*/ - flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key); + flow = ovs_flow_tbl_lookup(&dp->table, &key); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -453,23 +451,6 @@ out: return err; } -/* Called with ovs_mutex. */ -static int flush_flows(struct datapath *dp) -{ - struct flow_table *old_table; - struct flow_table *new_table; - - old_table = ovsl_dereference(dp->table); - new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS); - if (!new_table) - return -ENOMEM; - - rcu_assign_pointer(dp->table, new_table); - - ovs_flow_tbl_destroy(old_table, true); - return 0; -} - static void clear_stats(struct sw_flow *flow) { flow->used = 0; @@ -584,11 +565,9 @@ static struct genl_ops dp_packet_genl_ops[] = { static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) { - struct flow_table *table; int i; - table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held()); - stats->n_flows = ovs_flow_tbl_count(table); + stats->n_flows = ovs_flow_tbl_count(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; for_each_possible_cpu(i) { @@ -773,7 +752,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct flow_table *table; struct sw_flow_actions *acts = NULL; struct sw_flow_match match; int error; @@ -814,12 +792,9 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (!dp) goto err_unlock_ovs; - table = ovsl_dereference(dp->table); - /* Check if this is a duplicate flow */ - flow = ovs_flow_tbl_lookup(table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &key); if (!flow) { - struct flow_table *new_table = NULL; struct sw_flow_mask *mask_p; /* Bail out if we're not allowed to create a new flow. 
*/ @@ -827,19 +802,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) goto err_unlock_ovs; - /* Expand table, if necessary, to make room. */ - if (ovs_flow_tbl_need_to_expand(table)) - new_table = ovs_flow_tbl_expand(table); - else if (time_after(jiffies, dp->last_rehash + REHASH_FLOW_INTERVAL)) - new_table = ovs_flow_tbl_rehash(table); - - if (new_table && !IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_destroy(table, true); - table = ovsl_dereference(dp->table); - dp->last_rehash = jiffies; - } - /* Allocate flow. */ flow = ovs_flow_alloc(); if (IS_ERR(flow)) { @@ -852,7 +814,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) flow->unmasked_key = key; /* Make sure mask is unique in the system */ - mask_p = ovs_sw_flow_mask_find(table, &mask); + mask_p = ovs_sw_flow_mask_find(&dp->table, &mask); if (!mask_p) { /* Allocate a new mask if none exsits. */ mask_p = ovs_sw_flow_mask_alloc(); @@ -860,7 +822,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; mask_p->key = mask.key; mask_p->range = mask.range; - ovs_sw_flow_mask_insert(table, mask_p); + ovs_sw_flow_mask_insert(&dp->table, mask_p); } ovs_sw_flow_mask_add_ref(mask_p); @@ -868,7 +830,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. 
*/ - ovs_flow_tbl_insert(table, flow); + ovs_flow_tbl_insert(&dp->table, flow); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, OVS_FLOW_CMD_NEW); @@ -936,7 +898,6 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; - struct flow_table *table; struct sw_flow_match match; int err; @@ -957,8 +918,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &key); if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { err = -ENOENT; goto unlock; @@ -986,7 +946,6 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; - struct flow_table *table; struct sw_flow_match match; int err; @@ -998,7 +957,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } if (!a[OVS_FLOW_ATTR_KEY]) { - err = flush_flows(dp); + err = ovs_flow_tbl_flush(&dp->table); goto unlock; } @@ -1007,8 +966,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock; - table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &key); if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { err = -ENOENT; goto unlock; @@ -1020,7 +978,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - ovs_flow_tbl_remove(table, flow); + ovs_flow_tbl_remove(&dp->table, flow); err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL); @@ -1039,8 +997,8 @@ unlock: static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct table_instance *ti; struct datapath *dp; - 
struct flow_table *table; rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -1049,14 +1007,14 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return -ENODEV; } - table = rcu_dereference(dp->table); + ti = rcu_dereference(dp->table.ti); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_tbl_dump_next(table, &bucket, &obj); + flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); if (!flow) break; @@ -1220,9 +1178,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); /* Allocate table. */ - err = -ENOMEM; - rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS)); - if (!dp->table) + err = ovs_flow_tbl_init(&dp->table); + if (err) goto err_free_dp; dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); @@ -1279,7 +1236,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false); + ovs_flow_tbl_destroy(&dp->table, false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a6982ef84f20..acfd4af8ca3a 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -58,12 +58,11 @@ struct dp_stats_percpu { * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. - * @table: Current flow table. Protected by ovs_mutex and RCU. + * @table: flow table. * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. - * @last_rehash: Timestamp of last rehash. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. 
@@ -73,7 +72,7 @@ struct datapath { struct list_head list_node; /* Flow table. */ - struct flow_table __rcu *table; + struct flow_table table; /* Switch ports. */ struct hlist_head *ports; @@ -85,7 +84,6 @@ struct datapath { /* Network namespace ref. */ struct net *net; #endif - unsigned long last_rehash; }; /** diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index dcadb75bb173..1c7e7732ed4c 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -44,6 +44,11 @@ #include #include +#include "datapath.h" + +#define TBL_MIN_BUCKETS 1024 +#define REHASH_INTERVAL (10 * 60 * HZ) + static struct kmem_cache *flow_cache; static u16 range_n_bytes(const struct sw_flow_key_range *range) @@ -82,6 +87,11 @@ struct sw_flow *ovs_flow_alloc(void) return flow; } +int ovs_flow_tbl_count(struct flow_table *table) +{ + return table->count; +} + static struct flex_array *alloc_buckets(unsigned int n_buckets) { struct flex_array *buckets; @@ -136,18 +146,18 @@ static void free_buckets(struct flex_array *buckets) flex_array_free(buckets); } -static void __flow_tbl_destroy(struct flow_table *table) +static void __table_instance_destroy(struct table_instance *ti) { int i; - if (table->keep_flows) + if (ti->keep_flows) goto skip_flows; - for (i = 0; i < table->n_buckets; i++) { + for (i = 0; i < ti->n_buckets; i++) { struct sw_flow *flow; - struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_head *head = flex_array_get(ti->buckets, i); struct hlist_node *n; - int ver = table->node_ver; + int ver = ti->node_ver; hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { hlist_del(&flow->hash_node[ver]); @@ -155,74 +165,74 @@ static void __flow_tbl_destroy(struct flow_table *table) } } - BUG_ON(!list_empty(table->mask_list)); - kfree(table->mask_list); - skip_flows: - free_buckets(table->buckets); - kfree(table); + free_buckets(ti->buckets); + kfree(ti); } -static struct flow_table *__flow_tbl_alloc(int new_size) +static 
struct table_instance *table_instance_alloc(int new_size) { - struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); + struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); - if (!table) + if (!ti) return NULL; - table->buckets = alloc_buckets(new_size); + ti->buckets = alloc_buckets(new_size); - if (!table->buckets) { - kfree(table); + if (!ti->buckets) { + kfree(ti); return NULL; } - table->n_buckets = new_size; - table->count = 0; - table->node_ver = 0; - table->keep_flows = false; - get_random_bytes(&table->hash_seed, sizeof(u32)); - table->mask_list = NULL; + ti->n_buckets = new_size; + ti->node_ver = 0; + ti->keep_flows = false; + get_random_bytes(&ti->hash_seed, sizeof(u32)); - return table; + return ti; } -struct flow_table *ovs_flow_tbl_alloc(int new_size) +int ovs_flow_tbl_init(struct flow_table *table) { - struct flow_table *table = __flow_tbl_alloc(new_size); + struct table_instance *ti; - if (!table) - return NULL; + ti = table_instance_alloc(TBL_MIN_BUCKETS); - table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); - if (!table->mask_list) { - table->keep_flows = true; - __flow_tbl_destroy(table); - return NULL; - } - INIT_LIST_HEAD(table->mask_list); + if (!ti) + return -ENOMEM; - return table; + rcu_assign_pointer(table->ti, ti); + INIT_LIST_HEAD(&table->mask_list); + table->last_rehash = jiffies; + table->count = 0; + return 0; } static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { - struct flow_table *table = container_of(rcu, struct flow_table, rcu); + struct table_instance *ti = container_of(rcu, struct table_instance, rcu); - __flow_tbl_destroy(table); + __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +static void table_instance_destroy(struct table_instance *ti, bool deferred) { - if (!table) + if (!ti) return; if (deferred) - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); else - __flow_tbl_destroy(table); + 
__table_instance_destroy(ti); +} + +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +{ + struct table_instance *ti = ovsl_dereference(table->ti); + + table_instance_destroy(ti, deferred); } -struct sw_flow *ovs_flow_tbl_dump_next(struct flow_table *table, +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, u32 *bucket, u32 *last) { struct sw_flow *flow; @@ -230,10 +240,10 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct flow_table *table, int ver; int i; - ver = table->node_ver; - while (*bucket < table->n_buckets) { + ver = ti->node_ver; + while (*bucket < ti->n_buckets) { i = 0; - head = flex_array_get(table->buckets, *bucket); + head = flex_array_get(ti->buckets, *bucket); hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { if (i < *last) { i++; @@ -249,25 +259,23 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct flow_table *table, return NULL; } -static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) +static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) { - hash = jhash_1word(hash, table->hash_seed); - return flex_array_get(table->buckets, - (hash & (table->n_buckets - 1))); + hash = jhash_1word(hash, ti->hash_seed); + return flex_array_get(ti->buckets, + (hash & (ti->n_buckets - 1))); } -static void __tbl_insert(struct flow_table *table, struct sw_flow *flow) +static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow) { struct hlist_head *head; - head = find_bucket(table, flow->hash); - hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); - - table->count++; + head = find_bucket(ti, flow->hash); + hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head); } -static void flow_table_copy_flows(struct flow_table *old, - struct flow_table *new) +static void flow_table_copy_flows(struct table_instance *old, + struct table_instance *new) { int old_ver; int i; @@ -283,35 +291,42 @@ static void flow_table_copy_flows(struct flow_table *old, head = 
flex_array_get(old->buckets, i); hlist_for_each_entry(flow, head, hash_node[old_ver]) - __tbl_insert(new, flow); + table_instance_insert(new, flow); } - new->mask_list = old->mask_list; old->keep_flows = true; } -static struct flow_table *__flow_tbl_rehash(struct flow_table *table, +static struct table_instance *table_instance_rehash(struct table_instance *ti, int n_buckets) { - struct flow_table *new_table; + struct table_instance *new_ti; - new_table = __flow_tbl_alloc(n_buckets); - if (!new_table) + new_ti = table_instance_alloc(n_buckets); + if (!new_ti) return ERR_PTR(-ENOMEM); - flow_table_copy_flows(table, new_table); + flow_table_copy_flows(ti, new_ti); - return new_table; + return new_ti; } -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) +int ovs_flow_tbl_flush(struct flow_table *flow_table) { - return __flow_tbl_rehash(table, table->n_buckets); -} + struct table_instance *old_ti; + struct table_instance *new_ti; -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets * 2); + old_ti = ovsl_dereference(flow_table->ti); + new_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ti) + return -ENOMEM; + + rcu_assign_pointer(flow_table->ti, new_ti); + flow_table->last_rehash = jiffies; + flow_table->count = 0; + + table_instance_destroy(old_ti, true); + return 0; } static u32 flow_hash(const struct sw_flow_key *key, int key_start, @@ -367,7 +382,7 @@ bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, return cmp_key(&flow->unmasked_key, key, key_start, key_end); } -static struct sw_flow *masked_flow_lookup(struct flow_table *table, +static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, struct sw_flow_mask *mask) { @@ -380,8 +395,8 @@ static struct sw_flow *masked_flow_lookup(struct flow_table *table, ovs_flow_mask_key(&masked_key, unmasked, mask); hash = flow_hash(&masked_key, key_start, key_end); - head = 
find_bucket(table, hash); - hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) { if (flow->mask == mask && flow_cmp_masked_key(flow, &masked_key, key_start, key_end)) @@ -393,29 +408,55 @@ static struct sw_flow *masked_flow_lookup(struct flow_table *table, struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { - struct sw_flow *flow = NULL; + struct table_instance *ti = rcu_dereference(tbl->ti); struct sw_flow_mask *mask; + struct sw_flow *flow; - list_for_each_entry_rcu(mask, tbl->mask_list, list) { - flow = masked_flow_lookup(tbl, key, mask); + list_for_each_entry_rcu(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, key, mask); if (flow) /* Found */ - break; + return flow; } + return NULL; +} - return flow; +static struct table_instance *table_instance_expand(struct table_instance *ti) +{ + return table_instance_rehash(ti, ti->n_buckets * 2); } void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) { + struct table_instance *ti = NULL; + struct table_instance *new_ti = NULL; + + ti = ovsl_dereference(table->ti); + + /* Expand table, if necessary, to make room. 
*/ + if (table->count > ti->n_buckets) + new_ti = table_instance_expand(ti); + else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) + new_ti = table_instance_rehash(ti, ti->n_buckets); + + if (new_ti && !IS_ERR(new_ti)) { + rcu_assign_pointer(table->ti, new_ti); + ovs_flow_tbl_destroy(table, true); + ti = ovsl_dereference(table->ti); + table->last_rehash = jiffies; + } + flow->hash = flow_hash(&flow->key, flow->mask->range.start, flow->mask->range.end); - __tbl_insert(table, flow); + table_instance_insert(ti, flow); + table->count++; } void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { + struct table_instance *ti = ovsl_dereference(table->ti); + BUG_ON(table->count == 0); - hlist_del_rcu(&flow->hash_node[table->node_ver]); + hlist_del_rcu(&flow->hash_node[ti->node_ver]); table->count--; } @@ -475,7 +516,7 @@ struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, { struct list_head *ml; - list_for_each(ml, tbl->mask_list) { + list_for_each(ml, &tbl->mask_list) { struct sw_flow_mask *m; m = container_of(ml, struct sw_flow_mask, list); if (mask_equal(mask, m)) @@ -492,7 +533,7 @@ struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, */ void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) { - list_add_rcu(&mask->list, tbl->mask_list); + list_add_rcu(&mask->list, &tbl->mask_list); } /* Initializes the flow module. 
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index d7a114457cde..5d1abe566c46 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -36,42 +36,36 @@ #include "flow.h" -#define TBL_MIN_BUCKETS 1024 - -struct flow_table { +struct table_instance { struct flex_array *buckets; - unsigned int count, n_buckets; + unsigned int n_buckets; struct rcu_head rcu; - struct list_head *mask_list; int node_ver; u32 hash_seed; bool keep_flows; }; +struct flow_table { + struct table_instance __rcu *ti; + struct list_head mask_list; + unsigned long last_rehash; + unsigned int count; +}; + int ovs_flow_init(void); void ovs_flow_exit(void); struct sw_flow *ovs_flow_alloc(void); void ovs_flow_free(struct sw_flow *, bool deferred); -static inline int ovs_flow_tbl_count(struct flow_table *table) -{ - return table->count; -} - -static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) -{ - return (table->count > table->n_buckets); -} - -struct flow_table *ovs_flow_tbl_alloc(int new_size); -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); +int ovs_flow_tbl_init(struct flow_table *); +int ovs_flow_tbl_count(struct flow_table *table); void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +int ovs_flow_tbl_flush(struct flow_table *flow_table); void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); -struct sw_flow *ovs_flow_tbl_dump_next(struct flow_table *table, +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); -- cgit v1.2.3-59-g8ed1b From 618ed0c805b64c820279f50732110ab873221c3b Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 4 Oct 2013 00:17:42 -0700 Subject: openvswitch: Simplify mega-flow APIs. 
Hides mega-flow implementation in flow_table.c rather than datapath.c. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 27 +++------ net/openvswitch/flow_table.c | 138 +++++++++++++++++++++++++------------------ net/openvswitch/flow_table.h | 12 +--- 3 files changed, 89 insertions(+), 88 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 60b9be3b9477..cf270973095d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -161,7 +161,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy(&dp->table, false); + ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -795,8 +795,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* Check if this is a duplicate flow */ flow = ovs_flow_tbl_lookup(&dp->table, &key); if (!flow) { - struct sw_flow_mask *mask_p; - /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) @@ -812,25 +810,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) flow->key = masked_key; flow->unmasked_key = key; - - /* Make sure mask is unique in the system */ - mask_p = ovs_sw_flow_mask_find(&dp->table, &mask); - if (!mask_p) { - /* Allocate a new mask if none exsits. */ - mask_p = ovs_sw_flow_mask_alloc(); - if (!mask_p) - goto err_flow_free; - mask_p->key = mask.key; - mask_p->range = mask.range; - ovs_sw_flow_mask_insert(&dp->table, mask_p); - } - - ovs_sw_flow_mask_add_ref(mask_p); - flow->mask = mask_p; rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. 
*/ - ovs_flow_tbl_insert(&dp->table, flow); + error = ovs_flow_tbl_insert(&dp->table, flow, &mask); + if (error) { + acts = NULL; + goto err_flow_free; + } reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, OVS_FLOW_CMD_NEW); @@ -1236,7 +1223,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(&dp->table, false); + ovs_flow_tbl_destroy(&dp->table); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 1c7e7732ed4c..036e019f8c3c 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -128,12 +128,36 @@ static void rcu_free_flow_callback(struct rcu_head *rcu) flow_free(flow); } +static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu) +{ + struct sw_flow_mask *mask = container_of(rcu, struct sw_flow_mask, rcu); + + kfree(mask); +} + +static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) +{ + if (!mask) + return; + + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + if (deferred) + call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb); + else + kfree(mask); + } +} + void ovs_flow_free(struct sw_flow *flow, bool deferred) { if (!flow) return; - ovs_sw_flow_mask_del_ref(flow->mask, deferred); + flow_mask_del_ref(flow->mask, deferred); if (deferred) call_rcu(&flow->rcu, rcu_free_flow_callback); @@ -225,11 +249,11 @@ static void table_instance_destroy(struct table_instance *ti, bool deferred) __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = ovsl_dereference(table->ti); - table_instance_destroy(ti, deferred); + table_instance_destroy(ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -304,7 +328,7 @@ static struct table_instance 
*table_instance_rehash(struct table_instance *ti, new_ti = table_instance_alloc(n_buckets); if (!new_ti) - return ERR_PTR(-ENOMEM); + return NULL; flow_table_copy_flows(ti, new_ti); @@ -425,32 +449,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti) return table_instance_rehash(ti, ti->n_buckets * 2); } -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) -{ - struct table_instance *ti = NULL; - struct table_instance *new_ti = NULL; - - ti = ovsl_dereference(table->ti); - - /* Expand table, if necessary, to make room. */ - if (table->count > ti->n_buckets) - new_ti = table_instance_expand(ti); - else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) - new_ti = table_instance_rehash(ti, ti->n_buckets); - - if (new_ti && !IS_ERR(new_ti)) { - rcu_assign_pointer(table->ti, new_ti); - ovs_flow_tbl_destroy(table, true); - ti = ovsl_dereference(table->ti); - table->last_rehash = jiffies; - } - - flow->hash = flow_hash(&flow->key, flow->mask->range.start, - flow->mask->range.end); - table_instance_insert(ti, flow); - table->count++; -} - void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti = ovsl_dereference(table->ti); @@ -460,7 +458,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) table->count--; } -struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) +static struct sw_flow_mask *mask_alloc(void) { struct sw_flow_mask *mask; @@ -471,35 +469,11 @@ struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) return mask; } -void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) +static void mask_add_ref(struct sw_flow_mask *mask) { mask->ref_count++; } -static void rcu_free_sw_flow_mask_cb(struct rcu_head *rcu) -{ - struct sw_flow_mask *mask = container_of(rcu, struct sw_flow_mask, rcu); - - kfree(mask); -} - -void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) -{ - if (!mask) - return; - - BUG_ON(!mask->ref_count); - 
mask->ref_count--; - - if (!mask->ref_count) { - list_del_rcu(&mask->list); - if (deferred) - call_rcu(&mask->rcu, rcu_free_sw_flow_mask_cb); - else - kfree(mask); - } -} - static bool mask_equal(const struct sw_flow_mask *a, const struct sw_flow_mask *b) { @@ -511,7 +485,7 @@ static bool mask_equal(const struct sw_flow_mask *a, && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); } -struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, +static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { struct list_head *ml; @@ -531,9 +505,55 @@ struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, * The caller needs to make sure that 'mask' is not the same * as any masks that are already on the list. */ -void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) +static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, + struct sw_flow_mask *new) +{ + struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); + if (!mask) { + /* Allocate a new mask if none exsits. */ + mask = mask_alloc(); + if (!mask) + return -ENOMEM; + mask->key = new->key; + mask->range = new->range; + list_add_rcu(&mask->list, &tbl->mask_list); + } + + mask_add_ref(mask); + flow->mask = mask; + return 0; +} + +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + struct sw_flow_mask *mask) { - list_add_rcu(&mask->list, &tbl->mask_list); + struct table_instance *new_ti = NULL; + struct table_instance *ti; + int err; + + err = flow_mask_insert(table, flow, mask); + if (err) + return err; + + flow->hash = flow_hash(&flow->key, flow->mask->range.start, + flow->mask->range.end); + ti = ovsl_dereference(table->ti); + table_instance_insert(ti, flow); + table->count++; + + /* Expand table, if necessary, to make room. 
*/ + if (table->count > ti->n_buckets) + new_ti = table_instance_expand(ti); + else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) + new_ti = table_instance_rehash(ti, ti->n_buckets); + + if (new_ti) { + rcu_assign_pointer(table->ti, new_ti); + table_instance_destroy(ti, true); + table->last_rehash = jiffies; + } + return 0; } /* Initializes the flow module. diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 5d1abe566c46..4db5f78b6f81 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -60,10 +60,11 @@ void ovs_flow_free(struct sw_flow *, bool deferred); int ovs_flow_tbl_init(struct flow_table *); int ovs_flow_tbl_count(struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +void ovs_flow_tbl_destroy(struct flow_table *table); int ovs_flow_tbl_flush(struct flow_table *flow_table); -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow); +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); @@ -73,13 +74,6 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, struct sw_flow_match *match); -struct sw_flow_mask *ovs_sw_flow_mask_alloc(void); -void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *); -void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred); -void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *); -struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *, - const struct sw_flow_mask *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, const struct sw_flow_mask *mask); - #endif /* flow_table.h */ -- cgit v1.2.3-59-g8ed1b From 1bd7116f1cb833c998cddb6b188df463342069d8 Mon Sep 17 
00:00:00 2001 From: Andy Zhou Date: Tue, 22 Oct 2013 10:42:46 -0700 Subject: openvswitch: collect mega flow mask stats Collect mega flow mask stats. ovs-dpctl show command can be used to display them for debugging and performance tuning. Signed-off-by: Andy Zhou Signed-off-by: Jesse Gross --- include/uapi/linux/openvswitch.h | 17 ++++++++++++++--- net/openvswitch/datapath.c | 38 +++++++++++++++++++++++++++++++------- net/openvswitch/datapath.h | 4 ++++ net/openvswitch/flow_table.c | 16 +++++++++++++++- net/openvswitch/flow_table.h | 4 +++- 5 files changed, 67 insertions(+), 12 deletions(-) (limited to 'net/openvswitch') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a74d375b439b..2cc4644f68ef 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -63,15 +63,18 @@ enum ovs_datapath_cmd { * not be sent. * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. + * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the + * datapath. Always present in notifications. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_DP_* commands. */ enum ovs_datapath_attr { OVS_DP_ATTR_UNSPEC, - OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */ - OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */ - OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */ + OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */ + OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */ + OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */ + OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ __OVS_DP_ATTR_MAX }; @@ -84,6 +87,14 @@ struct ovs_dp_stats { __u64 n_flows; /* Number of flows present */ }; +struct ovs_dp_megaflow_stats { + __u64 n_mask_hit; /* Number of masks used for flow lookups. */ + __u32 n_masks; /* Number of masks for the datapath. */ + __u32 pad0; /* Pad for future expension. 
*/ + __u64 pad1; /* Pad for future expension. */ + __u64 pad2; /* Pad for future expension. */ +}; + struct ovs_vport_stats { __u64 rx_packets; /* total packets received */ __u64 tx_packets; /* total packets transmitted */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index cf270973095d..5bc5a4e64758 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -221,6 +221,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) struct dp_stats_percpu *stats; struct sw_flow_key key; u64 *stats_counter; + u32 n_mask_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); @@ -233,7 +234,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } /* Look up flow. */ - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -258,6 +259,7 @@ out: /* Update datapath statistics. */ u64_stats_update_begin(&stats->sync); (*stats_counter)++; + stats->n_mask_hit += n_mask_hit; u64_stats_update_end(&stats->sync); } @@ -563,13 +565,18 @@ static struct genl_ops dp_packet_genl_ops[] = { } }; -static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) +static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, + struct ovs_dp_megaflow_stats *mega_stats) { int i; + memset(mega_stats, 0, sizeof(*mega_stats)); + stats->n_flows = ovs_flow_tbl_count(&dp->table); + mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; + for_each_possible_cpu(i) { const struct dp_stats_percpu *percpu_stats; struct dp_stats_percpu local_stats; @@ -585,6 +592,7 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; + mega_stats->n_mask_hit += local_stats.n_mask_hit; } } @@ -743,6 +751,14 @@ static struct 
sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, return skb; } +static struct sw_flow *__ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + u32 __always_unused n_mask_hit; + + return ovs_flow_tbl_lookup(tbl, key, &n_mask_hit); +} + static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -793,7 +809,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; /* Check if this is a duplicate flow */ - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = __ovs_flow_tbl_lookup(&dp->table, &key); if (!flow) { /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; @@ -905,7 +921,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = __ovs_flow_tbl_lookup(&dp->table, &key); if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { err = -ENOENT; goto unlock; @@ -953,7 +969,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock; - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = __ovs_flow_tbl_lookup(&dp->table, &key); if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { err = -ENOENT; goto unlock; @@ -1067,6 +1083,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); return msgsize; } @@ -1076,6 +1093,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, { struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; + struct ovs_dp_megaflow_stats dp_megaflow_stats; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, @@ -1091,8 +1109,14 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (err) goto nla_put_failure; - get_dp_stats(dp, 
&dp_stats); - if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats)) + get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); + if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats)) + goto nla_put_failure; + + if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats)) goto nla_put_failure; return genlmsg_end(skb, ovs_header); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index acfd4af8ca3a..d3d14a58aa91 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -46,11 +46,15 @@ * @n_lost: Number of received packets that had no matching flow in the flow * table that could not be sent to userspace (normally due to an overflow in * one of the datapath's queues). + * @n_mask_hit: Number of masks looked up for flow match. + * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked + * up per packet. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; + u64 n_mask_hit; struct u64_stats_sync sync; }; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 036e019f8c3c..536b4d2a42e2 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -430,13 +430,16 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, - const struct sw_flow_key *key) + const struct sw_flow_key *key, + u32 *n_mask_hit) { struct table_instance *ti = rcu_dereference(tbl->ti); struct sw_flow_mask *mask; struct sw_flow *flow; + *n_mask_hit = 0; list_for_each_entry_rcu(mask, &tbl->mask_list, list) { + (*n_mask_hit)++; flow = masked_flow_lookup(ti, key, mask); if (flow) /* Found */ return flow; @@ -444,6 +447,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, return NULL; } +int ovs_flow_tbl_num_masks(const struct flow_table *table) +{ + struct sw_flow_mask *mask; + int num = 0; + + list_for_each_entry(mask, 
&table->mask_list, list) + num++; + + return num; +} + static struct table_instance *table_instance_expand(struct table_instance *ti) { return table_instance_rehash(ti, ti->n_buckets * 2); diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 4db5f78b6f81..fbe45d5ad07d 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -66,10 +66,12 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table); int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, - const struct sw_flow_key *); + const struct sw_flow_key *, + u32 *n_mask_hit); bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, struct sw_flow_match *match); -- cgit v1.2.3-59-g8ed1b From 3cdb35b074142c915a463c535839886ae08fdfd4 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 25 Oct 2013 15:12:33 -0700 Subject: openvswitch: Enable all GSO features on internal port. OVS already can handle all types of segmentation offloads that are supported by the kernel. Following patch specifically enables UDP and IPV6 segmentation offloads. 
Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/vport-internal_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 98d3edbbc235..729c68763fe7 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -134,7 +134,7 @@ static void do_setup(struct net_device *netdev) netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO; + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; netdev->vlan_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; -- cgit v1.2.3-59-g8ed1b From df23e9f642830f10c505c8a3d57772ad1238c701 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Wed, 23 Oct 2013 01:40:44 -0700 Subject: openvswitch: Widen TCP flags handling. Widen TCP flags handling from 7 bits (uint8_t) to 12 bits (uint16_t). The kernel interface remains at 8 bits, which makes no functional difference now, as none of the higher bits is currently of interest to the userspace. 
Signed-off-by: Jarno Rajahalme Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 2 +- net/openvswitch/flow.c | 8 +++----- net/openvswitch/flow.h | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 5bc5a4e64758..1408adc2a2a7 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -671,7 +671,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, used = flow->used; stats.n_packets = flow->packet_count; stats.n_bytes = flow->byte_count; - tcp_flags = flow->tcp_flags; + tcp_flags = (u8)ntohs(flow->tcp_flags); spin_unlock_bh(&flow->lock); if (used && diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 617810f1a21e..b73c7680a3d2 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -58,19 +58,17 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) return cur_ms - idle_ms; } -#define TCP_FLAGS_OFFSET 13 -#define TCP_FLAG_MASK 0x3f +#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) { - u8 tcp_flags = 0; + __be16 tcp_flags = 0; if ((flow->key.eth.type == htons(ETH_P_IP) || flow->key.eth.type == htons(ETH_P_IPV6)) && flow->key.ip.proto == IPPROTO_TCP && likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { - u8 *tcp = (u8 *)tcp_hdr(skb); - tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; + tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb)); } spin_lock(&flow->lock); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 098fd1db6a23..204e0ccd116d 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -158,7 +158,7 @@ struct sw_flow { unsigned long used; /* Last used time (in jiffies). */ u64 packet_count; /* Number of packets matched. */ u64 byte_count; /* Number of bytes matched. */ - u8 tcp_flags; /* Union of seen TCP flags. 
*/ + __be16 tcp_flags; /* Union of seen TCP flags. */ }; struct arp_eth_header { -- cgit v1.2.3-59-g8ed1b From 5eb26b156e29eadcc21f73fb5d14497f0db24b86 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Wed, 23 Oct 2013 01:44:59 -0700 Subject: openvswitch: TCP flags matching support. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tcp_flags=flags/mask Bitwise match on TCP flags. The flags and mask are 16-bit numbers written in decimal or in hexadecimal prefixed by 0x. Each 1-bit in mask requires that the corresponding bit in flags must match. Each 0-bit in mask causes the corresponding bit to be ignored. TCP protocol currently defines 9 flag bits, and additional 3 bits are reserved (must be transmitted as zero), see RFCs 793, 3168, and 3540. The flag bits are, numbering from the least significant bit: 0: FIN No more data from sender. 1: SYN Synchronize sequence numbers. 2: RST Reset the connection. 3: PSH Push function. 4: ACK Acknowledgement field significant. 5: URG Urgent pointer field significant. 6: ECE ECN Echo. 7: CWR Congestion Window Reduced. 8: NS Nonce Sum. 9-11: Reserved. 12-15: Not matchable, must be zero. Signed-off-by: Jarno Rajahalme Signed-off-by: Jesse Gross --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/flow.c | 2 ++ net/openvswitch/flow.h | 2 ++ net/openvswitch/flow_netlink.c | 31 +++++++++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 2 deletions(-) (limited to 'net/openvswitch') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 2cc4644f68ef..d120f9fe0017 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -271,6 +271,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ + OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags.
*/ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b73c7680a3d2..b409f5279601 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -428,6 +428,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) struct tcphdr *tcp = tcp_hdr(skb); key->ipv4.tp.src = tcp->source; key->ipv4.tp.dst = tcp->dest; + key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp); } } else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { @@ -496,6 +497,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) struct tcphdr *tcp = tcp_hdr(skb); key->ipv6.tp.src = tcp->source; key->ipv6.tp.dst = tcp->dest; + key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp); } } else if (key->ip.proto == NEXTHDR_UDP) { if (udphdr_ok(skb)) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 204e0ccd116d..1510f51dbf74 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -93,6 +93,7 @@ struct sw_flow_key { struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ } tp; struct { u8 sha[ETH_ALEN]; /* ARP source hardware address. */ @@ -109,6 +110,7 @@ struct sw_flow_key { struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ } tp; struct { struct in6_addr target; /* ND target address. 
*/ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e04649c56a96..2bc1bc1aca3b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -114,6 +114,7 @@ static bool match_validate(const struct sw_flow_match *match, mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) | (1 << OVS_KEY_ATTR_IPV6) | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_TCP_FLAGS) | (1 << OVS_KEY_ATTR_UDP) | (1 << OVS_KEY_ATTR_SCTP) | (1 << OVS_KEY_ATTR_ICMP) @@ -154,8 +155,11 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->ip.proto == IPPROTO_TCP) { key_expected |= 1 << OVS_KEY_ATTR_TCP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } } if (match->key->ip.proto == IPPROTO_ICMP) { @@ -186,8 +190,11 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->ip.proto == IPPROTO_TCP) { key_expected |= 1 << OVS_KEY_ATTR_TCP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } } if (match->key->ip.proto == IPPROTO_ICMPV6) { @@ -235,6 +242,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), + [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16), [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), @@ -634,6 +642,19 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_TCP); } + if (attrs & (1 << 
OVS_KEY_ATTR_TCP_FLAGS)) { + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); + } + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { const struct ovs_key_udp *udp_key; @@ -1004,9 +1025,15 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (swkey->eth.type == htons(ETH_P_IP)) { tcp_key->tcp_src = output->ipv4.tp.src; tcp_key->tcp_dst = output->ipv4.tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->ipv4.tp.flags)) + goto nla_put_failure; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { tcp_key->tcp_src = output->ipv6.tp.src; tcp_key->tcp_dst = output->ipv6.tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->ipv6.tp.flags)) + goto nla_put_failure; } } else if (swkey->ip.proto == IPPROTO_UDP) { struct ovs_key_udp *udp_key; -- cgit v1.2.3-59-g8ed1b From 8ddd094675cfd453fc9838caa46ea108a4107183 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 29 Oct 2013 23:10:58 -0700 Subject: openvswitch: Use flow hash during flow lookup operation. Flow->hash can be used to detect hash collisions and avoid flow key compare in flow lookup. 
Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/flow_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 536b4d2a42e2..e42542706087 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -421,7 +421,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, hash = flow_hash(&masked_key, key_start, key_end); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) { - if (flow->mask == mask && + if (flow->mask == mask && flow->hash == hash && flow_cmp_masked_key(flow, &masked_key, key_start, key_end)) return flow; -- cgit v1.2.3-59-g8ed1b