From 6ece90f9a13e2592cbd6634f74bcb306169b5ab6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 29 Sep 2015 21:10:05 +0200 Subject: netfilter: fix Kconfig dependencies for nf_dup_ipv{4,6} net/built-in.o: In function `nf_dup_ipv4': (.text+0xed24d): undefined reference to `nf_conntrack_untracked' net/built-in.o: In function `nf_dup_ipv4': (.text+0xed267): undefined reference to `nf_conntrack_untracked' net/built-in.o: In function `nf_dup_ipv6': (.text+0x158aef): undefined reference to `nf_conntrack_untracked' net/built-in.o: In function `nf_dup_ipv6': (.text+0x158b09): undefined reference to `nf_conntrack_untracked' Reported-by: Randy Dunlap Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 690d27d3f2f9..a35584176535 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -75,6 +75,7 @@ endif # NF_TABLES config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" + depends on !NF_CONNTRACK || NF_CONNTRACK help This option enables the nf_dup_ipv4 core, which duplicates an IPv4 packet to be rerouted to another destination. -- cgit v1.2.3-59-g8ed1b From cc4998febd567d1c671684abce5595344bd4e8b2 Mon Sep 17 00:00:00 2001 From: lucien Date: Tue, 6 Oct 2015 21:03:07 +0800 Subject: netfilter: ipt_rpfilter: remove the nh_scope test in rpfilter_lookup_reverse --accept-local option works for res.type == RTN_LOCAL, which should be from the local table, but there, the fib_info's nh->nh_scope = RT_SCOPE_NOWHERE ( > RT_SCOPE_HOST). in fib_create_info(). if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ if (nhs != 1 || nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; <=== nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); err = -ENODEV; if (!nh->nh_dev) goto failure; but in our rpfilter_lookup_reverse(): if (dev_match || flags & XT_RPFILTER_LOOSE) return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; if nh->nh_scope > RT_SCOPE_HOST, it will fail. --accept-local option will never be passed. it seems the test is bogus and can be removed to fix this issue. if (dev_match || flags & XT_RPFILTER_LOOSE) return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; ipv6 does not have this issue. Signed-off-by: Xin Long Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_rpfilter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8618fd150c96..c4ffc9de1654 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -61,9 +61,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, if (FIB_RES_DEV(res) == dev) dev_match = true; #endif - if (dev_match || flags & XT_RPFILTER_LOOSE) - return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; - return dev_match; + return dev_match || flags & XT_RPFILTER_LOOSE; } static bool rpfilter_is_local(const struct sk_buff *skb) -- cgit v1.2.3-59-g8ed1b From ca064bd89363a6e7e71b1c5226ff1b718957a9d4 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 19 Oct 2015 10:30:05 +0200 Subject: xfrm: Fix pmtu discovery for local generated packets. Commit 044a832a777 ("xfrm: Fix local error reporting crash with interfamily tunnels") moved the setting of skb->protocol behind the last access of the inner mode family to fix an interfamily crash. Unfortunately now skb->protocol might not be set at all, so we fail dispatch to the inner address family. As a reault, the local error handler is not called and the mtu value is not reported back to userspace. We fix this by setting skb->protocol on message size errors before we call xfrm_local_error. Fixes: 044a832a7779c ("xfrm: Fix local error reporting crash with interfamily tunnels") Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 2 ++ net/ipv6/xfrm6_output.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 2878dbfffeb7..41a261355662 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu) { + skb->protocol = htons(ETH_P_IP); + if (skb->sk) xfrm_local_error(skb, mtu); else diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index be033f22a3f3..e15feb7b413d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -79,6 +79,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); if (xfrm6_local_dontfrag(skb)) xfrm6_local_rxpmtu(skb, mtu); -- cgit v1.2.3-59-g8ed1b From e2e8009ff72ad2a795b67785f3238af152146368 Mon Sep 17 00:00:00 2001 From: Renato Westphal Date: Mon, 19 Oct 2015 18:51:34 -0200 Subject: tcp: remove improper preemption check in tcp_xmit_probe_skb() Commit e520af48c7e5a introduced the following bug when setting the TCP_REPAIR sockoption: [ 2860.657036] BUG: using __this_cpu_add() in preemptible [00000000] code: daemon/12164 [ 2860.657045] caller is __this_cpu_preempt_check+0x13/0x20 [ 2860.657049] CPU: 1 PID: 12164 Comm: daemon Not tainted 4.2.3 #1 [ 2860.657051] Hardware name: Dell Inc. PowerEdge R210 II/0JP7TR, BIOS 2.0.5 03/13/2012 [ 2860.657054] ffffffff81c7f071 ffff880231e9fdf8 ffffffff8185d765 0000000000000002 [ 2860.657058] 0000000000000001 ffff880231e9fe28 ffffffff8146ed91 ffff880231e9fe18 [ 2860.657062] ffffffff81cd1a5d ffff88023534f200 ffff8800b9811000 ffff880231e9fe38 [ 2860.657065] Call Trace: [ 2860.657072] [] dump_stack+0x4f/0x7b [ 2860.657075] [] check_preemption_disabled+0xe1/0xf0 [ 2860.657078] [] __this_cpu_preempt_check+0x13/0x20 [ 2860.657082] [] tcp_xmit_probe_skb+0xc7/0x100 [ 2860.657085] [] tcp_send_window_probe+0x2d/0x30 [ 2860.657089] [] do_tcp_setsockopt.isra.29+0x74c/0x830 [ 2860.657093] [] tcp_setsockopt+0x2c/0x30 [ 2860.657097] [] sock_common_setsockopt+0x14/0x20 [ 2860.657100] [] SyS_setsockopt+0x71/0xc0 [ 2860.657104] [] entry_SYSCALL_64_fastpath+0x16/0x75 Since tcp_xmit_probe_skb() can be called from process context, use NET_INC_STATS() instead of NET_INC_STATS_BH(). Fixes: e520af48c7e5 ("tcp: add TCPWinProbe and TCPKeepAlive SNMP counters") Signed-off-by: Renato Westphal Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1100ffe4a722..3dbee0d83b15 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3405,7 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); - NET_INC_STATS_BH(sock_net(sk), mib); + NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } -- cgit v1.2.3-59-g8ed1b From fc4099f17240767554ff3a73977acb78ef615404 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 22 Oct 2015 18:17:16 -0700 Subject: openvswitch: Fix egress tunnel info. While transitioning to netdev based vport we broke OVS feature which allows user to retrieve tunnel packet egress information for lwtunnel devices. Following patch fixes it by introducing ndo operation to get the tunnel egress info. Same ndo operation can be used for lwtunnel devices and compat ovs-tnl-vport devices. So after adding such device operation we can remove similar operation from ovs-vport. Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/geneve.c | 40 ++++++++++++++++++++++++----- drivers/net/vxlan.c | 41 +++++++++++++++++++++++++++++ include/linux/netdevice.h | 7 +++++ include/net/dst_metadata.h | 32 +++++++++++++++++++++++ net/core/dev.c | 27 ++++++++++++++++++++ net/ipv4/ip_gre.c | 46 ++++++++++++++++++++++++++------- net/openvswitch/actions.c | 9 +++---- net/openvswitch/datapath.c | 5 ++-- net/openvswitch/datapath.h | 1 - net/openvswitch/flow_netlink.c | 18 +++++-------- net/openvswitch/flow_netlink.h | 6 ++--- net/openvswitch/vport-geneve.c | 13 ---------- net/openvswitch/vport-gre.c | 8 ------ net/openvswitch/vport-vxlan.c | 19 -------------- net/openvswitch/vport.c | 58 ------------------------------------------ net/openvswitch/vport.h | 35 ------------------------- 16 files changed, 192 insertions(+), 173 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index cde29f8a37bf..445071c163cb 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -594,14 +594,12 @@ static struct rtable *geneve_get_rt(struct sk_buff *skb, rt = ip_route_output_key(geneve->net, fl4); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr); - dev->stats.tx_carrier_errors++; - return rt; + return ERR_PTR(-ENETUNREACH); } if (rt->dst.dev == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr); - dev->stats.collisions++; ip_rt_put(rt); - return ERR_PTR(-EINVAL); + return ERR_PTR(-ELOOP); } return rt; } @@ -627,12 +625,12 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel_info *info = NULL; struct rtable *rt = NULL; const struct iphdr *iip; /* interior IP header */ + int err = -EINVAL; struct flowi4 fl4; __u8 tos, ttl; __be16 sport; bool udp_csum; __be16 df; - int err; if (geneve->collect_md) { info = skb_tunnel_info(skb); @@ -647,7 +645,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) rt = geneve_get_rt(skb, dev, &fl4, info); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); - dev->stats.tx_carrier_errors++; + err = PTR_ERR(rt); goto tx_error; } @@ -699,10 +697,37 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) tx_error: dev_kfree_skb(skb); err: - dev->stats.tx_errors++; + if (err == -ELOOP) + dev->stats.collisions++; + else if (err == -ENETUNREACH) + dev->stats.tx_carrier_errors++; + else + dev->stats.tx_errors++; return NETDEV_TX_OK; } +static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct geneve_dev *geneve = netdev_priv(dev); + struct rtable *rt; + struct flowi4 fl4; + + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + + rt = geneve_get_rt(skb, dev, &fl4, info); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + info->key.u.ipv4.src = fl4.saddr; + info->key.tp_src = udp_flow_src_port(geneve->net, skb, + 1, USHRT_MAX, true); + info->key.tp_dst = geneve->dst_port; + return 0; +} + static const struct net_device_ops geneve_netdev_ops = { .ndo_init = geneve_init, .ndo_uninit = geneve_uninit, @@ -713,6 +738,7 @@ static const struct net_device_ops geneve_netdev_ops = { .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, + .ndo_fill_metadata_dst = geneve_fill_metadata_dst, }; static void geneve_get_drvinfo(struct net_device *dev, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index afdc65fd5bc5..c1587ece28cf 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2337,6 +2337,46 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu) return 0; } +static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb, + struct ip_tunnel_info *info, + __be16 sport, __be16 dport) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_tos = RT_TOS(info->key.tos); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = IPPROTO_UDP; + fl4.daddr = info->key.u.ipv4.dst; + + rt = ip_route_output_key(vxlan->net, &fl4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + ip_rt_put(rt); + + info->key.u.ipv4.src = fl4.saddr; + info->key.tp_src = sport; + info->key.tp_dst = dport; + return 0; +} + +static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct ip_tunnel_info *info = skb_tunnel_info(skb); + __be16 sport, dport; + + sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, + vxlan->cfg.port_max, true); + dport = info->key.tp_dst ? : vxlan->cfg.dst_port; + + if (ip_tunnel_info_af(info) == AF_INET) + return egress_ipv4_tun_info(dev, skb, info, sport, dport); + return -EINVAL; +} + static const struct net_device_ops vxlan_netdev_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, @@ -2351,6 +2391,7 @@ static const struct net_device_ops vxlan_netdev_ops = { .ndo_fdb_add = vxlan_fdb_add, .ndo_fdb_del = vxlan_fdb_delete, .ndo_fdb_dump = vxlan_fdb_dump, + .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; /* Info for udev, that this is a virtual tunnel endpoint */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2d15e3831440..210d11a75e4f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1054,6 +1054,10 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * This function is used to pass protocol port error state information * to the switch driver. The switch driver can react to the proto_down * by doing a phys down on the associated switch port. + * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); + * This function is used to get egress tunnel information for given skb. + * This is useful for retrieving outer tunnel header parameters while + * sampling packet. * */ struct net_device_ops { @@ -1227,6 +1231,8 @@ struct net_device_ops { int (*ndo_get_iflink)(const struct net_device *dev); int (*ndo_change_proto_down)(struct net_device *dev, bool proto_down); + int (*ndo_fill_metadata_dst)(struct net_device *dev, + struct sk_buff *skb); }; /** @@ -2203,6 +2209,7 @@ void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); int dev_get_iflink(const struct net_device *dev); +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index af9d5382f6cb..ce009710120c 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -60,6 +60,38 @@ static inline struct metadata_dst *tun_rx_dst(int md_size) return tun_dst; } +static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + int md_size = md_dst->u.tun_info.options_len; + struct metadata_dst *new_md; + + if (!md_dst) + return ERR_PTR(-EINVAL); + + new_md = metadata_dst_alloc(md_size, GFP_ATOMIC); + if (!new_md) + return ERR_PTR(-ENOMEM); + + memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, + sizeof(struct ip_tunnel_info) + md_size); + skb_dst_drop(skb); + dst_hold(&new_md->dst); + skb_dst_set(skb, &new_md->dst); + return new_md; +} + +static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb) +{ + struct metadata_dst *dst; + + dst = tun_dst_unclone(skb); + if (IS_ERR(dst)) + return NULL; + + return &dst->u.tun_info; +} + static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, __be16 flags, __be64 tunnel_id, diff --git a/net/core/dev.c b/net/core/dev.c index 6bb6470f5b7b..c14748d051e7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -681,6 +682,32 @@ int dev_get_iflink(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_iflink); +/** + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. + * + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. + */ +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info; + + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; + + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; + + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); +} +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index bd0679d90519..614521437e30 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -498,10 +498,26 @@ static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } +static struct rtable *gre_get_rt(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl, + const struct ip_tunnel_key *key) +{ + struct net *net = dev_net(dev); + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); + fl->flowi4_mark = skb->mark; + fl->flowi4_proto = IPPROTO_GRE; + + return ip_route_output_key(net, fl); +} + static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; - struct net *net = dev_net(dev); const struct ip_tunnel_key *key; struct flowi4 fl; struct rtable *rt; @@ -516,14 +532,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - memset(&fl, 0, sizeof(fl)); - fl.daddr = key->u.ipv4.dst; - fl.saddr = key->u.ipv4.src; - fl.flowi4_tos = RT_TOS(key->tos); - fl.flowi4_mark = skb->mark; - fl.flowi4_proto = IPPROTO_GRE; - - rt = ip_route_output_key(net, &fl); + rt = gre_get_rt(skb, dev, &fl, key); if (IS_ERR(rt)) goto err_free_skb; @@ -566,6 +575,24 @@ err_free_skb: dev->stats.tx_dropped++; } +static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct rtable *rt; + struct flowi4 fl4; + + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + + rt = gre_get_rt(skb, dev, &fl4, &info->key); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + info->key.u.ipv4.src = fl4.saddr; + return 0; +} + static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -1023,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static void ipgre_tap_setup(struct net_device *dev) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c6a39bf2c3b9..0bf0f406de52 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -768,7 +768,6 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, const struct nlattr *actions, int actions_len) { - struct ip_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; @@ -796,11 +795,9 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, if (vport) { int err; - upcall.egress_tun_info = &info; - err = ovs_vport_get_egress_tun_info(vport, skb, - &upcall); - if (err) - upcall.egress_tun_info = NULL; + err = dev_fill_metadata_dst(vport->dev, skb); + if (!err) + upcall.egress_tun_info = skb_tunnel_info(skb); } break; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index b816ff871528..c5d08ee37730 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -490,9 +490,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); - err = ovs_nla_put_egress_tunnel_key(user_skb, - upcall_info->egress_tun_info, - upcall_info->egress_tun_opts); + err = ovs_nla_put_tunnel_info(user_skb, + upcall_info->egress_tun_info); BUG_ON(err); nla_nest_end(user_skb, nla); } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index f88038a99f44..67bdecd9fdc1 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -117,7 +117,6 @@ struct ovs_skb_cb { */ struct dp_upcall_info { struct ip_tunnel_info *egress_tun_info; - const void *egress_tun_opts; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index bd710bc37469..38536c137c54 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -717,7 +717,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; - if (tun_opts) { + if (swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_GENEVE_OPT && nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, swkey_tun_opts_len, tun_opts)) @@ -749,13 +749,12 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ip_tunnel_info *egress_tun_info, - const void *egress_tun_opts) +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, - egress_tun_opts, - egress_tun_info->options_len); + return __ipv4_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len); } static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, @@ -2383,10 +2382,7 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->key, - tun_info->options_len ? - ip_tunnel_info_opts(tun_info) : NULL, - tun_info->options_len); + err = ovs_nla_put_tunnel_info(skb, tun_info); if (err) return err; nla_nest_end(skb, start); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 6ca3f0baf449..47dd142eca1c 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -55,9 +55,9 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_get_match(struct net *, struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); -int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ip_tunnel_info *, - const void *egress_tun_opts); + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 2735e9c4a3b8..5f8aaaaa0785 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -52,18 +52,6 @@ static int geneve_get_options(const struct vport *vport, return 0; } -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct dp_upcall_info *upcall) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = htons(geneve_port->port_no); - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - - return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), - skb, IPPROTO_UDP, sport, dport); -} - static struct vport *geneve_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); @@ -130,7 +118,6 @@ static struct vport_ops ovs_geneve_vport_ops = { .get_options = geneve_get_options, .send = ovs_netdev_send, .owner = THIS_MODULE, - .get_egress_tun_info = geneve_get_egress_tun_info, }; static int __init ovs_geneve_tnl_init(void) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 4d24481669c9..64225bf5eb40 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -84,18 +84,10 @@ static struct vport *gre_create(const struct vport_parms *parms) return ovs_netdev_link(vport, parms->name); } -static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct dp_upcall_info *upcall) -{ - return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), - skb, IPPROTO_GRE, 0, 0); -} - static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, .send = ovs_netdev_send, - .get_egress_tun_info = gre_get_egress_tun_info, .destroy = ovs_netdev_tunnel_destroy, .owner = THIS_MODULE, }; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index c11413d5075f..e1c9c0888037 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -146,31 +146,12 @@ static struct vport *vxlan_create(const struct vport_parms *parms) return ovs_netdev_link(vport, parms->name); } -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct dp_upcall_info *upcall) -{ - struct vxlan_dev *vxlan = netdev_priv(vport->dev); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dst_port = vxlan_dev_dst_port(vxlan); - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); - - return ovs_tunnel_get_egress_info(upcall, net, - skb, IPPROTO_UDP, - src_port, dst_port); -} - static struct vport_ops ovs_vxlan_netdev_vport_ops = { .type = OVS_VPORT_TYPE_VXLAN, .create = vxlan_create, .destroy = ovs_netdev_tunnel_destroy, .get_options = vxlan_get_options, .send = ovs_netdev_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, }; static int __init ovs_vxlan_tnl_init(void) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 12a36ac21eda..320c765ce44a 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -479,61 +479,3 @@ void ovs_vport_deferred_free(struct vport *vport) call_rcu(&vport->rcu, free_vport_rcu); } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); - -int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, - struct net *net, - struct sk_buff *skb, - u8 ipproto, - __be16 tp_src, - __be16 tp_dst) -{ - struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info; - const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); - const struct ip_tunnel_key *tun_key; - u32 skb_mark = skb->mark; - struct rtable *rt; - struct flowi4 fl; - - if (unlikely(!tun_info)) - return -EINVAL; - if (ip_tunnel_info_af(tun_info) != AF_INET) - return -EINVAL; - - tun_key = &tun_info->key; - - /* Route lookup to get srouce IP address. - * The process may need to be changed if the corresponding process - * in vports ops changed. - */ - rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - ip_rt_put(rt); - - /* Generate egress_tun_info based on tun_info, - * saddr, tp_src and tp_dst - */ - ip_tunnel_key_init(&egress_tun_info->key, - fl.saddr, tun_key->u.ipv4.dst, - tun_key->tos, - tun_key->ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags); - egress_tun_info->options_len = tun_info->options_len; - egress_tun_info->mode = tun_info->mode; - upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info); - return 0; -} -EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); - -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct dp_upcall_info *upcall) -{ - /* get_egress_tun_info() is only implemented on tunnel ports. */ - if (unlikely(!vport->ops->get_egress_tun_info)) - return -EINVAL; - - return vport->ops->get_egress_tun_info(vport, skb, upcall); -} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index a413f3ae6a7b..d341ad6f3afe 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,7 +27,6 @@ #include #include #include -#include #include "datapath.h" @@ -53,16 +52,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, - struct net *net, - struct sk_buff *, - u8 ipproto, - __be16 tp_src, - __be16 tp_dst); - -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct dp_upcall_info *upcall); - /** * struct vport_portids - array of netlink portids of a vport. * must be protected by rcu. @@ -140,8 +129,6 @@ struct vport_parms { * have any configuration. * @send: Send a packet on the device. * zero for dropped packets or negative for error. - * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for - * a packet. */ struct vport_ops { enum ovs_vport_type type; @@ -154,9 +141,6 @@ struct vport_ops { int (*get_options)(const struct vport *, struct sk_buff *); void (*send)(struct vport *, struct sk_buff *); - int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct dp_upcall_info *upcall); - struct module *owner; struct list_head list; }; @@ -215,25 +199,6 @@ static inline const char *ovs_vport_name(struct vport *vport) int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); -static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ip_tunnel_key *key, - u32 mark, - struct flowi4 *fl, - u8 protocol) -{ - struct rtable *rt; - - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = mark; - fl->flowi4_proto = protocol; - - rt = ip_route_output_key(net, fl); - return rt; -} - static inline void ovs_vport_send(struct vport *vport, struct sk_buff *skb) { vport->ops->send(vport, skb); -- cgit v1.2.3-59-g8ed1b From c80dbe04612986fd6104b4a1be21681b113b5ac9 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker Date: Sun, 18 Oct 2015 21:59:08 -0700 Subject: tcp: allow dctcp alpha to drop to zero If alpha is strictly reduced by alpha >> dctcp_shift_g and if alpha is less than 1 << dctcp_shift_g, then alpha may never reach zero. For example, given shift_g=4 and alpha=15, alpha >> dctcp_shift_g yields 0 and alpha remains 15. The effect isn't noticeable in this case below cwnd=137, but could gradually drive uncongested flows with leftover alpha down to cwnd=137. A larger dctcp_shift_g would have a greater effect. This change causes alpha=15 to drop to 0 instead of being decrementing by 1 as it would when alpha=16. However, it requires one less conditional to implement since it doesn't have to guard against subtracting 1 from 0U. A decay of 15 is not unreasonable since an equal or greater amount occurs at alpha >= 240. Signed-off-by: Andrew G. Shewmaker Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/tcp_dctcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 7092a61c4dc8..7e538f71f5fb 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -209,7 +209,7 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* alpha = (1 - g) * alpha + g * F */ - alpha -= alpha >> dctcp_shift_g; + alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (bytes_ecn) { /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 Mbytes. -- cgit v1.2.3-59-g8ed1b