From 98298e6ca6d5908f96e529e70a254a4d5bf754e7 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:50 +0100 Subject: flow_dissector: add meaningful comments Documents two piece of code which can't be understood at a glance. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 90bd210be060..7747af3cc500 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -282,6 +282,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index dbf502c18656..bc22b384ac6c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1408,6 +1408,9 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + /* flow.addrs MUST be the last member in struct flow_keys because + * different L3 protocols have different address length + */ BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != sizeof(*flow) - sizeof(flow->addrs)); @@ -1455,6 +1458,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); +/* Sort the source and destination IP (and the ports if the IP are the same), + * to have consistent hash within the two directions + */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; -- cgit v1.2.3-59-g8ed1b From 3b336d6f4ec690b0082bcffe55bac22f234a41ff Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:51 +0100 Subject: flow_dissector: skip the ICMP dissector for non ICMP packets FLOW_DISSECTOR_KEY_ICMP is checked for every packet, not only ICMP ones. Even if the test overhead is probably negligible, move the ICMP dissector code under the big 'switch(ip_proto)' so it gets called only for ICMP packets. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index bc22b384ac6c..0fb721976f3d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -234,6 +234,25 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +/* If FLOW_DISSECTOR_KEY_ICMP is set, get the Type and Code from an ICMP packet + * using skb_flow_get_be16(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + key_icmp->icmp = skb_flow_get_be16(skb, thoff, data, hlen); +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -884,7 +903,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; - struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -1329,6 +1347,12 @@ ip_proto_again: data, nhoff, hlen); break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + default: break; } @@ -1342,14 +1366,6 @@ ip_proto_again: data, hlen); } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP)) { - key_icmp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP, - target_container); - key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); - } - /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: -- cgit v1.2.3-59-g8ed1b From 5dec597e5cd0f4c3000d120508efa64157d5bd7a Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:52 +0100 Subject: flow_dissector: extract more ICMP information The ICMP flow dissector currently parses only the Type and Code fields. Some ICMP packets (echo, timestamp) have a 16 bit Identifier field which is used to correlate packets. Add such field in flow_dissector_key_icmp and replace skb_flow_get_be16() with a more complex function which populate this field. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 19 +++++++----- net/core/flow_dissector.c | 74 ++++++++++++++++++++++++++++++-------------- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 7747af3cc500..f8541d018848 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -6,6 +6,8 @@ #include #include +struct sk_buff; + /** * struct flow_dissector_key_control: * @thoff: Transport header offset @@ -156,19 +158,16 @@ struct flow_dissector_key_ports { /** * flow_dissector_key_icmp: - * @ports: type and code of ICMP header - * icmp: ICMP type (high) and code (low) * type: ICMP type * code: ICMP code + * id: session identifier */ struct flow_dissector_key_icmp { - union { - __be16 icmp; - struct { - u8 type; - u8 code; - }; + struct { + u8 type; + u8 code; }; + u16 id; }; /** @@ -282,6 +281,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; @@ -316,6 +316,9 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) } u32 flow_hash_from_keys(struct flow_keys *keys); +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0fb721976f3d..0807df0bde02 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -178,27 +178,6 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return 0; } -/** - * skb_flow_get_be16 - extract be16 entity - * @skb: sk_buff to extract from - * @poff: offset to extract at - * @data: raw buffer pointer to the packet - * @hlen: packet header length - * - * The function will try to retrieve a be32 entity at - * offset poff - */ -static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, - void *data, int hlen) -{ - __be16 *u, _u; - - u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); - if (u) - return *u; - - return 0; -} /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -234,8 +213,54 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); -/* If FLOW_DISSECTOR_KEY_ICMP is set, get the Type and Code from an ICMP packet - * using skb_flow_get_be16(). +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @toff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -250,7 +275,8 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); - key_icmp->icmp = skb_flow_get_be16(skb, thoff, data, hlen); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } void skb_flow_dissect_meta(const struct sk_buff *skb, -- cgit v1.2.3-59-g8ed1b From 58deb77cc52da9360d20676e68dd215742cbe473 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:53 +0100 Subject: bonding: balance ICMP echoes in layer3+4 mode The bonding uses the L4 ports to balance flows between slaves. As the ICMP protocol has no ports, those packets are sent all to the same device: # tcpdump -qltnni veth0 ip |sed 's/^/0: /' & # tcpdump -qltnni veth1 ip |sed 's/^/1: /' & # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64 # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64 # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64 But some ICMP packets have an Identifier field which is used to match packets within sessions, let's use this value in the hash function to balance these packets between bond slaves: # ping -qc1 192.168.0.2 0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64 0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64 # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64 Aso, let's use a flow_dissector_key which defines FLOW_DISSECTOR_KEY_ICMP, so we can balance pings encapsulated in a tunnel when using mode encap3+4: # ping -q 192.168.1.2 -c1 0: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 585, seq 1, length 64 0: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 585, seq 1, length 64 # ping -q 192.168.1.2 -c1 1: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 586, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 586, seq 1, length 64 Signed-off-by: Matteo Croce Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 77 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 21d8fcc83c9c..3e496e746cc6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -200,6 +200,51 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0); unsigned int bond_net_id __read_mostly; +static const struct flow_dissector_key flow_keys_bonding_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_ICMP, + .offset = offsetof(struct flow_keys, icmp), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +}; + +static struct flow_dissector flow_keys_bonding __read_mostly; + /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); @@ -3263,10 +3308,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const struct iphdr *iph; int noff, proto = -1; - if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) - return skb_flow_dissect_flow_keys(skb, fk, 0); + if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) { + memset(fk, 0, sizeof(*fk)); + return __skb_flow_dissect(NULL, skb, &flow_keys_bonding, + fk, NULL, 0, 0, 0, 0); + } fk->ports.ports = 0; + memset(&fk->icmp, 0, sizeof(fk->icmp)); noff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP)) { if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) @@ -3286,8 +3335,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, } else { return false; } - if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) - fk->ports.ports = skb_flow_get_ports(skb, noff, proto); + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) { + if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) + skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data, + skb_transport_offset(skb), + skb_headlen(skb)); + else + fk->ports.ports = skb_flow_get_ports(skb, noff, proto); + } return true; } @@ -3314,10 +3369,14 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) return bond_eth_hash(skb); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || - bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) + bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) { hash = bond_eth_hash(skb); - else - hash = (__force u32)flow.ports.ports; + } else { + if (flow.icmp.id) + memcpy(&hash, &flow.icmp, sizeof(hash)); + else + memcpy(&hash, &flow.ports.ports, sizeof(hash)); + } hash ^= (__force u32)flow_get_u32_dst(&flow) ^ (__force u32)flow_get_u32_src(&flow); hash ^= (hash >> 16); @@ -4901,6 +4960,10 @@ static int __init bonding_init(void) goto err; } + skb_flow_dissector_init(&flow_keys_bonding, + flow_keys_bonding_keys, + ARRAY_SIZE(flow_keys_bonding_keys)); + register_netdevice_notifier(&bond_netdev_notifier); out: return res; -- cgit v1.2.3-59-g8ed1b