From 61b590b9ee4221173ad6990a1150c5c9db73564e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 23 Oct 2015 12:43:18 +0200 Subject: netfilter: ingress: don't use nf_hook_list_active nf_hook_list_active() always returns true once at least one device has NF_INGRESS hook enabled. Thus, don't use this function. Instead, inverse the test and use the static key to elide list_empty test if no NF_INGRESS hooks are active. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ingress.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 187feabe557c..ba7ce8805fe3 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -5,10 +5,13 @@ #include #ifdef CONFIG_NETFILTER_INGRESS -static inline int nf_hook_ingress_active(struct sk_buff *skb) +static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { - return nf_hook_list_active(&skb->dev->nf_hooks_ingress, - NFPROTO_NETDEV, NF_NETDEV_INGRESS); +#ifdef HAVE_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) + return false; +#endif + return !list_empty(&skb->dev->nf_hooks_ingress); } static inline int nf_hook_ingress(struct sk_buff *skb) -- cgit v1.2.3-59-g8ed1b From b4865988eab598e56e6e628b9b32441acd142b28 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Nov 2015 18:35:57 +0100 Subject: netfilter: ingress: fix wrong input interface on hook The input and output interfaces in nf_hook_state_init() are flipped. This fixes iif matching on nftables. Reported-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ingress.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index ba7ce8805fe3..5fcd375ef175 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -19,8 +19,8 @@ static inline int nf_hook_ingress(struct sk_buff *skb) struct nf_hook_state state; nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, - NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, NULL, - skb->dev, NULL, dev_net(skb->dev), NULL); + NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, + skb->dev, NULL, NULL, dev_net(skb->dev), NULL); return nf_hook_slow(skb, &state); } -- cgit v1.2.3-59-g8ed1b From 95ad1f4a9358dff1dcf84bf5c9cc84caa9215f7f Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 7 Nov 2015 11:21:47 +0100 Subject: netfilter: ipset: Fix extension alignment The data extensions in ipset lacked the proper memory alignment and thus could lead to kernel crash on several architectures. Therefore the structures have been reorganized and alignment attributes added where needed. The patch was tested on armv7h by Gerhard Wiesinger and on x86_64, sparc64 by Jozsef Kadlecsik. Reported-by: Gerhard Wiesinger Tested-by: Gerhard Wiesinger Tested-by: Jozsef Kadlecsik Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 2 +- net/netfilter/ipset/ip_set_bitmap_gen.h | 17 +++----- net/netfilter/ipset/ip_set_bitmap_ip.c | 14 ++----- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 64 ++++++++++++++----------------- net/netfilter/ipset/ip_set_bitmap_port.c | 18 ++++----- net/netfilter/ipset/ip_set_core.c | 14 ++++--- net/netfilter/ipset/ip_set_hash_gen.h | 11 ++++-- net/netfilter/ipset/ip_set_list_set.c | 5 ++- 8 files changed, 65 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 48bb01edcf30..0e1f433cc4b7 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -421,7 +421,7 @@ extern void ip_set_free(void *members); extern int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr); extern int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr); extern size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], - size_t len); + size_t len, size_t align); extern int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index d05e759ed0fa..b0bc475f641e 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -33,7 +33,7 @@ #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype MTYPE -#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id)) +#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) @@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set) del_timer_sync(&map->gc); ip_set_free(map->members); - if (set->dsize) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set); - ip_set_free(map->extensions); - } - kfree(map); + if (set->dsize && set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + ip_set_free(map); set->data = NULL; } @@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; + size_t memsize = sizeof(*map) + map->memsize; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + - map->memsize + - set->dsize * map->elements))) + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 64a564334418..4783efff0bde 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -41,7 +41,6 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { void *members; /* the set members */ - void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -49,6 +48,8 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -224,13 +225,6 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], pr_debug("hosts %u, elements %llu\n", hosts, (unsigned long long)elements); - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ip; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1430535118fb..29dde208381d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -47,24 +47,26 @@ enum { /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ - void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + unsigned char extensions[0] /* MAC + data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { + unsigned char ether[ETH_ALEN] __aligned(2); u16 id; - unsigned char *ether; + u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; -} __attribute__ ((aligned)); +} __aligned(__alignof__(u64)); static inline u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) @@ -72,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) return ip - m->first_ip; } -static inline struct bitmap_ipmac_elem * -get_elem(void *extensions, u16 id, size_t dsize) -{ - return (struct bitmap_ipmac_elem *)(extensions + id * dsize); -} +#define get_elem(extensions, id, dsize) \ + (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) + +#define get_const_elem(extensions, id, dsize) \ + (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ @@ -88,10 +90,9 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, if (!test_bit(e->id, map->members)) return 0; - elem = get_elem(map->extensions, e->id, dsize); - if (elem->filled == MAC_FILLED) - return !e->ether || - ether_addr_equal(e->ether, elem->ether); + elem = get_const_elem(map->extensions, e->id, dsize); + if (e->add_mac && elem->filled == MAC_FILLED) + return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } @@ -103,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) if (!test_bit(id, map->members)) return 0; - elem = get_elem(map->extensions, id, dsize); + elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } @@ -133,7 +134,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, * and we can reuse it later when MAC is filled out, * possibly by the kernel */ - if (e->ether) + if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; @@ -150,7 +151,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, elem = get_elem(map->extensions, e->id, dsize); if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { - if (e->ether && + if (e->add_mac && (flags & IPSET_FLAG_EXIST) && !ether_addr_equal(e->ether, elem->ether)) { /* memcpy isn't atomic */ @@ -159,7 +160,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, ether_addr_copy(elem->ether, e->ether); } return IPSET_ADD_FAILED; - } else if (!e->ether) + } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ @@ -168,7 +169,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; - } else if (e->ether) { + } else if (e->add_mac) { /* We can store MAC too */ ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; @@ -191,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = - get_elem(map->extensions, id, dsize); + get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || @@ -213,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = { .id = 0 }; + struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -231,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; e.id = ip_to_id(map, ip); - e.ether = eth_hdr(skb)->h_source; + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -265,11 +266,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); - if (tb[IPSET_ATTR_ETHER]) - e.ether = nla_data(tb[IPSET_ATTR_ETHER]); - else - e.ether = NULL; - + if (tb[IPSET_ATTR_ETHER]) { + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + e.add_mac = 1; + } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; @@ -300,13 +300,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -361,14 +354,15 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct bitmap_ipmac_elem), + __alignof__(struct bitmap_ipmac_elem)); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ipmac; - set->dsize = ip_set_elem_len(set, tb, - sizeof(struct bitmap_ipmac_elem)); if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 5338ccd5da46..7f0c733358a4 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { void *members; /* the set members */ - void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -209,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * map->elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_port = first_port; map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; @@ -232,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map; u16 first_port, last_port; + u32 elements; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -248,14 +243,15 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], last_port = tmp; } - map = kzalloc(sizeof(*map), GFP_KERNEL); + elements = last_port - first_port + 1; + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; - map->elements = last_port - first_port + 1; + map->elements = elements; map->memsize = bitmap_bytes(0, map->elements); set->variant = &bitmap_port; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 69ab9c2634e1..54f3d7cb23e6 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -364,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) } size_t -ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, + size_t align) { enum ip_set_ext_id id; - size_t offset = len; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; + if (!align) + align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; - offset = ALIGN(offset, ip_set_extensions[id].align); - set->offset[id] = offset; + len = ALIGN(len, ip_set_extensions[id].align); + set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; - offset += ip_set_extensions[id].len; + len += ip_set_extensions[id].len; } - return offset; + return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 691b54fcaf2a..4ff22194ce55 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -72,8 +72,9 @@ struct hbucket { DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ - unsigned char value[0]; /* the array of the values */ -} __attribute__ ((aligned)); + unsigned char value[0] /* the array of the values */ + __aligned(__alignof__(u64)); +}; /* The hash table: the table size stored here in order to make resizing easy */ struct htable { @@ -1323,12 +1324,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem))); #ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem))); } #endif if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 5a30ce6e8c90..bbede95c9f68 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -31,7 +31,7 @@ struct set_elem { struct rcu_head rcu; struct list_head list; ip_set_id_t id; -}; +} __aligned(__alignof__(u64)); struct set_adt_elem { ip_set_id_t id; @@ -618,7 +618,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], size = IP_SET_LIST_MIN_SIZE; set->variant = &set_variant; - set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); + set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), + __alignof__(struct set_elem)); if (!init_list_set(net, set, size)) return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { -- cgit v1.2.3-59-g8ed1b From aabc92bbe3cfe4c545f8ccdaaeeea012a46f0abf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 Nov 2015 14:31:18 +0100 Subject: net: add __netdev_alloc_pcpu_stats() to indicate gfp flags nf_tables may create percpu counters from the packet path through its dynamic set instantiation infrastructure, so we need a way to allocate this through GFP_ATOMIC. Signed-off-by: Pablo Neira Ayuso Acked-by: David S. Miller --- include/linux/netdevice.h | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c00772bd136..e9d0c8a75380 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2068,20 +2068,23 @@ struct pcpu_sw_netstats { struct u64_stats_sync syncp; }; -#define netdev_alloc_pcpu_stats(type) \ -({ \ - typeof(type) __percpu *pcpu_stats = alloc_percpu(type); \ - if (pcpu_stats) { \ - int __cpu; \ - for_each_possible_cpu(__cpu) { \ - typeof(type) *stat; \ - stat = per_cpu_ptr(pcpu_stats, __cpu); \ - u64_stats_init(&stat->syncp); \ - } \ - } \ - pcpu_stats; \ +#define __netdev_alloc_pcpu_stats(type, gfp) \ +({ \ + typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ + if (pcpu_stats) { \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + typeof(type) *stat; \ + stat = per_cpu_ptr(pcpu_stats, __cpu); \ + u64_stats_init(&stat->syncp); \ + } \ + } \ + pcpu_stats; \ }) +#define netdev_alloc_pcpu_stats(type) \ + __netdev_alloc_pcpu_stats(type, GFP_KERNEL); + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3-59-g8ed1b From 086f332167d64b645d37405854f049b9ad7371ab Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 Nov 2015 13:39:42 +0100 Subject: netfilter: nf_tables: add clone interface to expression operations With the conversion of the counter expressions to make it percpu, we need to clone the percpu memory area, otherwise we crash when using counters from flow tables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 16 +++++++++++-- net/netfilter/nft_counter.c | 49 ++++++++++++++++++++++++++++++++------- net/netfilter/nft_dynset.c | 5 ++-- 3 files changed, 58 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c9149cc0a02d..4bd7508bedc9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -618,6 +618,8 @@ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); + int (*clone)(struct nft_expr *dst, + const struct nft_expr *src); unsigned int size; int (*init)(const struct nft_ctx *ctx, @@ -660,10 +662,20 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); -static inline void nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) { + int err; + __module_get(src->ops->type->owner); - memcpy(dst, src, src->ops->size); + if (src->ops->clone) { + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; + } else { + memcpy(dst, src, src->ops->size); + } + return 0; } /** diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 1067fb4c1ffa..c7808fc19719 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -47,27 +47,34 @@ static void nft_counter_eval(const struct nft_expr *expr, local_bh_enable(); } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, + struct nft_counter *total) { - struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - struct nft_counter_percpu *cpu_stats; - struct nft_counter total; + const struct nft_counter_percpu *cpu_stats; u64 bytes, packets; unsigned int seq; int cpu; - memset(&total, 0, sizeof(total)); + memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - cpu_stats = per_cpu_ptr(priv->counter, cpu); + cpu_stats = per_cpu_ptr(counter, cpu); do { seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); bytes = cpu_stats->counter.bytes; packets = cpu_stats->counter.packets; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); - total.packets += packets; - total.bytes += bytes; + total->packets += packets; + total->bytes += bytes; } +} + +static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter total; + + nft_counter_fetch(priv->counter, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) @@ -118,6 +125,31 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, free_percpu(priv->counter); } +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(src); + struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + struct nft_counter total; + + nft_counter_fetch(priv->counter, &total); + + cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, + GFP_ATOMIC); + if (cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + this_cpu->counter.packets = total.packets; + this_cpu->counter.bytes = total.bytes; + preempt_enable(); + + priv_clone->counter = cpu_stats; + return 0; +} + static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, @@ -126,6 +158,7 @@ static const struct nft_expr_ops nft_counter_ops = { .init = nft_counter_init, .destroy = nft_counter_destroy, .dump = nft_counter_dump, + .clone = nft_counter_clone, }; static struct nft_expr_type nft_counter_type __read_mostly = { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 513a8ef60a59..9dec3bd1b63c 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, } ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL) - nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + if (priv->expr != NULL && + nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + return NULL; return elem; } -- cgit v1.2.3-59-g8ed1b From 02bcf4e082e4dc634409a6a6cb7def8806d6e5e6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 Nov 2015 11:51:08 -0800 Subject: ipv6: Check rt->dst.from for the DST_NOCACHE route All DST_NOCACHE rt6_info used to have rt->dst.from set to its parent. After commit 8e3d5be73681 ("ipv6: Avoid double dst_free"), DST_NOCACHE is also set to rt6_info which does not have a parent (i.e. rt->dst.from is NULL). This patch catches the rt->dst.from == NULL case. Fixes: 8e3d5be73681 ("ipv6: Avoid double dst_free") Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- net/ipv6/route.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index aaf9700fc9e5..fb961a576abe 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -167,7 +167,8 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) static inline u32 rt6_get_cookie(const struct rt6_info *rt) { - if (rt->rt6i_flags & RTF_PCPU || unlikely(rt->dst.flags & DST_NOCACHE)) + if (rt->rt6i_flags & RTF_PCPU || + (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3754cf9287a0..6f01fe122abd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1281,7 +1281,8 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); - if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE)) + if (rt->rt6i_flags & RTF_PCPU || + (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); -- cgit v1.2.3-59-g8ed1b From 00fd38d938db3f1ab1c486549afc450cb7e751b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Nov 2015 08:43:18 -0800 Subject: tcp: ensure proper barriers in lockless contexts Some functions access TCP sockets without holding a lock and might output non consistent data, depending on compiler and or architecture. tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... Introduce sk_state_load() and sk_state_store() to fix the issues, and more clearly document where this lack of locking is happening. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 25 +++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp.c | 21 +++++++++++---------- net/ipv4/tcp_diag.c | 2 +- net/ipv4/tcp_ipv4.c | 14 ++++++++------ net/ipv6/tcp_ipv6.c | 19 +++++++++++++++---- 6 files changed, 62 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index bbf7c2cf15b4..7f89e4ba18d1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2226,6 +2226,31 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/** + * sk_state_load - read sk->sk_state for lockless contexts + * @sk: socket pointer + * + * Paired with sk_state_store(). Used in places we do not hold socket lock : + * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... + */ +static inline int sk_state_load(const struct sock *sk) +{ + return smp_load_acquire(&sk->sk_state); +} + +/** + * sk_state_store - update sk->sk_state + * @sk: socket pointer + * @newstate: new state + * + * Paired with sk_state_load(). Should be used in contexts where + * state change might impact lockless readers. + */ +static inline void sk_state_store(struct sock *sk, int newstate) +{ + smp_store_release(&sk->sk_state, newstate); +} + void sock_enable_timestamp(struct sock *sk, int flag); int sock_get_timestamp(struct sock *, struct timeval __user *); int sock_get_timestampns(struct sock *, struct timespec __user *); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1feb15f23de8..46b9c887bede 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -563,7 +563,7 @@ static void reqsk_timer_handler(unsigned long data) int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN) + if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; @@ -749,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk->sk_state = TCP_LISTEN; + sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cfa7c0c1e80..c1728771cf89 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,11 +451,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + int state; sock_rps_record_flow(sk); sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + + state = sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -492,14 +495,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -507,9 +510,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - /* Potential race condition. If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; @@ -1934,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2644,7 +2644,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk->sk_state; + info->tcpi_state = sk_state_load(sk); + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2672,7 +2673,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { + if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = sk->sk_max_ack_backlog; } else { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 479f34946177..b31604086edd 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,7 +21,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) { + if (sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 950e28c0cdf2..ba09016d1bfd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2158,6 +2158,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; + int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || @@ -2175,17 +2176,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - if (sk->sk_state == TCP_LISTEN) + state = sk_state_load(sk); + if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else - /* - * because we dont lock socket, we might find a transient negative value + /* Because we don't lock the socket, + * we might find a transient negative value. */ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", - i, src, srcp, dest, destp, sk->sk_state, + i, src, srcp, dest, destp, state, tp->write_seq - tp->snd_una, rx_queue, timer_active, @@ -2199,8 +2201,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sk->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + state == TCP_LISTEN ? + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5baa8e754e41..c5429a636f1a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1690,6 +1690,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + int rx_queue; + int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1710,6 +1712,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_expires = jiffies; } + state = sk_state_load(sp); + if (state == TCP_LISTEN) + rx_queue = sp->sk_ack_backlog; + else + /* Because we don't lock the socket, + * we might find a transient negative value. + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", @@ -1718,9 +1729,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - tp->write_seq-tp->snd_una, - (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), + state, + tp->write_seq - tp->snd_una, + rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, @@ -1732,7 +1743,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sp->sk_state == TCP_LISTEN ? + state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); -- cgit v1.2.3-59-g8ed1b From 66189961e986e53ae39822898fc2ce88f44c61bb Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 12 Nov 2015 19:35:26 +0200 Subject: net/mlx5e: Added self loopback prevention Prevent outgoing multicast frames from looping back to the RX queue. By introducing new HW capability self_lb_en_modifiable, which indicates the support to modify self_lb_en bit in modify_tir command. When this capability is set we can prevent TIRs from sending back loopback multicast traffic to their own RQs, by "refreshing TIRs" with modify_tir command, on every time new channels (SQs/RQs) are created at device open. This is needed since TIRs are static and only allocated once on driver load, and the loopback decision is under their responsibility. Fixes issues of the kind: "IPv6: eth2: IPv6 duplicate address fe80::e61d:2dff:fe5c:f2e9 detected!" The issue is seen since the IPv6 solicitations multicast messages are loopedback and the network stack thinks they are coming from another host. Fixes: 5c50368f3831 ("net/mlx5e: Light-weight netdev open/stop") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 48 +++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 24 +++++++----- 2 files changed, 62 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5fc4d2d78cdf..df001754bcd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1332,6 +1332,42 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt) return err; } +static int mlx5e_refresh_tir_self_loopback_enable(struct mlx5_core_dev *mdev, + u32 tirn) +{ + void *in; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1); + + err = mlx5_core_modify_tir(mdev, tirn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5e_priv *priv) +{ + int err; + int i; + + for (i = 0; i < MLX5E_NUM_TT; i++) { + err = mlx5e_refresh_tir_self_loopback_enable(priv->mdev, + priv->tirn[i]); + if (err) + return err; + } + + return 0; +} + static int mlx5e_set_dev_port_mtu(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -1376,6 +1412,13 @@ int mlx5e_open_locked(struct net_device *netdev) goto err_clear_state_opened_flag; } + err = mlx5e_refresh_tirs_self_loopback_enable(priv); + if (err) { + netdev_err(netdev, "%s: mlx5e_refresh_tirs_self_loopback_enable failed, %d\n", + __func__, err); + goto err_close_channels; + } + mlx5e_update_carrier(priv); mlx5e_redirect_rqts(priv); @@ -1383,6 +1426,8 @@ int mlx5e_open_locked(struct net_device *netdev) return 0; +err_close_channels: + mlx5e_close_channels(priv); err_clear_state_opened_flag: clear_bit(MLX5E_STATE_OPENED, &priv->state); return err; @@ -1909,6 +1954,9 @@ static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) "Not creating net device, some required device capabilities are missing\n"); return -ENOTSUPP; } + if (!MLX5_CAP_ETH(mdev, self_lb_en_modifiable)) + mlx5_core_warn(mdev, "Self loop back prevention is not supported\n"); + return 0; } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index dd2097455a2e..1565324eb620 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -453,26 +453,28 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 lro_cap[0x1]; u8 lro_psh_flag[0x1]; u8 lro_time_stamp[0x1]; - u8 reserved_0[0x6]; + u8 reserved_0[0x3]; + u8 self_lb_en_modifiable[0x1]; + u8 reserved_1[0x2]; u8 max_lso_cap[0x5]; - u8 reserved_1[0x4]; + u8 reserved_2[0x4]; u8 rss_ind_tbl_cap[0x4]; - u8 reserved_2[0x3]; + u8 reserved_3[0x3]; u8 tunnel_lso_const_out_ip_id[0x1]; - u8 reserved_3[0x2]; + u8 reserved_4[0x2]; u8 tunnel_statless_gre[0x1]; u8 tunnel_stateless_vxlan[0x1]; - u8 reserved_4[0x20]; + u8 reserved_5[0x20]; - u8 reserved_5[0x10]; + u8 reserved_6[0x10]; u8 lro_min_mss_size[0x10]; - u8 reserved_6[0x120]; + u8 reserved_7[0x120]; u8 lro_timer_supported_periods[4][0x20]; - u8 reserved_7[0x600]; + u8 reserved_8[0x600]; }; struct mlx5_ifc_roce_cap_bits { @@ -4051,9 +4053,11 @@ struct mlx5_ifc_modify_tis_in_bits { }; struct mlx5_ifc_modify_tir_bitmask_bits { - u8 reserved[0x20]; + u8 reserved_0[0x20]; - u8 reserved1[0x1f]; + u8 reserved_1[0x1b]; + u8 self_lb_en[0x1]; + u8 reserved_2[0x3]; u8 lro[0x1]; }; -- cgit v1.2.3-59-g8ed1b From b4fe85f9c9146f60457e9512fb6055e69e6a7a65 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 12 Nov 2015 17:35:58 +0100 Subject: ip_tunnel: disable preemption when updating per-cpu tstats Drivers like vxlan use the recently introduced udp_tunnel_xmit_skb/udp_tunnel6_xmit_skb APIs. udp_tunnel6_xmit_skb makes use of ip6tunnel_xmit, and ip6tunnel_xmit, after sending the packet, updates the struct stats using the usual u64_stats_update_begin/end calls on this_cpu_ptr(dev->tstats). udp_tunnel_xmit_skb makes use of iptunnel_xmit, which doesn't touch tstats, so drivers like vxlan, immediately after, call iptunnel_xmit_stats, which does the same thing - calls u64_stats_update_begin/end on this_cpu_ptr(dev->tstats). While vxlan is probably fine (I don't know?), calling a similar function from, say, an unbound workqueue, on a fully preemptable kernel causes real issues: [ 188.434537] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u8:0/6 [ 188.435579] caller is debug_smp_processor_id+0x17/0x20 [ 188.435583] CPU: 0 PID: 6 Comm: kworker/u8:0 Not tainted 4.2.6 #2 [ 188.435607] Call Trace: [ 188.435611] [] dump_stack+0x4f/0x7b [ 188.435615] [] check_preemption_disabled+0x19d/0x1c0 [ 188.435619] [] debug_smp_processor_id+0x17/0x20 The solution would be to protect the whole this_cpu_ptr(dev->tstats)/u64_stats_update_begin/end blocks with disabling preemption and then reenabling it. Signed-off-by: Jason A. Donenfeld Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 3 ++- include/net/ip_tunnels.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index aaee6fa02cf1..ff788b665277 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -90,11 +90,12 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); if (net_xmit_eval(err) == 0) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); + put_cpu_ptr(tstats); } else { stats->tx_errors++; stats->tx_aborted_errors++; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index f6dafec9102c..62a750a6a8f8 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -287,12 +287,13 @@ static inline void iptunnel_xmit_stats(int err, struct pcpu_sw_netstats __percpu *stats) { if (err > 0) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(stats); + struct pcpu_sw_netstats *tstats = get_cpu_ptr(stats); u64_stats_update_begin(&tstats->syncp); tstats->tx_bytes += err; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); + put_cpu_ptr(tstats); } else if (err < 0) { err_stats->tx_errors++; err_stats->tx_aborted_errors++; -- cgit v1.2.3-59-g8ed1b From 24cb7055a3066634a0f3fa0cd6a4780652905d35 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 16 Nov 2015 10:52:48 +0100 Subject: net: switchdev: fix return code of fdb_dump stub rtnl_fdb_dump always expects an index to be returned by the ndo_fdb_dump op, but when CONFIG_NET_SWITCHDEV is off, it returns an error. Fix that by returning the given unmodified idx. A similar fix was 0890cf6cb6ab ("switchdev: fix return value of switchdev_port_fdb_dump in case of error") but for the CONFIG_NET_SWITCHDEV=y case. Fixes: 45d4122ca7cd ("switchdev: add support for fdb add/del/dump via switchdev_port_obj ops.") Signed-off-by: Dragos Tatulea Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index bc865e244efe..1d22ce9f352e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -323,7 +323,7 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, struct net_device *filter_dev, int idx) { - return -EOPNOTSUPP; + return idx; } static inline void switchdev_port_fwd_mark_set(struct net_device *dev, -- cgit v1.2.3-59-g8ed1b From 28f9ee22bcdd84726dbf6267d0b58f254166b900 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 16 Nov 2015 15:43:45 -0500 Subject: vlan: Do not put vlan headers back on bridge and macvlan ports When a vlan is configured with REORDER_HEADER set to 0, the vlan header is put back into the packet and makes it appear that the vlan header is still there even after it's been processed. This posses a problem for bridge and macvlan ports. The packets passed to those device may be forwarded and at the time of the forward, vlan headers end up being unexpectedly present. With the patch, we make sure that we do not put the vlan header back (when REORDER_HEADER is 0) if a bridge or macvlan has been configured on top of the vlan device. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ net/8021q/vlan_core.c | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc221b967687..67bfac1abfc1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3857,6 +3857,11 @@ static inline bool netif_is_bridge_master(const struct net_device *dev) return dev->priv_flags & IFF_EBRIDGE; } +static inline bool netif_is_bridge_port(const struct net_device *dev) +{ + return dev->priv_flags & IFF_BRIDGE_PORT; +} + static inline bool netif_is_ovs_master(const struct net_device *dev) { return dev->priv_flags & IFF_OPENVSWITCH; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 496b27588493..e2ed69850489 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -30,7 +30,9 @@ bool vlan_do_receive(struct sk_buff **skbp) skb->pkt_type = PACKET_HOST; } - if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) { + if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) && + !netif_is_macvlan_port(vlan_dev) && + !netif_is_bridge_port(vlan_dev)) { unsigned int offset = skb->data - skb_mac_header(skb); /* -- cgit v1.2.3-59-g8ed1b From 819ec8e1f349f73bdf65bf33a364538e59007a9a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 16 Nov 2015 23:34:41 +0100 Subject: phy: marvell: Add support for 88E1540 PHY The 88E1540 can be found embedded in the Marvell 88E6352 switch. It is compatible with the 88E1510, so add support for it, using the 88E1510 specific functions. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 16 ++++++++++++++++ include/linux/marvell_phy.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 5de8d5827536..0240552b50f3 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1153,6 +1153,21 @@ static struct phy_driver marvell_drivers[] = { .suspend = &genphy_suspend, .driver = { .owner = THIS_MODULE }, }, + { + .phy_id = MARVELL_PHY_ID_88E1540, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .name = "Marvell 88E1540", + .features = PHY_GBIT_FEATURES, + .flags = PHY_HAS_INTERRUPT, + .config_aneg = &m88e1510_config_aneg, + .read_status = &marvell_read_status, + .ack_interrupt = &marvell_ack_interrupt, + .config_intr = &marvell_config_intr, + .did_interrupt = &m88e1121_did_interrupt, + .resume = &genphy_resume, + .suspend = &genphy_suspend, + .driver = { .owner = THIS_MODULE }, + }, { .phy_id = MARVELL_PHY_ID_88E3016, .phy_id_mask = MARVELL_PHY_ID_MASK, @@ -1186,6 +1201,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = { { MARVELL_PHY_ID_88E1318S, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1116R, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1510, MARVELL_PHY_ID_MASK }, + { MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK }, { } }; diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index e6982ac3200d..a57f0dfb6db7 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -16,6 +16,7 @@ #define MARVELL_PHY_ID_88E1318S 0x01410e90 #define MARVELL_PHY_ID_88E1116R 0x01410e40 #define MARVELL_PHY_ID_88E1510 0x01410dd0 +#define MARVELL_PHY_ID_88E1540 0x01410eb0 #define MARVELL_PHY_ID_88E3016 0x01410e60 /* struct phy_device dev_flags definitions */ -- cgit v1.2.3-59-g8ed1b