From 4aa2c466a7733af093a526e9d1cdd0b3b90d47e9 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Thu, 28 Oct 2010 02:00:43 +0000
Subject: fib: Fix fib zone and its hash leak on namespace stop

When we stop a namespace we flush the table and free it, but the added
fn_zone-s (and their hashes, if grown) are leaked and need to be freed.
The trie releases all of its structures in the flushing code.

Shame on us - this bug has existed since the very first make-fib-per-net
patches in 2.6.27 :(

Signed-off-by: Pavel Emelyanov
Signed-off-by: David S. Miller
---
 net/ipv4/fib_frontend.c |  2 +-
 net/ipv4/fib_hash.c     | 18 ++++++++++++++++++
 net/ipv4/fib_trie.c     |  5 +++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 36e27c2107de..eb6f69a8f27a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1052,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net)
 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
 			hlist_del(node);
 			fib_table_flush(tb);
-			kfree(tb);
+			fib_free_table(tb);
 		}
 	}
 	kfree(net->ipv4.fib_table_hash);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b232375a0b75..b3acb0417b21 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -716,6 +716,24 @@ int fib_table_flush(struct fib_table *tb)
 	return found;
 }
 
+void fib_free_table(struct fib_table *tb)
+{
+	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+	struct fn_zone *fz, *next;
+
+	next = table->fn_zone_list;
+	while (next != NULL) {
+		fz = next;
+		next = fz->fz_next;
+
+		if (fz->fz_hash != fz->fz_embedded_hash)
+			fz_hash_free(fz->fz_hash, fz->fz_divisor);
+
+		kfree(fz);
+	}
+
+	kfree(tb);
+}
 
 static inline int
 fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b14450895102..200eb538fbb3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1797,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb)
 	return found;
 }
 
+void fib_free_table(struct fib_table *tb)
+{
+	kfree(tb);
+}
+
 void fib_table_select_default(struct fib_table *tb,
 			      const struct flowi *flp,
 			      struct fib_result *res)
--
cgit v1.2.3-59-g8ed1b


From 64e46749224aa658d8fc0d37ea83ab20b1d7955d Mon Sep 17 00:00:00 2001
From: Patrick McHardy
Date: Fri, 29 Oct 2010 16:28:07 +0200
Subject: netfilter: nf_nat: fix compiler warning with CONFIG_NF_CT_NETLINK=n

net/ipv4/netfilter/nf_nat_core.c:52: warning: 'nf_nat_proto_find_get' defined but not used
net/ipv4/netfilter/nf_nat_core.c:66: warning: 'nf_nat_proto_put' defined but not used

Reported-by: Geert Uytterhoeven
Signed-off-by: Patrick McHardy
---
 net/ipv4/netfilter/nf_nat_core.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 295c97431e43..c04787ce1a71 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -47,26 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum)
 	return rcu_dereference(nf_nat_protos[protonum]);
 }
-static const struct nf_nat_protocol *
-nf_nat_proto_find_get(u_int8_t protonum)
-{
-	const struct nf_nat_protocol *p;
-
-	rcu_read_lock();
-	p = __nf_nat_proto_find(protonum);
-	if (!try_module_get(p->me))
-		p = &nf_nat_unknown_protocol;
-	rcu_read_unlock();
-
-	return p;
-}
-
-static void
-nf_nat_proto_put(const struct nf_nat_protocol *p)
-{
-	module_put(p->me);
-}
-
 /* We keep an extra hash for each conntrack, for fast searching.
  */
 static inline unsigned int
 hash_by_src(const struct net *net, u16 zone,
@@ -588,6 +568,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
 #include
 #include
 
+static const struct nf_nat_protocol *
+nf_nat_proto_find_get(u_int8_t protonum)
+{
+	const struct nf_nat_protocol *p;
+
+	rcu_read_lock();
+	p = __nf_nat_proto_find(protonum);
+	if (!try_module_get(p->me))
+		p = &nf_nat_unknown_protocol;
+	rcu_read_unlock();
+
+	return p;
+}
+
+static void
+nf_nat_proto_put(const struct nf_nat_protocol *p)
+{
+	module_put(p->me);
+}
+
 static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
 	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
 	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
--
cgit v1.2.3-59-g8ed1b


From 3285ee3bb2e158299ff19b947e41da735980d954 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 30 Oct 2010 16:21:28 -0700
Subject: ip_gre: fix fallback tunnel setup

Before making the fallback tunnel visible to lookups, we should make
sure it is completely set up, i.e. that ipgre_tunnel_init() has been
called and the tstats per-cpu pointer allocated.

Move rcu_assign_pointer(ign->tunnels_wc[0], tunnel); from
ipgre_fb_tunnel_init() to ipgre_init_net().

Based on a patch from Pavel Emelyanov

Reported-by: Pavel Emelyanov
Signed-off-by: Eric Dumazet
Acked-by: Pavel Emelyanov
Signed-off-by: David S. Miller
---
 net/ipv4/ip_gre.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 01087e035b7d..70ff77f02eee 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1325,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
 
 	tunnel->dev = dev;
 	strcpy(tunnel->parms.name, dev->name);
@@ -1336,7 +1335,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 	tunnel->hlen		= sizeof(struct iphdr) + 4;
 
 	dev_hold(dev);
-	rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
 }
 
 
@@ -1383,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net)
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
 		goto err_reg_dev;
 
+	rcu_assign_pointer(ign->tunnels_wc[0],
+			   netdev_priv(ign->fb_tunnel_dev));
 	return 0;
 
 err_reg_dev:
-	free_netdev(ign->fb_tunnel_dev);
+	ipgre_dev_free(ign->fb_tunnel_dev);
 err_alloc_dev:
 	return err;
 }
--
cgit v1.2.3-59-g8ed1b


From 1a8b7a67224eb0c9dbd883b9bfc4938278bad370 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov
Date: Wed, 3 Nov 2010 08:44:12 +0100
Subject: ipv4: netfilter: arp_tables: fix information leak to userland

Structure arpt_getinfo is copied to userland with the field "name"
that has its last elements uninitialized. This leads to leaking the
contents of kernel stack memory.
Signed-off-by: Vasiliy Kulikov
Signed-off-by: Patrick McHardy
---
 net/ipv4/netfilter/arp_tables.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3cad2591ace0..3fac340a28d5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -927,6 +927,7 @@ static int get_info(struct net *net, void __user *user,
 		private = &tmp;
 	}
 #endif
+	memset(&info, 0, sizeof(info));
 	info.valid_hooks = t->valid_hooks;
 	memcpy(info.hook_entry, private->hook_entry,
 	       sizeof(info.hook_entry));
--
cgit v1.2.3-59-g8ed1b


From b5f15ac4f89f84853544c934fc7a744289e95e34 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov
Date: Wed, 3 Nov 2010 08:45:06 +0100
Subject: ipv4: netfilter: ip_tables: fix information leak to userland

Structure ipt_getinfo is copied to userland with the field "name"
that has its last elements uninitialized. This leads to leaking the
contents of kernel stack memory.

Signed-off-by: Vasiliy Kulikov
Signed-off-by: Patrick McHardy
---
 net/ipv4/netfilter/ip_tables.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d31b007a6d80..a846d633b3b6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1124,6 +1124,7 @@ static int get_info(struct net *net, void __user *user,
 		private = &tmp;
 	}
 #endif
+	memset(&info, 0, sizeof(info));
 	info.valid_hooks = t->valid_hooks;
 	memcpy(info.hook_entry, private->hook_entry,
 	       sizeof(info.hook_entry));
--
cgit v1.2.3-59-g8ed1b


From 1f1b9c9990205759aae31b7734b0ede41a867f32 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 4 Nov 2010 01:21:39 +0000
Subject: fib: fib_result_assign() should not change fib refcounts

After commit ebc0ffae5 (RCU conversion of fib_lookup()),
fib_result_assign() should not change fib refcounts anymore.

Thanks to Michael who did the bisection and bug report.

Reported-by: Michael Ellerman
Tested-by: Michael Ellerman
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/fib_lookup.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index a29edf2219c8..c079cc0ec651 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -47,11 +47,8 @@ extern int fib_detect_death(struct fib_info *fi, int order,
 static inline void fib_result_assign(struct fib_result *res,
 				     struct fib_info *fi)
 {
-	if (res->fi != NULL)
-		fib_info_put(res->fi);
+	/* we used to play games with refcounts, but we now use RCU */
 	res->fi = fi;
-	if (fi != NULL)
-		atomic_inc(&fi->fib_clntref);
 }
 
 #endif /* _FIB_LOOKUP_H */
--
cgit v1.2.3-59-g8ed1b


From 22e76c849d505d87c5ecf3d3e6742a65f0ff4860 Mon Sep 17 00:00:00 2001
From: Nelson Elhage
Date: Wed, 3 Nov 2010 16:35:41 +0000
Subject: inet_diag: Make sure we actually run the same bytecode we audited.

We were using nlmsg_find_attr() to look up the bytecode by attribute
when auditing, but then just using the first attribute when actually
running the bytecode. So, if we received a message with two attribute
elements, where only the second had type INET_DIAG_REQ_BYTECODE, we
would validate and run different bytecode strings.

Fix this by consistently using nlmsg_find_attr everywhere.

Signed-off-by: Nelson Elhage
Signed-off-by: Thomas Graf
Signed-off-by: David S. Miller
---
 net/ipv4/inet_diag.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ba8042665849..2ada17129fce 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -490,9 +490,11 @@ static int inet_csk_diag_dump(struct sock *sk,
 {
 	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
 
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
 		struct inet_diag_entry entry;
-		struct rtattr *bc = (struct rtattr *)(r + 1);
+		const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
+							  sizeof(*r),
+							  INET_DIAG_REQ_BYTECODE);
 		struct inet_sock *inet = inet_sk(sk);
 
 		entry.family = sk->sk_family;
@@ -512,7 +514,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 		entry.dport = ntohs(inet->inet_dport);
 		entry.userlocks = sk->sk_userlocks;
 
-		if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+		if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
 			return 0;
 	}
 
@@ -527,9 +529,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 {
 	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
 
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
 		struct inet_diag_entry entry;
-		struct rtattr *bc = (struct rtattr *)(r + 1);
+		const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
+							  sizeof(*r),
+							  INET_DIAG_REQ_BYTECODE);
 
 		entry.family = tw->tw_family;
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -548,7 +552,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 		entry.dport = ntohs(tw->tw_dport);
 		entry.userlocks = 0;
 
-		if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+		if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
 			return 0;
 	}
 
@@ -618,7 +622,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt;
-	struct rtattr *bc = NULL;
+	const struct nlattr *bc = NULL;
 	struct inet_sock *inet = inet_sk(sk);
 	int j, s_j;
 	int reqnum, s_reqnum;
@@ -638,8 +642,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	if (!lopt || !lopt->qlen)
 		goto out;
 
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
-		bc = (struct rtattr *)(r + 1);
+	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
+		bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
+				     INET_DIAG_REQ_BYTECODE);
 		entry.sport = inet->inet_num;
 		entry.userlocks = sk->sk_userlocks;
 	}
@@ -672,8 +677,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 				&ireq->rmt_addr;
 			entry.dport = ntohs(ireq->rmt_port);
 
-			if (!inet_diag_bc_run(RTA_DATA(bc),
-					      RTA_PAYLOAD(bc), &entry))
+			if (!inet_diag_bc_run(nla_data(bc),
+					      nla_len(bc), &entry))
 				continue;
 		}
 
--
cgit v1.2.3-59-g8ed1b


From 18943d292facbc70e6a36fc62399ae833f64671b Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 8 Nov 2010 11:15:54 +0000
Subject: inet: fix ip_mc_drop_socket()

commit 8723e1b4ad9be4444 (inet: RCU changes in inetdev_by_index())
forgot one call site in ip_mc_drop_socket().

We should not decrease the idev refcount after the inetdev_by_index()
call, since the refcount is no longer increased there.

Reported-by: Markus Trippelsdorf
Reported-by: Miles Lane
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/igmp.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c8877c6c7216..3c53c2d89e3b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2306,10 +2306,8 @@ void ip_mc_drop_socket(struct sock *sk)
 		in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
 		(void) ip_mc_leave_src(sk, iml, in_dev);
-		if (in_dev != NULL) {
+		if (in_dev != NULL)
 			ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
-			in_dev_put(in_dev);
-		}
 		/* decrease mem now to avoid the memleak warning */
 		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
 		call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
 
--
cgit v1.2.3-59-g8ed1b


From 8d987e5c75107ca7515fa19e857cfa24aab6ec8f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 9 Nov 2010 23:24:26 +0000
Subject: net: avoid limits overflow

Robin Holt tried to boot a 16TB machine and found some limits were
reached: sysctl_tcp_mem[2], sysctl_udp_mem[2]

We can switch the infrastructure to use "long" instead of "int", now
that atomic_long_t primitives are available for free.

Signed-off-by: Eric Dumazet
Reported-by: Robin Holt
Reviewed-by: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: David S. Miller
---
 include/net/dn.h               |  2 +-
 include/net/sock.h             |  4 ++--
 include/net/tcp.h              |  6 +++---
 include/net/udp.h              |  4 ++--
 net/core/sock.c                | 14 +++++++-------
 net/decnet/af_decnet.c         |  2 +-
 net/decnet/sysctl_net_decnet.c |  4 ++--
 net/ipv4/proc.c                |  8 ++++----
 net/ipv4/sysctl_net_ipv4.c     |  5 ++---
 net/ipv4/tcp.c                 |  4 ++--
 net/ipv4/tcp_input.c           | 11 +++++++----
 net/ipv4/udp.c                 |  4 ++--
 net/sctp/protocol.c            |  2 +-
 net/sctp/socket.c              |  4 ++--
 net/sctp/sysctl.c              |  4 ++--
 15 files changed, 40 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dn.h b/include/net/dn.h
index e5469f7b67a3..a514a3cf4573 100644
--- a/include/net/dn.h
+++ b/include/net/dn.h
@@ -225,7 +225,7 @@ extern int decnet_di_count;
 extern int decnet_dr_count;
 extern int decnet_no_fc_max_cwnd;
 
-extern int sysctl_decnet_mem[3];
+extern long sysctl_decnet_mem[3];
 extern int sysctl_decnet_wmem[3];
 extern int sysctl_decnet_rmem[3];
 
diff --git a/include/net/sock.h b/include/net/sock.h
index c7a736228ca2..a6338d039857 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -762,7 +762,7 @@ struct proto {
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
-	atomic_t		*memory_allocated;	/* Current allocated memory. */
+	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
	 * Pressure flag: try to collapse.
@@ -771,7 +771,7 @@ struct proto {
 	 * is strict, actions are advisory and have some latency.
 	 */
 	int			*memory_pressure;
-	int			*sysctl_mem;
+	long			*sysctl_mem;
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4fee0424af7e..e36c874c7fb1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -224,7 +224,7 @@ extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_ecn;
 extern int sysctl_tcp_dsack;
-extern int sysctl_tcp_mem[3];
+extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
@@ -247,7 +247,7 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 
-extern atomic_t tcp_memory_allocated;
+extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
@@ -280,7 +280,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 	}
 
 	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
+	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
 		return true;
 	return false;
 }
diff --git a/include/net/udp.h b/include/net/udp.h
index 200b82848c9a..bb967dd59bf7 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -105,10 +105,10 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
-extern atomic_t udp_memory_allocated;
+extern atomic_long_t udp_memory_allocated;
 
 /* sysctl variables for udp */
-extern int sysctl_udp_mem[3];
+extern long sysctl_udp_mem[3];
 extern int sysctl_udp_rmem_min;
 extern int sysctl_udp_wmem_min;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 3eed5424e659..fb6080111461 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1653,10 +1653,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
 	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
-	int allocated;
+	long allocated;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_add_return(amt, prot->memory_allocated);
+	allocated = atomic_long_add_return(amt, prot->memory_allocated);
 
 	/* Under limit. */
 	if (allocated <= prot->sysctl_mem[0]) {
@@ -1714,7 +1714,7 @@ suppress_allocation:
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_sub(amt, prot->memory_allocated);
+	atomic_long_sub(amt, prot->memory_allocated);
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1727,12 +1727,12 @@ void __sk_mem_reclaim(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
 
-	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
 		   prot->memory_allocated);
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
 	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
 		*prot->memory_pressure = 0;
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -2452,12 +2452,12 @@ static char proto_method_implemented(const void *method)
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
-	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
+	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
+		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d6b93d19790f..a76b78de679f 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -155,7 +155,7 @@ static const struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
-static atomic_t decnet_memory_allocated;
+static atomic_long_t decnet_memory_allocated;
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
 static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index be3eb8e23288..28f8b5e5f73b 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -38,7 +38,7 @@ int decnet_log_martians = 1;
 int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
 
 /* Reasonable defaults, I hope, based on tcp's defaults */
-int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
+long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
 int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 
@@ -324,7 +324,7 @@ static ctl_table dn_table[] = {
 		.data = &sysctl_decnet_mem,
 		.maxlen = sizeof(sysctl_decnet_mem),
 		.mode = 0644,
-		.proc_handler = proc_dointvec,
+		.proc_handler = proc_doulongvec_minmax
 	},
 	{
 		.procname = "decnet_rmem",
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4ae1f203f7cb..1b48eb1ed453 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	local_bh_enable();
 
 	socket_seq_show(seq);
-	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   tcp_death_row.tw_count, sockets,
-		   atomic_read(&tcp_memory_allocated));
-	seq_printf(seq, "UDP: inuse %d mem %d\n",
+		   atomic_long_read(&tcp_memory_allocated));
+	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
-		   atomic_read(&udp_memory_allocated));
+		   atomic_long_read(&udp_memory_allocated));
 	seq_printf(seq, "UDPLITE: inuse %d\n",
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d96c1da4b17c..e91911d7aae2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -398,7 +398,7 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_tcp_mem,
 		.maxlen		= sizeof(sysctl_tcp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_doulongvec_minmax
 	},
 	{
 		.procname	= "tcp_wmem",
@@ -602,8 +602,7 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "udp_rmem_min",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1664a0590bb8..245603c4ad48 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
-int sysctl_tcp_mem[3] __read_mostly;
+long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
-atomic_t tcp_memory_allocated;	/* Current allocated memory. */
+atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
 /*
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3357f69e353d..6d8ab1c4efc3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
 	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
 		     sizeof(struct sk_buff);
 
-	if (sk->sk_sndbuf < 3 * sndmem)
-		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+	if (sk->sk_sndbuf < 3 * sndmem) {
+		sk->sk_sndbuf = 3 * sndmem;
+		if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
+			sk->sk_sndbuf = sysctl_tcp_wmem[2];
+	}
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_memory_pressure &&
-	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
 	}
@@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
-	if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 28cb2d733a3c..5e0a3a582a59 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -110,7 +110,7 @@ struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
 
-int sysctl_udp_mem[3] __read_mostly;
+long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
 int sysctl_udp_rmem_min __read_mostly;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
 int sysctl_udp_wmem_min __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_wmem_min);
 
-atomic_t udp_memory_allocated;
+atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
 #define MAX_UDP_PORTS 65536
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 1ef29c74d85e..e58f9476f29c 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -92,7 +92,7 @@ static struct sctp_af *sctp_af_v6_specific;
 struct kmem_cache *sctp_chunk_cachep __read_mostly;
 struct kmem_cache *sctp_bucket_cachep __read_mostly;
 
-int sysctl_sctp_mem[3];
+long sysctl_sctp_mem[3];
 int sysctl_sctp_rmem[3];
 int sysctl_sctp_wmem[3];
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e34ca9cc1167..6bd554323a34 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -111,12 +111,12 @@ static void sctp_sock_migrate(struct sock *, struct sock *,
 static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG;
 
 extern struct kmem_cache *sctp_bucket_cachep;
-extern int sysctl_sctp_mem[3];
+extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
 extern int sysctl_sctp_wmem[3];
 
 static int sctp_memory_pressure;
-static atomic_t sctp_memory_allocated;
+static atomic_long_t sctp_memory_allocated;
 struct percpu_counter sctp_sockets_allocated;
 
 static void sctp_enter_memory_pressure(struct sock *sk)
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 832590bbe0c0..50cb57f0919e 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -54,7 +54,7 @@ static int sack_timer_max = 500;
 static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
 static int rwnd_scale_max = 16;
 
-extern int sysctl_sctp_mem[3];
+extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
 extern int sysctl_sctp_wmem[3];
 
@@ -203,7 +203,7 @@ static ctl_table sctp_table[] = {
 		.data		= &sysctl_sctp_mem,
 		.maxlen		= sizeof(sysctl_sctp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_doulongvec_minmax
 	},
 	{
 		.procname	= "sctp_rmem",
--
cgit v1.2.3-59-g8ed1b


From 7a1abd08d52fdeddb3e9a5a33f2f15cc6a5674d2 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Wed, 10 Nov 2010 21:35:37 -0800
Subject: tcp: Increase TCP_MAXSEG socket option minimum.

As noted by Steve Chen, since commit
f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 ("tcp: advertise MSS requested
by user") we can end up with a situation where tcp_select_initial_window()
does a divide by a zero (or even negative) mss value.

The problem is that sometimes we effectively subtract
TCPOLEN_TSTAMP_ALIGNED and/or TCPOLEN_MD5SIG_ALIGNED from the mss.

Fix this by increasing the minimum from 8 to 64.

Reported-by: Steve Chen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 245603c4ad48..081419969485 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		/* Values greater than interface MTU won't take effect. However
 		 * at the point when this call is done we typically don't yet
 		 * know which interface is going to be used */
-		if (val < 8 || val > MAX_TCP_WINDOW) {
+		if (val < 64 || val > MAX_TCP_WINDOW) {
 			err = -EINVAL;
 			break;
 		}
--
cgit v1.2.3-59-g8ed1b


From 8f49c2703b33519aaaccc63f571b465b9d2b3a2d Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Fri, 12 Nov 2010 13:35:00 -0800
Subject: tcp: Don't change unlocked socket state in tcp_v4_err().

Alexey Kuznetsov noticed a regression introduced by commit
f1ecd5d9e7366609d640ff4040304ea197fbc618 ("Revert Backoff [v3]: Revert
RTO on ICMP destination unreachable")

The RTO and timer modification code added to tcp_v4_err() doesn't check
sock_owned_by_user(), which if true means we don't have exclusive access
to the socket and therefore cannot modify its critical state.

Just skip this new code block if sock_owned_by_user() is true and
eliminate the now superfluous sock_owned_by_user() code block contained
within.

Reported-by: Alexey Kuznetsov
Signed-off-by: David S. Miller
CC: Damian Lukowski
Acked-by: Eric Dumazet
---
 net/ipv4/tcp_ipv4.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8f8527d41682..69ccbc1dde9c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -415,6 +415,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		    !icsk->icsk_backoff)
 			break;
 
+		if (sock_owned_by_user(sk))
+			break;
+
 		icsk->icsk_backoff--;
 		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 					 icsk->icsk_backoff;
@@ -429,11 +432,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		if (remaining) {
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  remaining, TCP_RTO_MAX);
-		} else if (sock_owned_by_user(sk)) {
-			/* RTO revert clocked out retransmission,
-			 * but socket is locked. Will defer. */
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  HZ/20, TCP_RTO_MAX);
 		} else {
 			/* RTO revert clocked out retransmission.
 			 * Will retransmit now */
--
cgit v1.2.3-59-g8ed1b
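
A note on the pattern behind the last fix: tcp_v4_err() runs from softirq context, so it may only rewrite connection-socket state it owns exclusively, and must bail out while a process holds the socket lock. The following C sketch illustrates only that guard; it is hypothetical code (the helper name example_icmp_backoff_revert is invented) and is not part of any patch in this series.

	/* Illustrative sketch only, not committed kernel code. */
	#include <net/sock.h>
	#include <net/inet_connection_sock.h>

	static void example_icmp_backoff_revert(struct sock *sk)
	{
		struct inet_connection_sock *icsk = inet_csk(sk);

		if (sock_owned_by_user(sk))
			return;	/* socket locked by user context: leave icsk state alone */

		if (icsk->icsk_backoff)
			icsk->icsk_backoff--;	/* exclusive access: safe to revert the backoff */

		/* ...recompute icsk_rto and re-arm the retransmit timer here... */
	}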