aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@mtd.linutronix.de>2005-11-06 15:36:37 +0100
committerThomas Gleixner <tglx@mtd.linutronix.de>2005-11-06 15:36:37 +0100
commit2fc2991175bf77395e6b15fe6b2304d3bf72da40 (patch)
treeb0ff38c09240e7c00e1577d447ebe89143d752dc /net/ipv4
parent[MTD] mtdchar: Return EINVAL for bad seeks instead of fixing up to valid byte (diff)
parent[PATCH] nvidiafb: Geforce 7800 series support added (diff)
downloadlinux-dev-2fc2991175bf77395e6b15fe6b2304d3bf72da40.tar.xz
linux-dev-2fc2991175bf77395e6b15fe6b2304d3bf72da40.zip
Merge branch 'master' of /home/tglx/work/mtd/git/linux-2.6.git/
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig17
-rw-r--r--net/ipv4/Makefile8
-rw-r--r--net/ipv4/af_inet.c192
-rw-r--r--net/ipv4/ah4.c18
-rw-r--r--net/ipv4/arp.c27
-rw-r--r--net/ipv4/datagram.c3
-rw-r--r--net/ipv4/devinet.c32
-rw-r--r--net/ipv4/esp4.c53
-rw-r--r--net/ipv4/fib_frontend.c12
-rw-r--r--net/ipv4/fib_hash.c4
-rw-r--r--net/ipv4/fib_lookup.h1
-rw-r--r--net/ipv4/fib_semantics.c20
-rw-r--r--net/ipv4/fib_trie.c2080
-rw-r--r--net/ipv4/icmp.c34
-rw-r--r--net/ipv4/igmp.c11
-rw-r--r--net/ipv4/inet_connection_sock.c641
-rw-r--r--net/ipv4/inet_diag.c868
-rw-r--r--net/ipv4/inet_hashtables.c165
-rw-r--r--net/ipv4/inet_timewait_sock.c385
-rw-r--r--net/ipv4/inetpeer.c19
-rw-r--r--net/ipv4/ip_forward.c6
-rw-r--r--net/ipv4/ip_fragment.c14
-rw-r--r--net/ipv4/ip_gre.c4
-rw-r--r--net/ipv4/ip_input.c141
-rw-r--r--net/ipv4/ip_options.c52
-rw-r--r--net/ipv4/ip_output.c115
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/ipcomp.c9
-rw-r--r--net/ipv4/ipconfig.c15
-rw-r--r--net/ipv4/ipmr.c8
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c45
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c25
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c20
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c2
-rw-r--r--net/ipv4/multipath_drr.c2
-rw-r--r--net/ipv4/netfilter.c139
-rw-r--r--net/ipv4/netfilter/Kconfig133
-rw-r--r--net/ipv4/netfilter/Makefile20
-rw-r--r--net/ipv4/netfilter/arp_tables.c215
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c21
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c582
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c28
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_pptp.c806
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netbios_ns.c142
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c1622
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c328
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c72
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c10
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c79
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c51
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c7
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c141
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c12
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_pptp.c401
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_gre.c214
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c24
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c21
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c37
-rw-r--r--net/ipv4/netfilter/ip_queue.c58
-rw-r--r--net/ipv4/netfilter/ip_tables.c22
-rw-r--r--net/ipv4/netfilter/ipt_CLASSIFY.c4
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c225
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c15
-rw-r--r--net/ipv4/netfilter/ipt_DSCP.c3
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c23
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c86
-rw-r--r--net/ipv4/netfilter/ipt_MARK.c22
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c11
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c8
-rw-r--r--net/ipv4/netfilter/ipt_NFQUEUE.c70
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c16
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c6
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c10
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c3
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c119
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c53
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c2
-rw-r--r--net/ipv4/netfilter/ipt_connbytes.c162
-rw-r--r--net/ipv4/netfilter/ipt_connmark.c7
-rw-r--r--net/ipv4/netfilter/ipt_dccp.c176
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c2
-rw-r--r--net/ipv4/netfilter/ipt_mark.c7
-rw-r--r--net/ipv4/netfilter/ipt_owner.c133
-rw-r--r--net/ipv4/netfilter/ipt_string.c91
-rw-r--r--net/ipv4/proc.c9
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c9
-rw-r--r--net/ipv4/route.c43
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c47
-rw-r--r--net/ipv4/tcp.c421
-rw-r--r--net/ipv4/tcp_bic.c50
-rw-r--r--net/ipv4/tcp_cong.c44
-rw-r--r--net/ipv4/tcp_diag.c784
-rw-r--r--net/ipv4/tcp_highspeed.c17
-rw-r--r--net/ipv4/tcp_htcp.c53
-rw-r--r--net/ipv4/tcp_hybla.c31
-rw-r--r--net/ipv4/tcp_input.c556
-rw-r--r--net/ipv4/tcp_ipv4.c944
-rw-r--r--net/ipv4/tcp_minisocks.c605
-rw-r--r--net/ipv4/tcp_output.c338
-rw-r--r--net/ipv4/tcp_scalable.c6
-rw-r--r--net/ipv4/tcp_timer.c253
-rw-r--r--net/ipv4/tcp_vegas.c50
-rw-r--r--net/ipv4/tcp_westwood.c64
-rw-r--r--net/ipv4/udp.c41
-rw-r--r--net/ipv4/xfrm4_state.c2
118 files changed, 10585 insertions, 5397 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0b3d9f1d8069..e55136ae09f4 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -413,20 +413,19 @@ config INET_TUNNEL
If unsure, say Y.
-config IP_TCPDIAG
- tristate "IP: TCP socket monitoring interface"
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
default y
---help---
- Support for TCP socket monitoring interface used by native Linux
- tools such as ss. ss is included in iproute2, currently downloadable
- at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
- and have selected IPv6 as a module, you need to build this as a
- module too.
+ Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at <http://developer.osdl.org/dev/iproute2>.
If unsure, say Y.
-config IP_TCPDIAG_IPV6
- def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
config TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 55dc6cca1e7b..f0435d00db6b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -4,11 +4,12 @@
obj-y := route.o inetpeer.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
- ip_output.o ip_sockglue.o \
+ ip_output.o ip_sockglue.o inet_hashtables.o \
+ inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
- sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
+ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 163ae4068b5f..a9d84f93442c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
@@ -112,11 +113,7 @@
#include <linux/mroute.h>
#endif
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet_sock_nr;
-#endif
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
extern void ip_mc_drop_socket(struct sock *sk);
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
if (inet->opt)
kfree(inet->opt);
dst_release(sk->sk_dst_cache);
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet_sock_nr);
- printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
- sk, atomic_read(&inet_sock_nr));
-#endif
+ sk_refcnt_debug_dec(sk);
}
/*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- err = tcp_listen_start(sk);
+ err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
if (err)
goto out;
}
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
- int err;
+ int try_loading_module = 0;
+ int err = -ESOCKTNOSUPPORT;
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
answer = NULL;
+lookup_protocol:
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
answer = NULL;
}
- err = -ESOCKTNOSUPPORT;
- if (!answer)
- goto out_rcu_unlock;
+ if (unlikely(answer == NULL)) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-2-proto-132-type-1
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-2-proto-132
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
inet->mc_index = 0;
inet->mc_list = NULL;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(sk);
if (inet->num) {
/* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
.owner = THIS_MODULE,
};
-
-extern void tcp_init(void);
-extern void tcp_v4_init(struct net_proto_family *);
-
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
}
}
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+ struct rtable *rt;
+ __u32 old_saddr = inet->saddr;
+ __u32 new_saddr;
+ __u32 daddr = inet->daddr;
+
+ if (inet->opt && inet->opt->srr)
+ daddr = inet->opt->faddr;
+
+ /* Query new route. */
+ err = ip_route_connect(&rt, daddr, 0,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if,
+ sk->sk_protocol,
+ inet->sport, inet->dport, sk);
+ if (err)
+ return err;
+
+ sk_setup_caps(sk, &rt->u.dst);
+
+ new_saddr = rt->rt_src;
+
+ if (new_saddr == old_saddr)
+ return 0;
+
+ if (sysctl_ip_dynaddr > 1) {
+ printk(KERN_INFO "%s(): shifting inet->"
+ "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ __FUNCTION__,
+ NIPQUAD(old_saddr),
+ NIPQUAD(new_saddr));
+ }
+
+ inet->saddr = inet->rcv_saddr = new_saddr;
+
+ /*
+ * XXX The only one ugly spot where we need to
+ * XXX really change the sockets identity after
+ * XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for troubles.
+ */
+ __sk_prot_rehash(sk);
+ return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+ u32 daddr;
+ int err;
+
+ /* Route is OK, nothing to do. */
+ if (rt)
+ return 0;
+
+ /* Reroute. */
+ daddr = inet->daddr;
+ if (inet->opt && inet->opt->srr)
+ daddr = inet->opt->faddr;
+{
+ struct flowi fl = {
+ .oif = sk->sk_bound_dev_if,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = daddr,
+ .saddr = inet->saddr,
+ .tos = RT_CONN_FLAGS(sk),
+ },
+ },
+ .proto = sk->sk_protocol,
+ .uli_u = {
+ .ports = {
+ .sport = inet->sport,
+ .dport = inet->dport,
+ },
+ },
+ };
+
+ err = ip_route_output_flow(&rt, &fl, sk, 0);
+}
+ if (!err)
+ sk_setup_caps(sk, &rt->u.dst);
+ else {
+ /* Routing failed... */
+ sk->sk_route_caps = 0;
+ /*
+ * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+ */
+ if (!sysctl_ip_dynaddr ||
+ sk->sk_state != TCP_SYN_SENT ||
+ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+ (err = inet_sk_reselect_saddr(sk)) != 0)
+ sk->sk_err_soft = -err;
+ }
+
+ return err;
+}
+
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
#ifdef CONFIG_IP_MULTICAST
static struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
}
static int ipv4_proc_init(void);
-extern void ipfrag_init(void);
/*
* IP protocol layer initialiser
@@ -1128,20 +1248,6 @@ module_init(inet_init);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
-extern int fib_proc_init(void);
-extern void fib_proc_exit(void);
-#ifdef CONFIG_IP_FIB_TRIE
-extern int fib_stat_proc_init(void);
-extern void fib_stat_proc_exit(void);
-#endif
-extern int ip_misc_proc_init(void);
-extern int raw_proc_init(void);
-extern void raw_proc_exit(void);
-extern int tcp4_proc_init(void);
-extern void tcp4_proc_exit(void);
-extern int udp4_proc_init(void);
-extern void udp4_proc_exit(void);
-
static int __init ipv4_proc_init(void)
{
int rc = 0;
@@ -1154,19 +1260,11 @@ static int __init ipv4_proc_init(void)
goto out_udp;
if (fib_proc_init())
goto out_fib;
-#ifdef CONFIG_IP_FIB_TRIE
- if (fib_stat_proc_init())
- goto out_fib_stat;
-#endif
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
-#ifdef CONFIG_IP_FIB_TRIE
- fib_stat_proc_exit();
-out_fib_stat:
-#endif
fib_proc_exit();
out_fib:
udp4_proc_exit();
@@ -1205,7 +1303,3 @@ EXPORT_SYMBOL(inet_stream_ops);
EXPORT_SYMBOL(inet_unregister_protosw);
EXPORT_SYMBOL(net_statistics);
EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
-
-#ifdef INET_REFCNT_DEBUG
-EXPORT_SYMBOL(inet_sock_nr);
-#endif
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 514c85b2631a..035ad2c9e1ba 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -263,10 +263,8 @@ static int ah_init_state(struct xfrm_state *x)
error:
if (ahp) {
- if (ahp->work_icv)
- kfree(ahp->work_icv);
- if (ahp->tfm)
- crypto_free_tfm(ahp->tfm);
+ kfree(ahp->work_icv);
+ crypto_free_tfm(ahp->tfm);
kfree(ahp);
}
return -EINVAL;
@@ -279,14 +277,10 @@ static void ah_destroy(struct xfrm_state *x)
if (!ahp)
return;
- if (ahp->work_icv) {
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- }
- if (ahp->tfm) {
- crypto_free_tfm(ahp->tfm);
- ahp->tfm = NULL;
- }
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
kfree(ahp);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd612853..b425748f02d7 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -241,7 +241,7 @@ static int arp_constructor(struct neighbour *neigh)
neigh->type = inet_addr_type(addr);
rcu_read_lock();
- in_dev = rcu_dereference(__in_dev_get(dev));
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev == NULL) {
rcu_read_unlock();
return -EINVAL;
@@ -697,12 +697,6 @@ void arp_send(int type, int ptype, u32 dest_ip,
arp_xmit(skb);
}
-static void parp_redo(struct sk_buff *skb)
-{
- nf_reset(skb);
- arp_rcv(skb, skb->dev, NULL);
-}
-
/*
* Process an arp request.
*/
@@ -865,7 +859,7 @@ static int arp_process(struct sk_buff *skb)
if (n)
neigh_release(n);
- if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
in_dev->arp_parms->proxy_delay == 0) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -922,12 +916,17 @@ out:
return 0;
}
+static void parp_redo(struct sk_buff *skb)
+{
+ arp_process(skb);
+}
+
/*
* Receive an arp request from the device layer.
*/
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct arphdr *arp;
@@ -948,6 +947,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
goto out_of_mem;
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
freeskb:
@@ -988,8 +989,8 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev)
ipv4_devconf.proxy_arp = 1;
return 0;
}
- if (__in_dev_get(dev)) {
- __in_dev_get(dev)->cnf.proxy_arp = 1;
+ if (__in_dev_get_rtnl(dev)) {
+ __in_dev_get_rtnl(dev)->cnf.proxy_arp = 1;
return 0;
}
return -ENXIO;
@@ -1094,8 +1095,8 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev)
ipv4_devconf.proxy_arp = 0;
return 0;
}
- if (__in_dev_get(dev)) {
- __in_dev_get(dev)->cnf.proxy_arp = 0;
+ if (__in_dev_get_rtnl(dev)) {
+ __in_dev_get_rtnl(dev)->cnf.proxy_arp = 0;
return 0;
}
return -ENXIO;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f2542..c1b42b5257f8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/in.h>
+#include <net/ip.h>
#include <net/sock.h>
-#include <net/tcp.h>
#include <net/route.h>
+#include <net/tcp_states.h>
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77d..4ec4b2ca6ab1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -351,7 +351,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa)
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
@@ -449,7 +449,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
goto out;
rc = -ENOBUFS;
- if ((in_dev = __in_dev_get(dev)) == NULL) {
+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
in_dev = inetdev_init(dev);
if (!in_dev)
goto out;
@@ -584,7 +584,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (colon)
*colon = ':';
- if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
@@ -715,6 +715,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
ret = 0;
if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ u32 old_mask = ifa->ifa_mask;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_mask = sin->sin_addr.s_addr;
ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
@@ -728,7 +729,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if ((dev->flags & IFF_BROADCAST) &&
(ifa->ifa_prefixlen < 31) &&
(ifa->ifa_broadcast ==
- (ifa->ifa_local|~ifa->ifa_mask))) {
+ (ifa->ifa_local|~old_mask))) {
ifa->ifa_broadcast = (ifa->ifa_local |
~sin->sin_addr.s_addr);
}
@@ -748,7 +749,7 @@ rarok:
static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
{
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
@@ -791,7 +792,7 @@ u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
struct in_device *in_dev;
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
@@ -818,7 +819,7 @@ no_in_dev:
read_lock(&dev_base_lock);
rcu_read_lock();
for (dev = dev_base; dev; dev = dev->next) {
- if ((in_dev = __in_dev_get(dev)) == NULL)
+ if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
continue;
for_primary_ifa(in_dev) {
@@ -887,7 +888,7 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
if (dev) {
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)))
+ if ((in_dev = __in_dev_get_rcu(dev)))
addr = confirm_addr_indev(in_dev, dst, local, scope);
rcu_read_unlock();
@@ -897,7 +898,7 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
read_lock(&dev_base_lock);
rcu_read_lock();
for (dev = dev_base; dev; dev = dev->next) {
- if ((in_dev = __in_dev_get(dev))) {
+ if ((in_dev = __in_dev_get_rcu(dev))) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
break;
@@ -957,7 +958,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
@@ -1078,7 +1079,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (idx > s_idx)
s_ip_idx = 0;
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)) == NULL) {
+ if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
rcu_read_unlock();
continue;
}
@@ -1111,13 +1112,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
if (!skb)
- netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
} else {
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
}
}
@@ -1150,7 +1150,7 @@ void inet_forward_change(void)
for (dev = dev_base; dev; dev = dev->next) {
struct in_device *in_dev;
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev)
in_dev->cnf.forwarding = on;
rcu_read_unlock();
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1f..1b18ce66e7b7 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -5,6 +5,7 @@
#include <net/esp.h>
#include <asm/scatterlist.h>
#include <linux/crypto.h>
+#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <net/icmp.h>
@@ -42,10 +43,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
esp = x->data;
alen = esp->auth.icv_trunc_len;
tfm = esp->conf.tfm;
- blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
- clen = (clen + 2 + blksize-1)&~(blksize-1);
+ blksize = ALIGN(crypto_tfm_alg_blocksize(tfm), 4);
+ clen = ALIGN(clen + 2, blksize);
if (esp->conf.padlen)
- clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+ clen = ALIGN(clen, esp->conf.padlen);
if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
goto error;
@@ -143,7 +144,7 @@ static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
struct ip_esp_hdr *esph;
struct esp_data *esp = x->data;
struct sk_buff *trailer;
- int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ int blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
int alen = esp->auth.icv_trunc_len;
int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
int nfrags;
@@ -304,16 +305,16 @@ static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap,
static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
{
struct esp_data *esp = x->data;
- u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ u32 blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
if (x->props.mode) {
- mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+ mtu = ALIGN(mtu + 2, blksize);
} else {
/* The worst case. */
- mtu += 2 + blksize;
+ mtu = ALIGN(mtu + 2, 4) + blksize - 4;
}
if (esp->conf.padlen)
- mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+ mtu = ALIGN(mtu, esp->conf.padlen);
return mtu + x->props.header_len + esp->auth.icv_trunc_len;
}
@@ -331,8 +332,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+ ntohl(esph->spi), ntohl(iph->daddr));
xfrm_state_put(x);
}
@@ -343,22 +344,14 @@ static void esp_destroy(struct xfrm_state *x)
if (!esp)
return;
- if (esp->conf.tfm) {
- crypto_free_tfm(esp->conf.tfm);
- esp->conf.tfm = NULL;
- }
- if (esp->conf.ivec) {
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- }
- if (esp->auth.tfm) {
- crypto_free_tfm(esp->auth.tfm);
- esp->auth.tfm = NULL;
- }
- if (esp->auth.work_icv) {
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- }
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
kfree(esp);
}
@@ -395,10 +388,10 @@ static int esp_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_tfm_alg_digestsize(esp->auth.tfm)) {
- NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_tfm_alg_digestsize(esp->auth.tfm),
- aalg_desc->uinfo.auth.icv_fullbits/8));
+ NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_tfm_alg_digestsize(esp->auth.tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
goto error;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab9580..990633c09dfe 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -173,7 +173,7 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
no_addr = rpf = 0;
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
rpf = IN_DEV_RPFILTER(in_dev);
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
nl_fib_lookup(frn, tb);
pid = nlh->nlmsg_pid; /*pid of sending process */
- NETLINK_CB(skb).groups = 0; /* not in mcast group */
NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_pid = pid;
- NETLINK_CB(skb).dst_groups = 0; /* unicast */
+ NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
}
static void nl_fib_lookup_init(void)
{
- netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
+ netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
}
static void fib_disable_ip(struct net_device *dev, int force)
@@ -592,7 +591,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa);
- if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
Disable IP.
*/
@@ -608,7 +607,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
}
EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3d..2a8c9afc3695 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
#include "fib_lookup.h"
-static kmem_cache_t *fn_hash_kmem;
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_hash_kmem __read_mostly;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
struct fib_node {
struct hlist_node fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa93..ef6609ea0eb7 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
struct fib_alias {
struct list_head fa_list;
+ struct rcu_head rcu;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c886b28ba9f5..186f20c4a45e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
if (n->nlmsg_flags&NLM_F_ECHO)
atomic_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
if (n->nlmsg_flags&NLM_F_ECHO)
netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
}
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *new_laddrhash,
unsigned int new_size)
{
+ struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_hash_size;
- unsigned int i;
+ unsigned int i, bytes;
write_lock(&fib_info_lock);
+ old_info_hash = fib_info_hash;
+ old_laddrhash = fib_info_laddrhash;
fib_hash_size = new_size;
for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
fib_info_laddrhash = new_laddrhash;
write_unlock(&fib_info_lock);
+
+ bytes = old_size * sizeof(struct hlist_head *);
+ fib_hash_free(old_info_hash, bytes);
+ fib_hash_free(old_laddrhash, bytes);
}
struct fib_info *
@@ -847,6 +854,7 @@ failure:
return NULL;
}
+/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_result *res, __u32 zone, __u32 mask,
int prefixlen)
@@ -854,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_alias *fa;
int nh_sel = 0;
- list_for_each_entry(fa, head, fa_list) {
+ list_for_each_entry_rcu(fa, head, fa_list) {
int err;
if (fa->fa_tos &&
@@ -1079,7 +1087,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
rta->rta_oif = &dev->ifindex;
if (colon) {
struct in_ifaddr *ifa;
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
*colon = ':';
@@ -1260,7 +1268,7 @@ int fib_sync_up(struct net_device *dev)
}
if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
continue;
- if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
+ if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
continue;
alive++;
spin_lock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a701405fab0b..66247f38b371 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#define VERSION "0.325"
+#define VERSION "0.404"
#include <linux/config.h>
#include <asm/uaccess.h>
@@ -62,6 +62,7 @@
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
@@ -77,56 +78,55 @@
#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_CHILDS 16384
-#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
#define KEYLENGTH (8*sizeof(t_key))
#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
-static DEFINE_RWLOCK(fib_lock);
-
typedef unsigned int t_key;
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
-#define NODE_PARENT(_node) \
- ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
-#define NODE_SET_PARENT(_node, _ptr) \
- ((_node)->_parent = (((unsigned long)(_ptr)) | \
- ((_node)->_parent & NODE_TYPE_MASK)))
-#define NODE_INIT_PARENT(_node, _type) \
- ((_node)->_parent = (_type))
-#define NODE_TYPE(_node) \
- ((_node)->_parent & NODE_TYPE_MASK)
-
-#define IS_TNODE(n) (!(n->_parent & T_LEAF))
-#define IS_LEAF(n) (n->_parent & T_LEAF)
+#define NODE_PARENT(node) \
+ ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
+
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+#define NODE_SET_PARENT(node, ptr) \
+ rcu_assign_pointer((node)->parent, \
+ ((unsigned long)(ptr)) | NODE_TYPE(node))
+
+#define IS_TNODE(n) (!(n->parent & T_LEAF))
+#define IS_LEAF(n) (n->parent & T_LEAF)
struct node {
- t_key key;
- unsigned long _parent;
+ t_key key;
+ unsigned long parent;
};
struct leaf {
- t_key key;
- unsigned long _parent;
+ t_key key;
+ unsigned long parent;
struct hlist_head list;
+ struct rcu_head rcu;
};
struct leaf_info {
struct hlist_node hlist;
+ struct rcu_head rcu;
int plen;
struct list_head falh;
};
struct tnode {
- t_key key;
- unsigned long _parent;
- unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short full_children; /* KEYLENGTH bits needed */
- unsigned short empty_children; /* KEYLENGTH bits needed */
- struct node *child[0];
+ t_key key;
+ unsigned long parent;
+ unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
+ unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
+ unsigned short full_children; /* KEYLENGTH bits needed */
+ unsigned short empty_children; /* KEYLENGTH bits needed */
+ struct rcu_head rcu;
+ struct node *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -150,77 +150,44 @@ struct trie_stat {
};
struct trie {
- struct node *trie;
+ struct node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
- int size;
+ int size;
unsigned int revision;
};
-static int trie_debug = 0;
-
-static int tnode_full(struct tnode *tn, struct node *n);
static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static int tnode_child_length(struct tnode *tn);
static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
static void tnode_free(struct tnode *tn);
-static void trie_dump_seq(struct seq_file *seq, struct trie *t);
-extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int *dflt);
-extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
- struct nlmsghdr *n, struct netlink_skb_parms *req);
-
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
static struct trie *trie_local = NULL, *trie_main = NULL;
-static void trie_bug(char *err)
-{
- printk("Trie Bug: %s\n", err);
- BUG();
-}
+
+/* rcu_read_lock needs to be hold by caller from readside */
static inline struct node *tnode_get_child(struct tnode *tn, int i)
{
- if (i >= 1<<tn->bits)
- trie_bug("tnode_get_child");
+ BUG_ON(i >= 1 << tn->bits);
- return tn->child[i];
+ return rcu_dereference(tn->child[i]);
}
-static inline int tnode_child_length(struct tnode *tn)
+static inline int tnode_child_length(const struct tnode *tn)
{
- return 1<<tn->bits;
+ return 1 << tn->bits;
}
-/*
- _________________________________________________________________
- | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
- ----------------------------------------------------------------
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-
- _________________________________________________________________
- | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
- -----------------------------------------------------------------
- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-
- tp->pos = 7
- tp->bits = 3
- n->pos = 15
- n->bits=4
- KEYLENGTH=32
-*/
-
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
- if (offset < KEYLENGTH)
+ if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
- else
+ else
return 0;
}
@@ -233,8 +200,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
if (bits == 0 || offset >= KEYLENGTH)
return 1;
- bits = bits > KEYLENGTH ? KEYLENGTH : bits;
- return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+ bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+ return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
static inline int tkey_mismatch(t_key a, int offset, t_key b)
@@ -249,14 +216,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
return i;
}
-/* Candiate for fib_semantics */
-
-static void fn_free_alias(struct fib_alias *fa)
-{
- fib_release_info(fa->fa_info);
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
/*
To understand this stuff, an understanding of keys and all their bits is
necessary. Every node in the trie has a key associated with it, but not
@@ -265,7 +224,7 @@ static void fn_free_alias(struct fib_alias *fa)
Consider a node 'n' and its parent 'tp'.
If n is a leaf, every bit in its key is significant. Its presence is
- necessitaded by path compression, since during a tree traversal (when
+ necessitated by path compression, since during a tree traversal (when
searching for a leaf - unless we are doing an insertion) we will completely
ignore all skipped bits we encounter. Thus we need to verify, at the end of
a potentially successful search, that we have indeed been walking the
@@ -295,7 +254,7 @@ static void fn_free_alias(struct fib_alias *fa)
tp->pos = 7
tp->bits = 3
n->pos = 15
- n->bits=4
+ n->bits = 4
First, let's just ignore the bits that come before the parent tp, that is
the bits from 0 to (tp->pos-1). They are *known* but at this point we do
@@ -320,60 +279,67 @@ static void fn_free_alias(struct fib_alias *fa)
*/
-static void check_tnode(struct tnode *tn)
+static inline void check_tnode(const struct tnode *tn)
{
- if (tn && tn->pos+tn->bits > 32) {
- printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
- }
+ WARN_ON(tn && tn->pos+tn->bits > 32);
}
static int halve_threshold = 25;
static int inflate_threshold = 50;
+static int halve_threshold_root = 15;
+static int inflate_threshold_root = 25;
-static struct leaf *leaf_new(void)
+
+static void __alias_free_mem(struct rcu_head *head)
{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
- if (l) {
- NODE_INIT_PARENT(l, T_LEAF);
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+ kmem_cache_free(fn_alias_kmem, fa);
}
-static struct leaf_info *leaf_info_new(int plen)
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
+ call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct leaf, rcu));
+}
+
+static inline void free_leaf(struct leaf *leaf)
+{
+ call_rcu(&leaf->rcu, __leaf_free_rcu);
}
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
{
- kfree(l);
+ kfree(container_of(head, struct leaf_info, rcu));
}
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
{
- kfree(li);
+ call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
static struct tnode *tnode_alloc(unsigned int size)
{
- if (size <= PAGE_SIZE) {
- return kmalloc(size, GFP_KERNEL);
- } else {
- return (struct tnode *)
- __get_free_pages(GFP_KERNEL, get_order(size));
- }
+ struct page *pages;
+
+ if (size <= PAGE_SIZE)
+ return kcalloc(size, 1, GFP_KERNEL);
+
+ pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+ if (!pages)
+ return NULL;
+
+ return page_address(pages);
}
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
{
+ struct tnode *tn = container_of(head, struct tnode, rcu);
unsigned int size = sizeof(struct tnode) +
- (1<<tn->bits) * sizeof(struct node *);
+ (1 << tn->bits) * sizeof(struct node *);
if (size <= PAGE_SIZE)
kfree(tn);
@@ -381,15 +347,40 @@ static void __tnode_free(struct tnode *tn)
free_pages((unsigned long)tn, get_order(size));
}
+static inline void tnode_free(struct tnode *tn)
+{
+ call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+ struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ if (l) {
+ l->parent = T_LEAF;
+ INIT_HLIST_HEAD(&l->list);
+ }
+ return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+ struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
+ if (li) {
+ li->plen = plen;
+ INIT_LIST_HEAD(&li->falh);
+ }
+ return li;
+}
+
static struct tnode* tnode_new(t_key key, int pos, int bits)
{
int nchildren = 1<<bits;
int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
struct tnode *tn = tnode_alloc(sz);
- if (tn) {
+ if (tn) {
memset(tn, 0, sz);
- NODE_INIT_PARENT(tn, T_TNODE);
+ tn->parent = T_TNODE;
tn->pos = pos;
tn->bits = bits;
tn->key = key;
@@ -397,38 +388,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1<<bits;
}
- if (trie_debug > 0)
- printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned int) (sizeof(struct node) * 1<<bits));
+ pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+ (unsigned int) (sizeof(struct node) * 1<<bits));
return tn;
}
-static void tnode_free(struct tnode *tn)
-{
- if (!tn) {
- trie_bug("tnode_free\n");
- }
- if (IS_LEAF(tn)) {
- free_leaf((struct leaf *)tn);
- if (trie_debug > 0 )
- printk("FL %p \n", tn);
- }
- else if (IS_TNODE(tn)) {
- __tnode_free(tn);
- if (trie_debug > 0 )
- printk("FT %p \n", tn);
- }
- else {
- trie_bug("tnode_free\n");
- }
-}
-
/*
* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(struct tnode *tn, struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -448,15 +418,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
{
- struct node *chi;
+ struct node *chi = tn->child[i];
int isfull;
- if (i >= 1<<tn->bits) {
- printk("bits=%d, i=%d\n", tn->bits, i);
- trie_bug("tnode_put_child_reorg bits");
- }
- write_lock_bh(&fib_lock);
- chi = tn->child[i];
+ BUG_ON(i >= 1<<tn->bits);
+
/* update emptyChildren */
if (n == NULL && chi != NULL)
@@ -465,33 +431,34 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
tn->empty_children--;
/* update fullChildren */
- if (wasfull == -1)
+ if (wasfull == -1)
wasfull = tnode_full(tn, chi);
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
tn->full_children--;
-
else if (!wasfull && isfull)
tn->full_children++;
+
if (n)
NODE_SET_PARENT(n, tn);
- tn->child[i] = n;
- write_unlock_bh(&fib_lock);
+ rcu_assign_pointer(tn->child[i], n);
}
static struct node *resize(struct trie *t, struct tnode *tn)
{
int i;
int err = 0;
+ struct tnode *old_tn;
+ int inflate_threshold_use;
+ int halve_threshold_use;
if (!tn)
return NULL;
- if (trie_debug)
- printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
- tn, inflate_threshold, halve_threshold);
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
/* No children */
if (tn->empty_children == tnode_child_length(tn)) {
@@ -501,20 +468,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* One child */
if (tn->empty_children == tnode_child_length(tn) - 1)
for (i = 0; i < tnode_child_length(tn); i++) {
+ struct node *n;
- write_lock_bh(&fib_lock);
- if (tn->child[i] != NULL) {
-
- /* compress one level */
- struct node *n = tn->child[i];
- if (n)
- NODE_INIT_PARENT(n, NODE_TYPE(n));
+ n = tn->child[i];
+ if (!n)
+ continue;
- write_unlock_bh(&fib_lock);
- tnode_free(tn);
- return n;
- }
- write_unlock_bh(&fib_lock);
+ /* compress one level */
+ NODE_SET_PARENT(n, NULL);
+ tnode_free(tn);
+ return n;
}
/*
* Double as long as the resulting node has a number of
@@ -566,30 +529,38 @@ static struct node *resize(struct trie *t, struct tnode *tn)
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
* 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children ) >= inflate_threshold * new_child_length
+ * tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
* 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children ) >=
+ * tn->full_children) >=
* inflate_threshold * tnode_child_length(tn) * 2
*
* shorten again:
* 50 * (tn->full_children + tnode_child_length(tn) -
- * tn->empty_children ) >= inflate_threshold *
+ * tn->empty_children) >= inflate_threshold *
* tnode_child_length(tn)
*
*/
check_tnode(tn);
+ /* Keep root node larger */
+
+ if(!tn->parent)
+ inflate_threshold_use = inflate_threshold_root;
+ else
+ inflate_threshold_use = inflate_threshold;
+
err = 0;
while ((tn->full_children > 0 &&
50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
- inflate_threshold * tnode_child_length(tn))) {
-
- tn = inflate(t, tn, &err);
+ inflate_threshold_use * tnode_child_length(tn))) {
- if (err) {
+ old_tn = tn;
+ tn = inflate(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.resize_node_skipped++;
#endif
@@ -604,14 +575,23 @@ static struct node *resize(struct trie *t, struct tnode *tn)
* node is above threshold.
*/
+
+ /* Keep root node larger */
+
+ if(!tn->parent)
+ halve_threshold_use = halve_threshold_root;
+ else
+ halve_threshold_use = halve_threshold;
+
err = 0;
while (tn->bits > 1 &&
100 * (tnode_child_length(tn) - tn->empty_children) <
- halve_threshold * tnode_child_length(tn)) {
-
- tn = halve(t, tn, &err);
+ halve_threshold_use * tnode_child_length(tn)) {
- if (err) {
+ old_tn = tn;
+ tn = halve(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.resize_node_skipped++;
#endif
@@ -621,44 +601,37 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Only one child remains */
-
if (tn->empty_children == tnode_child_length(tn) - 1)
for (i = 0; i < tnode_child_length(tn); i++) {
-
- write_lock_bh(&fib_lock);
- if (tn->child[i] != NULL) {
- /* compress one level */
- struct node *n = tn->child[i];
+ struct node *n;
- if (n)
- NODE_INIT_PARENT(n, NODE_TYPE(n));
+ n = tn->child[i];
+ if (!n)
+ continue;
- write_unlock_bh(&fib_lock);
- tnode_free(tn);
- return n;
- }
- write_unlock_bh(&fib_lock);
+ /* compress one level */
+
+ NODE_SET_PARENT(n, NULL);
+ tnode_free(tn);
+ return n;
}
return (struct node *) tn;
}
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
struct tnode *inode;
struct tnode *oldtnode = tn;
int olen = tnode_child_length(tn);
int i;
- if (trie_debug)
- printk("In inflate\n");
+ pr_debug("In inflate\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
- if (!tn) {
- *err = -ENOMEM;
- return oldtnode;
- }
+ if (!tn)
+ return ERR_PTR(-ENOMEM);
/*
* Preallocate and store tnodes before the actual work so we
@@ -666,8 +639,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
* fails. In case of failure we return the oldnode and inflate
* of tnode is ignored.
*/
-
- for(i = 0; i < olen; i++) {
+
+ for (i = 0; i < olen; i++) {
struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
if (inode &&
@@ -675,46 +648,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
inode->pos == oldtnode->pos + oldtnode->bits &&
inode->bits > 1) {
struct tnode *left, *right;
-
t_key m = TKEY_GET_MASK(inode->pos, 1);
left = tnode_new(inode->key&(~m), inode->pos + 1,
inode->bits - 1);
+ if (!left)
+ goto nomem;
- if (!left) {
- *err = -ENOMEM;
- break;
- }
-
right = tnode_new(inode->key|m, inode->pos + 1,
inode->bits - 1);
- if (!right) {
- *err = -ENOMEM;
- break;
- }
+ if (!right) {
+ tnode_free(left);
+ goto nomem;
+ }
put_child(t, tn, 2*i, (struct node *) left);
put_child(t, tn, 2*i+1, (struct node *) right);
}
}
- if (*err) {
- int size = tnode_child_length(tn);
- int j;
-
- for(j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
-
- tnode_free(tn);
-
- *err = -ENOMEM;
- return oldtnode;
- }
-
- for(i = 0; i < olen; i++) {
+ for (i = 0; i < olen; i++) {
struct node *node = tnode_get_child(oldtnode, i);
+ struct tnode *left, *right;
+ int size, j;
/* An empty child */
if (node == NULL)
@@ -740,76 +697,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
put_child(t, tn, 2*i+1, inode->child[1]);
tnode_free(inode);
+ continue;
}
- /* An internal node with more than two children */
- else {
- struct tnode *left, *right;
- int size, j;
-
- /* We will replace this node 'inode' with two new
- * ones, 'left' and 'right', each with half of the
- * original children. The two new nodes will have
- * a position one bit further down the key and this
- * means that the "significant" part of their keys
- * (see the discussion near the top of this file)
- * will differ by one bit, which will be "0" in
- * left's key and "1" in right's key. Since we are
- * moving the key position by one step, the bit that
- * we are moving away from - the bit at position
- * (inode->pos) - is the one that will differ between
- * left and right. So... we synthesize that bit in the
- * two new keys.
- * The mask 'm' below will be a single "one" bit at
- * the position (inode->pos)
- */
-
- /* Use the old key, but set the new significant
- * bit to zero.
- */
-
- left = (struct tnode *) tnode_get_child(tn, 2*i);
- put_child(t, tn, 2*i, NULL);
+ /* An internal node with more than two children */
+
+ /* We will replace this node 'inode' with two new
+ * ones, 'left' and 'right', each with half of the
+ * original children. The two new nodes will have
+ * a position one bit further down the key and this
+ * means that the "significant" part of their keys
+ * (see the discussion near the top of this file)
+ * will differ by one bit, which will be "0" in
+ * left's key and "1" in right's key. Since we are
+ * moving the key position by one step, the bit that
+ * we are moving away from - the bit at position
+ * (inode->pos) - is the one that will differ between
+ * left and right. So... we synthesize that bit in the
+ * two new keys.
+ * The mask 'm' below will be a single "one" bit at
+ * the position (inode->pos)
+ */
- if (!left)
- BUG();
+ /* Use the old key, but set the new significant
+ * bit to zero.
+ */
- right = (struct tnode *) tnode_get_child(tn, 2*i+1);
- put_child(t, tn, 2*i+1, NULL);
+ left = (struct tnode *) tnode_get_child(tn, 2*i);
+ put_child(t, tn, 2*i, NULL);
- if (!right)
- BUG();
+ BUG_ON(!left);
- size = tnode_child_length(left);
- for(j = 0; j < size; j++) {
- put_child(t, left, j, inode->child[j]);
- put_child(t, right, j, inode->child[j + size]);
- }
- put_child(t, tn, 2*i, resize(t, left));
- put_child(t, tn, 2*i+1, resize(t, right));
+ right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+ put_child(t, tn, 2*i+1, NULL);
- tnode_free(inode);
+ BUG_ON(!right);
+
+ size = tnode_child_length(left);
+ for (j = 0; j < size; j++) {
+ put_child(t, left, j, inode->child[j]);
+ put_child(t, right, j, inode->child[j + size]);
}
+ put_child(t, tn, 2*i, resize(t, left));
+ put_child(t, tn, 2*i+1, resize(t, right));
+
+ tnode_free(inode);
}
tnode_free(oldtnode);
return tn;
+nomem:
+ {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for (j = 0; j < size; j++)
+ if (tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ return ERR_PTR(-ENOMEM);
+ }
}
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
struct node *left, *right;
int i;
int olen = tnode_child_length(tn);
- if (trie_debug) printk("In halve\n");
+ pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
- if (!tn) {
- *err = -ENOMEM;
- return oldtnode;
- }
+ if (!tn)
+ return ERR_PTR(-ENOMEM);
/*
* Preallocate and store tnodes before the actual work so we
@@ -818,38 +781,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
* of tnode is ignored.
*/
- for(i = 0; i < olen; i += 2) {
+ for (i = 0; i < olen; i += 2) {
left = tnode_get_child(oldtnode, i);
right = tnode_get_child(oldtnode, i+1);
/* Two nonempty children */
- if (left && right) {
- struct tnode *newBinNode =
- tnode_new(left->key, tn->pos + tn->bits, 1);
+ if (left && right) {
+ struct tnode *newn;
- if (!newBinNode) {
- *err = -ENOMEM;
- break;
- }
- put_child(t, tn, i/2, (struct node *)newBinNode);
- }
- }
+ newn = tnode_new(left->key, tn->pos + tn->bits, 1);
- if (*err) {
- int size = tnode_child_length(tn);
- int j;
+ if (!newn)
+ goto nomem;
- for(j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
+ put_child(t, tn, i/2, (struct node *)newn);
+ }
- tnode_free(tn);
-
- *err = -ENOMEM;
- return oldtnode;
}
- for(i = 0; i < olen; i += 2) {
+ for (i = 0; i < olen; i += 2) {
+ struct tnode *newBinNode;
+
left = tnode_get_child(oldtnode, i);
right = tnode_get_child(oldtnode, i+1);
@@ -858,88 +810,100 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
if (right == NULL) /* Both are empty */
continue;
put_child(t, tn, i/2, right);
- } else if (right == NULL)
+ continue;
+ }
+
+ if (right == NULL) {
put_child(t, tn, i/2, left);
+ continue;
+ }
/* Two nonempty children */
- else {
- struct tnode *newBinNode =
- (struct tnode *) tnode_get_child(tn, i/2);
- put_child(t, tn, i/2, NULL);
-
- if (!newBinNode)
- BUG();
-
- put_child(t, newBinNode, 0, left);
- put_child(t, newBinNode, 1, right);
- put_child(t, tn, i/2, resize(t, newBinNode));
- }
+ newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+ put_child(t, tn, i/2, NULL);
+ put_child(t, newBinNode, 0, left);
+ put_child(t, newBinNode, 1, right);
+ put_child(t, tn, i/2, resize(t, newBinNode));
}
tnode_free(oldtnode);
return tn;
+nomem:
+ {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for (j = 0; j < size; j++)
+ if (tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ return ERR_PTR(-ENOMEM);
+ }
}
-static void *trie_init(struct trie *t)
+static void trie_init(struct trie *t)
{
- if (t) {
- t->size = 0;
- t->trie = NULL;
- t->revision = 0;
+ if (!t)
+ return;
+
+ t->size = 0;
+ rcu_assign_pointer(t->trie, NULL);
+ t->revision = 0;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- memset(&t->stats, 0, sizeof(struct trie_use_stats));
+ memset(&t->stats, 0, sizeof(struct trie_use_stats));
#endif
- }
- return t;
}
-static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
+/* readside must use rcu_read_lock currently dump routines
+ via get_fa_head and dump */
+
+static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
+ struct hlist_head *head = &l->list;
struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry_rcu(li, node, head, hlist)
if (li->plen == plen)
return li;
- }
+
return NULL;
}
static inline struct list_head * get_fa_head(struct leaf *l, int plen)
{
- struct list_head *fa_head = NULL;
- struct leaf_info *li = find_leaf_info(&l->list, plen);
+ struct leaf_info *li = find_leaf_info(l, plen);
- if (li)
- fa_head = &li->falh;
+ if (!li)
+ return NULL;
- return fa_head;
+ return &li->falh;
}
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
- struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node, *tmp;
-
- write_lock_bh(&fib_lock);
-
- if (hlist_empty(head))
- hlist_add_head(&new->hlist, head);
- else {
- hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
-
- if (new->plen > li->plen)
- break;
-
- last = li;
- }
- if (last)
- hlist_add_after(&last->hlist, &new->hlist);
- else
- hlist_add_before(&new->hlist, &li->hlist);
- }
- write_unlock_bh(&fib_lock);
+ struct leaf_info *li = NULL, *last = NULL;
+ struct hlist_node *node;
+
+ if (hlist_empty(head)) {
+ hlist_add_head_rcu(&new->hlist, head);
+ } else {
+ hlist_for_each_entry(li, node, head, hlist) {
+ if (new->plen > li->plen)
+ break;
+
+ last = li;
+ }
+ if (last)
+ hlist_add_after_rcu(&last->hlist, &new->hlist);
+ else
+ hlist_add_before_rcu(&new->hlist, &li->hlist);
+ }
}
+/* rcu_read_lock needs to be hold by caller from readside */
+
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
@@ -948,61 +912,43 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = t->trie;
+ n = rcu_dereference(t->trie);
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
-
+
check_tnode(tn);
-
+
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- pos=tn->pos + tn->bits;
+ pos = tn->pos + tn->bits;
n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
- }
- else
+ } else
break;
}
/* Case we have found a leaf. Compare prefixes */
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = (struct leaf *) n;
- return l;
- }
+ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
+ return (struct leaf *)n;
+
return NULL;
}
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
- int i = 0;
int wasfull;
t_key cindex, key;
struct tnode *tp = NULL;
- if (!tn)
- BUG();
-
key = tn->key;
- i = 0;
while (tn != NULL && NODE_PARENT(tn) != NULL) {
- if (i > 10) {
- printk("Rebalance tn=%p \n", tn);
- if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
-
- printk("Rebalance tp=%p \n", tp);
- if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
- }
-
- if (i > 12) BUG();
- i++;
-
tp = NODE_PARENT(tn);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *) resize (t, (struct tnode *)tn);
tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
-
+
if (!NODE_PARENT(tn))
break;
@@ -1015,6 +961,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
return (struct node*) tn;
}
+/* only used from updater-side */
+
static struct list_head *
fib_insert_node(struct trie *t, int *err, u32 key, int plen)
{
@@ -1050,20 +998,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
-
+
check_tnode(tn);
-
+
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
- pos=tn->pos + tn->bits;
+ pos = tn->pos + tn->bits;
n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
- if (n && NODE_PARENT(n) != tn) {
- printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
- BUG();
- }
- }
- else
+ BUG_ON(n && NODE_PARENT(n) != tn);
+ } else
break;
}
@@ -1073,17 +1017,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
* tp is n's (parent) ----> NULL or TNODE
*/
- if (tp && IS_LEAF(tp))
- BUG();
-
+ BUG_ON(tp && IS_LEAF(tp));
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = ( struct leaf *) n;
-
+ struct leaf *l = (struct leaf *) n;
+
li = leaf_info_new(plen);
-
+
if (!li) {
*err = -ENOMEM;
goto err;
@@ -1113,35 +1055,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
- /* Case 2: n is NULL, and will just insert a new leaf */
if (t->trie && n == NULL) {
+ /* Case 2: n is NULL, and will just insert a new leaf */
NODE_SET_PARENT(l, tp);
-
- if (!tp)
- BUG();
- else {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
- }
- }
- /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
- else {
+ cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ } else {
+ /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
* Add a new tnode here
* first tnode need some special handling
*/
if (tp)
- pos=tp->pos+tp->bits;
+ pos = tp->pos+tp->bits;
else
- pos=0;
+ pos = 0;
+
if (n) {
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
- }
- else {
+ } else {
newpos = 0;
tn = tnode_new(key, newpos, 1); /* First tnode */
}
@@ -1151,32 +1087,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
tnode_free((struct tnode *) l);
*err = -ENOMEM;
goto err;
- }
-
+ }
+
NODE_SET_PARENT(tn, tp);
- missbit=tkey_extract_bits(key, newpos, 1);
+ missbit = tkey_extract_bits(key, newpos, 1);
put_child(t, tn, missbit, (struct node *)l);
put_child(t, tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
- }
- else {
- t->trie = (struct node*) tn; /* First tnode */
+ } else {
+ rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
tp = tn;
}
}
- if (tp && tp->pos+tp->bits > 32) {
- printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+
+ if (tp && tp->pos + tp->bits > 32)
+ printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
tp, tp->pos, tp->bits, key, plen);
- }
+
/* Rebalance the trie */
- t->trie = trie_rebalance(t, tp);
+
+ rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
done:
t->revision++;
-err:;
+err:
return fa_head;
}
@@ -1204,17 +1141,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
key = ntohl(key);
- if (trie_debug)
- printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+ pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
- mask = ntohl( inet_make_mask(plen) );
+ mask = ntohl(inet_make_mask(plen));
if (key & ~mask)
return -EINVAL;
key = key & mask;
- if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
+ fi = fib_create_info(r, rta, nlhdr, &err);
+
+ if (!fi)
goto err;
l = fib_find_node(t, key);
@@ -1236,8 +1174,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
* and we need to allocate a new one of those as well.
*/
- if (fa &&
- fa->fa_info->fib_priority == fi->fib_priority) {
+ if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_orig;
err = -EEXIST;
@@ -1248,22 +1185,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
struct fib_info *fi_drop;
u8 state;
- write_lock_bh(&fib_lock);
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+ if (new_fa == NULL)
+ goto out;
fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = type;
- fa->fa_scope = r->rtm_scope;
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = type;
+ new_fa->fa_scope = r->rtm_scope;
state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state &= ~FA_S_ACCESSED;
- write_unlock_bh(&fib_lock);
+ list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+ alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(-1);
- goto succeeded;
+ goto succeeded;
}
/* Error if we find a perfect match which
* uses the same scope, type, and nexthop
@@ -1285,7 +1227,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
fa = fa_orig;
}
err = -ENOENT;
- if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
+ if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
goto out;
err = -ENOBUFS;
@@ -1298,9 +1240,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
new_fa->fa_type = type;
new_fa->fa_scope = r->rtm_scope;
new_fa->fa_state = 0;
-#if 0
- new_fa->dst = NULL;
-#endif
/*
* Insert new entry to the list.
*/
@@ -1312,12 +1251,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
goto out_free_new_fa;
}
- write_lock_bh(&fib_lock);
-
- list_add_tail(&new_fa->fa_list,
- (fa ? &fa->fa_list : fa_head));
-
- write_unlock_bh(&fib_lock);
+ list_add_tail_rcu(&new_fa->fa_list,
+ (fa ? &fa->fa_list : fa_head));
rt_cache_flush(-1);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1328,38 +1263,40 @@ out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
fib_release_info(fi);
-err:;
+err:
return err;
}
-static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
- struct fib_result *res, int *err)
+
+/* should be called with rcu_read_lock */
+static inline int check_leaf(struct trie *t, struct leaf *l,
+ t_key key, int *plen, const struct flowi *flp,
+ struct fib_result *res)
{
- int i;
+ int err, i;
t_key mask;
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
struct hlist_node *node;
- hlist_for_each_entry(li, node, hhead, hlist) {
-
+ hlist_for_each_entry_rcu(li, node, hhead, hlist) {
i = li->plen;
mask = ntohl(inet_make_mask(i));
if (l->key != (key & mask))
continue;
- if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
+ if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
*plen = i;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_passed++;
#endif
- return 1;
+ return err;
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_miss++;
#endif
}
- return 0;
+ return 1;
}
static int
@@ -1370,13 +1307,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
struct node *n;
struct tnode *pn;
int pos, bits;
- t_key key=ntohl(flp->fl4_dst);
+ t_key key = ntohl(flp->fl4_dst);
int chopped_off;
t_key cindex = 0;
int current_prefix_length = KEYLENGTH;
- n = t->trie;
+ struct tnode *cn;
+ t_key node_prefix, key_prefix, pref_mismatch;
+ int mp;
+
+ rcu_read_lock();
- read_lock(&fib_lock);
+ n = rcu_dereference(t->trie);
if (!n)
goto failed;
@@ -1386,15 +1327,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
/* Just a leaf? */
if (IS_LEAF(n)) {
- if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
+ if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
goto found;
goto failed;
}
pn = (struct tnode *) n;
chopped_off = 0;
- while (pn) {
-
+ while (pn) {
pos = pn->pos;
bits = pn->bits;
@@ -1410,130 +1350,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
goto backtrace;
}
- if (IS_TNODE(n)) {
+ if (IS_LEAF(n)) {
+ if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
+ goto found;
+ else
+ goto backtrace;
+ }
+
#define HL_OPTIMIZE
#ifdef HL_OPTIMIZE
- struct tnode *cn = (struct tnode *)n;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
-
- /*
- * It's a tnode, and we can do some extra checks here if we
- * like, to avoid descending into a dead-end branch.
- * This tnode is in the parent's child array at index
- * key[p_pos..p_pos+p_bits] but potentially with some bits
- * chopped off, so in reality the index may be just a
- * subprefix, padded with zero at the end.
- * We can also take a look at any skipped bits in this
- * tnode - everything up to p_pos is supposed to be ok,
- * and the non-chopped bits of the index (se previous
- * paragraph) are also guaranteed ok, but the rest is
- * considered unknown.
- *
- * The skipped bits are key[pos+bits..cn->pos].
- */
-
- /* If current_prefix_length < pos+bits, we are already doing
- * actual prefix matching, which means everything from
- * pos+(bits-chopped_off) onward must be zero along some
- * branch of this subtree - otherwise there is *no* valid
- * prefix present. Here we can only check the skipped
- * bits. Remember, since we have already indexed into the
- * parent's child array, we know that the bits we chopped of
- * *are* zero.
- */
-
- /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
-
- if (current_prefix_length < pos+bits) {
- if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length) != 0 ||
- !(cn->child[0]))
- goto backtrace;
+ cn = (struct tnode *)n;
+
+ /*
+ * It's a tnode, and we can do some extra checks here if we
+ * like, to avoid descending into a dead-end branch.
+ * This tnode is in the parent's child array at index
+ * key[p_pos..p_pos+p_bits] but potentially with some bits
+ * chopped off, so in reality the index may be just a
+ * subprefix, padded with zero at the end.
+ * We can also take a look at any skipped bits in this
+ * tnode - everything up to p_pos is supposed to be ok,
+ * and the non-chopped bits of the index (se previous
+ * paragraph) are also guaranteed ok, but the rest is
+ * considered unknown.
+ *
+ * The skipped bits are key[pos+bits..cn->pos].
+ */
+
+ /* If current_prefix_length < pos+bits, we are already doing
+ * actual prefix matching, which means everything from
+ * pos+(bits-chopped_off) onward must be zero along some
+ * branch of this subtree - otherwise there is *no* valid
+ * prefix present. Here we can only check the skipped
+ * bits. Remember, since we have already indexed into the
+ * parent's child array, we know that the bits we chopped of
+ * *are* zero.
+ */
+
+ /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+
+ if (current_prefix_length < pos+bits) {
+ if (tkey_extract_bits(cn->key, current_prefix_length,
+ cn->pos - current_prefix_length) != 0 ||
+ !(cn->child[0]))
+ goto backtrace;
+ }
+
+ /*
+ * If chopped_off=0, the index is fully validated and we
+ * only need to look at the skipped bits for this, the new,
+ * tnode. What we actually want to do is to find out if
+ * these skipped bits match our key perfectly, or if we will
+ * have to count on finding a matching prefix further down,
+ * because if we do, we would like to have some way of
+ * verifying the existence of such a prefix at this point.
+ */
+
+ /* The only thing we can do at this point is to verify that
+ * any such matching prefix can indeed be a prefix to our
+ * key, and if the bits in the node we are inspecting that
+ * do not match our key are not ZERO, this cannot be true.
+ * Thus, find out where there is a mismatch (before cn->pos)
+ * and verify that all the mismatching bits are zero in the
+ * new tnode's key.
+ */
+
+ /* Note: We aren't very concerned about the piece of the key
+ * that precede pn->pos+pn->bits, since these have already been
+ * checked. The bits after cn->pos aren't checked since these are
+ * by definition "unknown" at this point. Thus, what we want to
+ * see is if we are about to enter the "prefix matching" state,
+ * and in that case verify that the skipped bits that will prevail
+ * throughout this subtree are zero, as they have to be if we are
+ * to find a matching prefix.
+ */
+
+ node_prefix = MASK_PFX(cn->key, cn->pos);
+ key_prefix = MASK_PFX(key, cn->pos);
+ pref_mismatch = key_prefix^node_prefix;
+ mp = 0;
+
+ /* In short: If skipped bits in this node do not match the search
+ * key, enter the "prefix matching" state.directly.
+ */
+ if (pref_mismatch) {
+ while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
+ mp++;
+ pref_mismatch = pref_mismatch <<1;
}
+ key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
- /*
- * If chopped_off=0, the index is fully validated and we
- * only need to look at the skipped bits for this, the new,
- * tnode. What we actually want to do is to find out if
- * these skipped bits match our key perfectly, or if we will
- * have to count on finding a matching prefix further down,
- * because if we do, we would like to have some way of
- * verifying the existence of such a prefix at this point.
- */
-
- /* The only thing we can do at this point is to verify that
- * any such matching prefix can indeed be a prefix to our
- * key, and if the bits in the node we are inspecting that
- * do not match our key are not ZERO, this cannot be true.
- * Thus, find out where there is a mismatch (before cn->pos)
- * and verify that all the mismatching bits are zero in the
- * new tnode's key.
- */
-
- /* Note: We aren't very concerned about the piece of the key
- * that precede pn->pos+pn->bits, since these have already been
- * checked. The bits after cn->pos aren't checked since these are
- * by definition "unknown" at this point. Thus, what we want to
- * see is if we are about to enter the "prefix matching" state,
- * and in that case verify that the skipped bits that will prevail
- * throughout this subtree are zero, as they have to be if we are
- * to find a matching prefix.
- */
-
- node_prefix = MASK_PFX(cn->key, cn->pos);
- key_prefix = MASK_PFX(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
-
- /* In short: If skipped bits in this node do not match the search
- * key, enter the "prefix matching" state.directly.
- */
- if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch <<1;
- }
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
-
- if (key_prefix != 0)
- goto backtrace;
-
- if (current_prefix_length >= cn->pos)
- current_prefix_length=mp;
- }
-#endif
- pn = (struct tnode *)n; /* Descend */
- chopped_off = 0;
- continue;
+ if (key_prefix != 0)
+ goto backtrace;
+
+ if (current_prefix_length >= cn->pos)
+ current_prefix_length = mp;
}
- if (IS_LEAF(n)) {
- if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
- goto found;
- }
+#endif
+ pn = (struct tnode *)n; /* Descend */
+ chopped_off = 0;
+ continue;
+
backtrace:
chopped_off++;
/* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
+ while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
chopped_off++;
- }
/* Decrease current_... with bits chopped off */
if (current_prefix_length > pn->pos + pn->bits - chopped_off)
current_prefix_length = pn->pos + pn->bits - chopped_off;
-
+
/*
* Either we do the actual chop off according or if we have
* chopped off all bits in this tnode walk up to our parent.
*/
- if (chopped_off <= pn->bits)
+ if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
- else {
+ } else {
if (NODE_PARENT(pn) == NULL)
goto failed;
-
+
/* Get Child's index */
cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
pn = NODE_PARENT(pn);
@@ -1548,10 +1487,11 @@ backtrace:
failed:
ret = 1;
found:
- read_unlock(&fib_lock);
+ rcu_read_unlock();
return ret;
}
+/* only called from updater side */
static int trie_leaf_remove(struct trie *t, t_key key)
{
t_key cindex;
@@ -1559,24 +1499,20 @@ static int trie_leaf_remove(struct trie *t, t_key key)
struct node *n = t->trie;
struct leaf *l;
- if (trie_debug)
- printk("entering trie_leaf_remove(%p)\n", n);
+ pr_debug("entering trie_leaf_remove(%p)\n", n);
/* Note that in the case skipped bits, those bits are *not* checked!
* When we finish this, we will have NULL or a T_LEAF, and the
* T_LEAF may or may not match our key.
*/
- while (n != NULL && IS_TNODE(n)) {
+ while (n != NULL && IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
check_tnode(tn);
n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
- if (n && NODE_PARENT(n) != tn) {
- printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
- BUG();
- }
- }
+ BUG_ON(n && NODE_PARENT(n) != tn);
+ }
l = (struct leaf *) n;
if (!n || !tkey_equals(l->key, key))
@@ -1590,23 +1526,24 @@ static int trie_leaf_remove(struct trie *t, t_key key)
t->revision++;
t->size--;
+ preempt_disable();
tp = NODE_PARENT(n);
tnode_free((struct tnode *) n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, NULL);
- t->trie = trie_rebalance(t, tp);
- }
- else
- t->trie = NULL;
+ rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+ } else
+ rcu_assign_pointer(t->trie, NULL);
+ preempt_enable();
return 1;
}
static int
fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
- struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+ struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
{
struct trie *t = (struct trie *) tb->tb_data;
u32 key, mask;
@@ -1615,6 +1552,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
struct fib_alias *fa, *fa_to_delete;
struct list_head *fa_head;
struct leaf *l;
+ struct leaf_info *li;
+
if (plen > 32)
return -EINVAL;
@@ -1624,7 +1563,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
memcpy(&key, rta->rta_dst, 4);
key = ntohl(key);
- mask = ntohl( inet_make_mask(plen) );
+ mask = ntohl(inet_make_mask(plen));
if (key & ~mask)
return -EINVAL;
@@ -1641,11 +1580,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
if (!fa)
return -ESRCH;
- if (trie_debug)
- printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
fa_head = fa->fa_list.prev;
+
list_for_each_entry(fa, fa_head, fa_list) {
struct fib_info *fi = fa->fa_info;
@@ -1664,39 +1603,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
}
}
- if (fa_to_delete) {
- int kill_li = 0;
- struct leaf_info *li;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+ if (!fa_to_delete)
+ return -ESRCH;
- l = fib_find_node(t, key);
- li = find_leaf_info(&l->list, plen);
+ fa = fa_to_delete;
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
- write_lock_bh(&fib_lock);
+ l = fib_find_node(t, key);
+ li = find_leaf_info(l, plen);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
- if (list_empty(fa_head)) {
- hlist_del(&li->hlist);
- kill_li = 1;
- }
- write_unlock_bh(&fib_lock);
-
- if (kill_li)
- free_leaf_info(li);
+ if (list_empty(fa_head)) {
+ hlist_del_rcu(&li->hlist);
+ free_leaf_info(li);
+ }
- if (hlist_empty(&l->list))
- trie_leaf_remove(t, key);
+ if (hlist_empty(&l->list))
+ trie_leaf_remove(t, key);
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ if (fa->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
- fn_free_alias(fa);
- return 0;
- }
- return -ESRCH;
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ return 0;
}
static int trie_flush_list(struct trie *t, struct list_head *head)
@@ -1706,14 +1637,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
list_for_each_entry_safe(fa, fa_node, head, fa_list) {
struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-
- write_lock_bh(&fib_lock);
- list_del(&fa->fa_list);
- write_unlock_bh(&fib_lock);
- fn_free_alias(fa);
+ if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+ list_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
found++;
}
}
@@ -1728,37 +1656,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
struct leaf_info *li = NULL;
hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
-
found += trie_flush_list(t, &li->falh);
if (list_empty(&li->falh)) {
-
- write_lock_bh(&fib_lock);
- hlist_del(&li->hlist);
- write_unlock_bh(&fib_lock);
-
+ hlist_del_rcu(&li->hlist);
free_leaf_info(li);
}
}
return found;
}
+/* rcu_read_lock needs to be hold by caller from readside */
+
static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
{
struct node *c = (struct node *) thisleaf;
struct tnode *p;
int idx;
+ struct node *trie = rcu_dereference(t->trie);
if (c == NULL) {
- if (t->trie == NULL)
+ if (trie == NULL)
return NULL;
- if (IS_LEAF(t->trie)) /* trie w. just a leaf */
- return (struct leaf *) t->trie;
+ if (IS_LEAF(trie)) /* trie w. just a leaf */
+ return (struct leaf *) trie;
- p = (struct tnode*) t->trie; /* Start */
- }
- else
+ p = (struct tnode*) trie; /* Start */
+ } else
p = (struct tnode *) NODE_PARENT(c);
while (p) {
@@ -1771,29 +1696,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
pos = 0;
last = 1 << p->bits;
- for(idx = pos; idx < last ; idx++) {
- if (p->child[idx]) {
-
- /* Decend if tnode */
-
- while (IS_TNODE(p->child[idx])) {
- p = (struct tnode*) p->child[idx];
- idx = 0;
-
- /* Rightmost non-NULL branch */
- if (p && IS_TNODE(p))
- while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
-
- /* Done with this tnode? */
- if (idx >= (1 << p->bits) || p->child[idx] == NULL )
- goto up;
- }
- return (struct leaf*) p->child[idx];
+ for (idx = pos; idx < last ; idx++) {
+ c = rcu_dereference(p->child[idx]);
+
+ if (!c)
+ continue;
+
+ /* Decend if tnode */
+ while (IS_TNODE(c)) {
+ p = (struct tnode *) c;
+ idx = 0;
+
+ /* Rightmost non-NULL branch */
+ if (p && IS_TNODE(p))
+ while (!(c = rcu_dereference(p->child[idx]))
+ && idx < (1<<p->bits)) idx++;
+
+ /* Done with this tnode? */
+ if (idx >= (1 << p->bits) || !c)
+ goto up;
}
+ return (struct leaf *) c;
}
up:
/* No more children go up one step */
- c = (struct node*) p;
+ c = (struct node *) p;
p = (struct tnode *) NODE_PARENT(p);
}
return NULL; /* Ready. Root of trie */
@@ -1807,7 +1734,7 @@ static int fn_trie_flush(struct fib_table *tb)
t->revision++;
- for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+ for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
found += trie_flush_leaf(t, l);
if (ll && hlist_empty(&ll->list))
@@ -1818,12 +1745,11 @@ static int fn_trie_flush(struct fib_table *tb)
if (ll && hlist_empty(&ll->list))
trie_leaf_remove(t, ll->key);
- if (trie_debug)
- printk("trie_flush found=%d\n", found);
+ pr_debug("trie_flush found=%d\n", found);
return found;
}
-static int trie_last_dflt=-1;
+static int trie_last_dflt = -1;
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
@@ -1840,7 +1766,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
last_resort = NULL;
order = -1;
- read_lock(&fib_lock);
+ rcu_read_lock();
l = fib_find_node(t, 0);
if (!l)
@@ -1853,20 +1779,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
if (list_empty(fa_head))
goto out;
- list_for_each_entry(fa, fa_head, fa_list) {
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
-
+
if (fa->fa_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
-
+
if (next_fi->fib_priority > res->fi->fib_priority)
break;
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
fa->fa_state |= FA_S_ACCESSED;
-
+
if (fi == NULL) {
if (next_fi != res->fi)
break;
@@ -1904,7 +1830,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
}
trie_last_dflt = last_idx;
out:;
- read_unlock(&fib_lock);
+ rcu_read_unlock();
}
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
@@ -1913,26 +1839,19 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
int i, s_i;
struct fib_alias *fa;
- u32 xkey=htonl(key);
+ u32 xkey = htonl(key);
- s_i=cb->args[3];
+ s_i = cb->args[3];
i = 0;
- list_for_each_entry(fa, fah, fa_list) {
+ /* rcu_read_lock is hold by caller */
+
+ list_for_each_entry_rcu(fa, fah, fa_list) {
if (i < s_i) {
i++;
continue;
}
- if (fa->fa_info->fib_nh == NULL) {
- printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
- i++;
- continue;
- }
- if (fa->fa_info == NULL) {
- printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
- i++;
- continue;
- }
+ BUG_ON(!fa->fa_info);
if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
@@ -1946,10 +1865,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
fa->fa_info, 0) < 0) {
cb->args[3] = i;
return -1;
- }
+ }
i++;
}
- cb->args[3]=i;
+ cb->args[3] = i;
return skb->len;
}
@@ -1959,10 +1878,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
int h, s_h;
struct list_head *fa_head;
struct leaf *l = NULL;
- s_h=cb->args[2];
- for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+ s_h = cb->args[2];
+ for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
if (h < s_h)
continue;
if (h > s_h)
@@ -1970,7 +1889,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
sizeof(cb->args) - 3*sizeof(cb->args[0]));
fa_head = get_fa_head(l, plen);
-
+
if (!fa_head)
continue;
@@ -1978,11 +1897,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
continue;
if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
- cb->args[2]=h;
+ cb->args[2] = h;
return -1;
}
}
- cb->args[2]=h;
+ cb->args[2] = h;
return skb->len;
}
@@ -1993,25 +1912,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
s_m = cb->args[1];
- read_lock(&fib_lock);
- for (m=0; m<=32; m++) {
-
+ rcu_read_lock();
+ for (m = 0; m <= 32; m++) {
if (m < s_m)
continue;
if (m > s_m)
memset(&cb->args[2], 0,
- sizeof(cb->args) - 2*sizeof(cb->args[0]));
+ sizeof(cb->args) - 2*sizeof(cb->args[0]));
if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
cb->args[1] = m;
goto out;
}
}
- read_unlock(&fib_lock);
+ rcu_read_unlock();
cb->args[1] = m;
return skb->len;
- out:
- read_unlock(&fib_lock);
+out:
+ rcu_read_unlock();
return -1;
}
@@ -2051,383 +1969,147 @@ struct fib_table * __init fib_hash_init(int id)
trie_init(t);
if (id == RT_TABLE_LOCAL)
- trie_local = t;
+ trie_local = t;
else if (id == RT_TABLE_MAIN)
- trie_main = t;
+ trie_main = t;
if (id == RT_TABLE_LOCAL)
- printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
+ printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
return tb;
}
-/* Trie dump functions */
-
-static void putspace_seq(struct seq_file *seq, int n)
-{
- while (n--) seq_printf(seq, " ");
-}
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator */
+struct fib_trie_iter {
+ struct tnode *tnode;
+ struct trie *trie;
+ unsigned index;
+ unsigned depth;
+};
-static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
+static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
- while (bits--)
- seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
-}
+ struct tnode *tn = iter->tnode;
+ unsigned cindex = iter->index;
+ struct tnode *p;
-static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
- int pend, int cindex, int bits)
-{
- putspace_seq(seq, indent);
- if (IS_LEAF(n))
- seq_printf(seq, "|");
- else
- seq_printf(seq, "+");
- if (bits) {
- seq_printf(seq, "%d/", cindex);
- printbin_seq(seq, cindex, bits);
- seq_printf(seq, ": ");
- }
- else
- seq_printf(seq, "<root>: ");
- seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
-
- if (IS_LEAF(n))
- seq_printf(seq, "key=%d.%d.%d.%d\n",
- n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
- else {
- int plen = ((struct tnode *)n)->pos;
- t_key prf=MASK_PFX(n->key, plen);
- seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
- prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
- }
- if (IS_LEAF(n)) {
- struct leaf *l=(struct leaf *)n;
- struct fib_alias *fa;
- int i;
- for (i=32; i>=0; i--)
- if (find_leaf_info(&l->list, i)) {
-
- struct list_head *fa_head = get_fa_head(l, i);
-
- if (!fa_head)
- continue;
-
- if (list_empty(fa_head))
- continue;
-
- putspace_seq(seq, indent+2);
- seq_printf(seq, "{/%d...dumping}\n", i);
-
-
- list_for_each_entry(fa, fa_head, fa_list) {
- putspace_seq(seq, indent+2);
- if (fa->fa_info->fib_nh == NULL) {
- seq_printf(seq, "Error _fib_nh=NULL\n");
- continue;
- }
- if (fa->fa_info == NULL) {
- seq_printf(seq, "Error fa_info=NULL\n");
- continue;
- }
-
- seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
- fa->fa_type,
- fa->fa_scope,
- fa->fa_tos);
- }
- }
- }
- else if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *)n;
- putspace_seq(seq, indent); seq_printf(seq, "| ");
- seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
- printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
- seq_printf(seq, "}\n");
- putspace_seq(seq, indent); seq_printf(seq, "| ");
- seq_printf(seq, "{pos=%d", tn->pos);
- seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
- seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
- putspace_seq(seq, indent); seq_printf(seq, "| ");
- seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
- }
-}
+ pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+ iter->tnode, iter->index, iter->depth);
+rescan:
+ while (cindex < (1<<tn->bits)) {
+ struct node *n = tnode_get_child(tn, cindex);
-static void trie_dump_seq(struct seq_file *seq, struct trie *t)
-{
- struct node *n = t->trie;
- int cindex=0;
- int indent=1;
- int pend=0;
- int depth = 0;
-
- read_lock(&fib_lock);
-
- seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
- if (n) {
- printnode_seq(seq, indent, n, pend, cindex, 0);
- if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
- putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
- indent += 3;
- depth++;
-
- while (tn && cindex < (1 << tn->bits)) {
- if (tn->child[cindex]) {
-
- /* Got a child */
-
- printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
- if (IS_LEAF(tn->child[cindex])) {
- cindex++;
-
- }
- else {
- /*
- * New tnode. Decend one level
- */
-
- depth++;
- n = tn->child[cindex];
- tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
- putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
- indent+=3;
- cindex=0;
- }
- }
- else
- cindex++;
-
- /*
- * Test if we are done
- */
-
- while (cindex >= (1 << tn->bits)) {
-
- /*
- * Move upwards and test for root
- * pop off all traversed nodes
- */
-
- if (NODE_PARENT(tn) == NULL) {
- tn = NULL;
- n = NULL;
- break;
- }
- else {
- cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
- tn = NODE_PARENT(tn);
- cindex++;
- n = (struct node *)tn;
- pend = tn->pos+tn->bits;
- indent-=3;
- depth--;
- }
- }
+ if (n) {
+ if (IS_LEAF(n)) {
+ iter->tnode = tn;
+ iter->index = cindex + 1;
+ } else {
+ /* push down one level */
+ iter->tnode = (struct tnode *) n;
+ iter->index = 0;
+ ++iter->depth;
}
+ return n;
}
- else n = NULL;
- }
- else seq_printf(seq, "------ trie is empty\n");
-
- read_unlock(&fib_lock);
-}
-static struct trie_stat *trie_stat_new(void)
-{
- struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
- int i;
-
- if (s) {
- s->totdepth = 0;
- s->maxdepth = 0;
- s->tnodes = 0;
- s->leaves = 0;
- s->nullpointers = 0;
-
- for(i=0; i< MAX_CHILDS; i++)
- s->nodesizes[i] = 0;
+ ++cindex;
}
- return s;
-}
-static struct trie_stat *trie_collect_stats(struct trie *t)
-{
- struct node *n = t->trie;
- struct trie_stat *s = trie_stat_new();
- int cindex = 0;
- int indent = 1;
- int pend = 0;
- int depth = 0;
-
- read_lock(&fib_lock);
-
- if (s) {
- if (n) {
- if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
- indent += 3;
- s->nodesizes[tn->bits]++;
- depth++;
-
- while (tn && cindex < (1 << tn->bits)) {
- if (tn->child[cindex]) {
- /* Got a child */
-
- if (IS_LEAF(tn->child[cindex])) {
- cindex++;
-
- /* stats */
- if (depth > s->maxdepth)
- s->maxdepth = depth;
- s->totdepth += depth;
- s->leaves++;
- }
-
- else {
- /*
- * New tnode. Decend one level
- */
-
- s->tnodes++;
- s->nodesizes[tn->bits]++;
- depth++;
-
- n = tn->child[cindex];
- tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
-
- indent += 3;
- cindex = 0;
- }
- }
- else {
- cindex++;
- s->nullpointers++;
- }
-
- /*
- * Test if we are done
- */
-
- while (cindex >= (1 << tn->bits)) {
-
- /*
- * Move upwards and test for root
- * pop off all traversed nodes
- */
-
-
- if (NODE_PARENT(tn) == NULL) {
- tn = NULL;
- n = NULL;
- break;
- }
- else {
- cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
- tn = NODE_PARENT(tn);
- cindex++;
- n = (struct node *)tn;
- pend = tn->pos+tn->bits;
- indent -= 3;
- depth--;
- }
- }
- }
- }
- else n = NULL;
- }
+ /* Current node exhausted, pop back up */
+ p = NODE_PARENT(tn);
+ if (p) {
+ cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
+ tn = p;
+ --iter->depth;
+ goto rescan;
}
- read_unlock(&fib_lock);
- return s;
-}
-
-#ifdef CONFIG_PROC_FS
-
-static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
-{
+ /* got root? */
return NULL;
}
-static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
+static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
{
- return NULL;
-}
-
-static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
-{
- void *v = NULL;
+ struct node *n = rcu_dereference(t->trie);
- if (ip_fib_main_table)
- v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
- return v;
+ if (n && IS_TNODE(n)) {
+ iter->tnode = (struct tnode *) n;
+ iter->trie = t;
+ iter->index = 0;
+ iter->depth = 1;
+ return n;
+ }
+ return NULL;
}
-static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
-}
+ struct node *n;
+ struct fib_trie_iter iter;
-static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
-{
+ memset(s, 0, sizeof(*s));
+ rcu_read_lock();
+ for (n = fib_trie_get_first(&iter, t); n;
+ n = fib_trie_get_next(&iter)) {
+ if (IS_LEAF(n)) {
+ s->leaves++;
+ s->totdepth += iter.depth;
+ if (iter.depth > s->maxdepth)
+ s->maxdepth = iter.depth;
+ } else {
+ const struct tnode *tn = (const struct tnode *) n;
+ int i;
+
+ s->tnodes++;
+ s->nodesizes[tn->bits]++;
+ for (i = 0; i < (1<<tn->bits); i++)
+ if (!tn->child[i])
+ s->nullpointers++;
+ }
+ }
+ rcu_read_unlock();
}
/*
* This outputs /proc/net/fib_triestats
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
*/
-
-static void collect_and_show(struct trie *t, struct seq_file *seq)
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
- int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
- int i, max, pointers;
- struct trie_stat *stat;
- int avdepth;
+ unsigned i, max, pointers, bytes, avdepth;
- stat = trie_collect_stats(t);
+ if (stat->leaves)
+ avdepth = stat->totdepth*100 / stat->leaves;
+ else
+ avdepth = 0;
- bytes=0;
- seq_printf(seq, "trie=%p\n", t);
+ seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+ seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
- if (stat) {
- if (stat->leaves)
- avdepth=stat->totdepth*100 / stat->leaves;
- else
- avdepth=0;
- seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
- seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
-
- seq_printf(seq, "Leaves: %d\n", stat->leaves);
- bytes += sizeof(struct leaf) * stat->leaves;
- seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
- bytes += sizeof(struct tnode) * stat->tnodes;
-
- max = MAX_CHILDS-1;
-
- while (max >= 0 && stat->nodesizes[max] == 0)
- max--;
- pointers = 0;
-
- for (i = 1; i <= max; i++)
- if (stat->nodesizes[i] != 0) {
- seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
- pointers += (1<<i) * stat->nodesizes[i];
- }
- seq_printf(seq, "\n");
- seq_printf(seq, "Pointers: %d\n", pointers);
- bytes += sizeof(struct node *) * pointers;
- seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
- seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
+ seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- kfree(stat);
- }
+ bytes = sizeof(struct leaf) * stat->leaves;
+ seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
+ bytes += sizeof(struct tnode) * stat->tnodes;
+
+ max = MAX_CHILDS-1;
+ while (max >= 0 && stat->nodesizes[max] == 0)
+ max--;
+
+ pointers = 0;
+ for (i = 1; i <= max; i++)
+ if (stat->nodesizes[i] != 0) {
+ seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
+ pointers += (1<<i) * stat->nodesizes[i];
+ }
+ seq_putc(seq, '\n');
+ seq_printf(seq, "\tPointers: %d\n", pointers);
+
+ bytes += sizeof(struct node *) * pointers;
+ seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
#ifdef CONFIG_IP_FIB_TRIE_STATS
seq_printf(seq, "Counters:\n---------\n");
@@ -2445,168 +2127,378 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- char bf[128];
+ struct trie_stat *stat;
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct leaf), sizeof(struct tnode));
- if (trie_local)
- collect_and_show(trie_local, seq);
+ stat = kmalloc(sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+
+ seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+ sizeof(struct leaf), sizeof(struct tnode));
- if (trie_main)
- collect_and_show(trie_main, seq);
+ if (trie_local) {
+ seq_printf(seq, "Local:\n");
+ trie_collect_stats(trie_local, stat);
+ trie_show_stats(seq, stat);
}
- else {
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X", 200, 400);
-
- seq_printf(seq, "%-127s\n", bf);
+
+ if (trie_main) {
+ seq_printf(seq, "Main:\n");
+ trie_collect_stats(trie_main, stat);
+ trie_show_stats(seq, stat);
}
+ kfree(stat);
+
return 0;
}
-static struct seq_operations fib_triestat_seq_ops = {
- .start = fib_triestat_seq_start,
- .next = fib_triestat_seq_next,
- .stop = fib_triestat_seq_stop,
- .show = fib_triestat_seq_show,
-};
-
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
-
- rc = seq_open(file, &fib_triestat_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
-out:
- return rc;
-out_kfree:
- goto out;
+ return single_open(file, fib_triestat_seq_show, NULL);
}
-static struct file_operations fib_triestat_seq_fops = {
+static struct file_operations fib_triestat_fops = {
.owner = THIS_MODULE,
.open = fib_triestat_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = single_release,
};
-int __init fib_stat_proc_init(void)
+static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
+ loff_t pos)
{
- if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
- return -ENOMEM;
- return 0;
+ loff_t idx = 0;
+ struct node *n;
+
+ for (n = fib_trie_get_first(iter, trie_local);
+ n; ++idx, n = fib_trie_get_next(iter)) {
+ if (pos == idx)
+ return n;
+ }
+
+ for (n = fib_trie_get_first(iter, trie_main);
+ n; ++idx, n = fib_trie_get_next(iter)) {
+ if (pos == idx)
+ return n;
+ }
+ return NULL;
}
-void __init fib_stat_proc_exit(void)
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
{
- proc_net_remove("fib_triestat");
+ rcu_read_lock();
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+ return fib_trie_get_idx(seq->private, *pos - 1);
}
-static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct fib_trie_iter *iter = seq->private;
+ void *l = v;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return fib_trie_get_idx(iter, 0);
+
+ v = fib_trie_get_next(iter);
+ BUG_ON(v == l);
+ if (v)
+ return v;
+
+ /* continue scan in next trie */
+ if (iter->trie == trie_local)
+ return fib_trie_get_first(iter, trie_main);
+
return NULL;
}
-static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
- return NULL;
+ rcu_read_unlock();
}
-static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+static void seq_indent(struct seq_file *seq, int n)
+{
+ while (n-- > 0) seq_puts(seq, " ");
+}
+
+static inline const char *rtn_scope(enum rt_scope_t s)
{
- void *v = NULL;
+ static char buf[32];
- if (ip_fib_main_table)
- v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
- return v;
+ switch(s) {
+ case RT_SCOPE_UNIVERSE: return "universe";
+ case RT_SCOPE_SITE: return "site";
+ case RT_SCOPE_LINK: return "link";
+ case RT_SCOPE_HOST: return "host";
+ case RT_SCOPE_NOWHERE: return "nowhere";
+ default:
+ snprintf(buf, sizeof(buf), "scope=%d", s);
+ return buf;
+ }
}
-static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static const char *rtn_type_names[__RTN_MAX] = {
+ [RTN_UNSPEC] = "UNSPEC",
+ [RTN_UNICAST] = "UNICAST",
+ [RTN_LOCAL] = "LOCAL",
+ [RTN_BROADCAST] = "BROADCAST",
+ [RTN_ANYCAST] = "ANYCAST",
+ [RTN_MULTICAST] = "MULTICAST",
+ [RTN_BLACKHOLE] = "BLACKHOLE",
+ [RTN_UNREACHABLE] = "UNREACHABLE",
+ [RTN_PROHIBIT] = "PROHIBIT",
+ [RTN_THROW] = "THROW",
+ [RTN_NAT] = "NAT",
+ [RTN_XRESOLVE] = "XRESOLVE",
+};
+
+static inline const char *rtn_type(unsigned t)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
+ static char buf[32];
+
+ if (t < __RTN_MAX && rtn_type_names[t])
+ return rtn_type_names[t];
+ snprintf(buf, sizeof(buf), "type %d", t);
+ return buf;
}
-static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+/* Pretty print the trie */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+ const struct fib_trie_iter *iter = seq->private;
+ struct node *n = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ if (IS_TNODE(n)) {
+ struct tnode *tn = (struct tnode *) n;
+ t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
+
+ if (!NODE_PARENT(n)) {
+ if (iter->trie == trie_local)
+ seq_puts(seq, "<local>:\n");
+ else
+ seq_puts(seq, "<main>:\n");
+ }
+ seq_indent(seq, iter->depth-1);
+ seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
+ NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
+ tn->empty_children);
+
+ } else {
+ struct leaf *l = (struct leaf *) n;
+ int i;
+ u32 val = ntohl(l->key);
+
+ seq_indent(seq, iter->depth);
+ seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
+ for (i = 32; i >= 0; i--) {
+ struct leaf_info *li = find_leaf_info(l, i);
+ if (li) {
+ struct fib_alias *fa;
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ seq_indent(seq, iter->depth+1);
+ seq_printf(seq, " /%d %s %s", i,
+ rtn_scope(fa->fa_scope),
+ rtn_type(fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, "tos =%d\n",
+ fa->fa_tos);
+ seq_putc(seq, '\n');
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static struct seq_operations fib_trie_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_trie_seq_show,
+};
+
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &fib_trie_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations fib_trie_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_trie_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
{
+ static unsigned type2flags[RTN_MAX + 1] = {
+ [7] = RTF_REJECT, [8] = RTF_REJECT,
+ };
+ unsigned flags = type2flags[type];
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == 0xFFFFFFFF)
+ flags |= RTF_HOST;
+ flags |= RTF_UP;
+ return flags;
}
/*
- * This outputs /proc/net/fib_trie.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
+ * This outputs /proc/net/route.
+ * The format of the file is not supposed to be changed
+ * and needs to be same as fib_hash output to avoid breaking
+ * legacy utilities
*/
-
-static int fib_trie_seq_show(struct seq_file *seq, void *v)
+static int fib_route_seq_show(struct seq_file *seq, void *v)
{
+ struct leaf *l = v;
+ int i;
char bf[128];
if (v == SEQ_START_TOKEN) {
- if (trie_local)
- trie_dump_seq(seq, trie_local);
-
- if (trie_main)
- trie_dump_seq(seq, trie_main);
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ return 0;
}
- else {
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X", 200, 400);
- seq_printf(seq, "%-127s\n", bf);
+ if (IS_TNODE(l))
+ return 0;
+
+ for (i=32; i>=0; i--) {
+ struct leaf_info *li = find_leaf_info(l, i);
+ struct fib_alias *fa;
+ u32 mask, prefix;
+
+ if (!li)
+ continue;
+
+ mask = inet_make_mask(li->plen);
+ prefix = htonl(l->key);
+
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+
+ if (fa->fa_type == RTN_BROADCAST
+ || fa->fa_type == RTN_MULTICAST)
+ continue;
+
+ if (fi)
+ snprintf(bf, sizeof(bf),
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ snprintf(bf, sizeof(bf),
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_printf(seq, "%-127s\n", bf);
+ }
}
return 0;
}
-static struct seq_operations fib_trie_seq_ops = {
- .start = fib_trie_seq_start,
- .next = fib_trie_seq_next,
- .stop = fib_trie_seq_stop,
- .show = fib_trie_seq_show,
+static struct seq_operations fib_route_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_route_seq_show,
};
-static int fib_trie_seq_open(struct inode *inode, struct file *file)
+static int fib_route_seq_open(struct inode *inode, struct file *file)
{
struct seq_file *seq;
int rc = -ENOMEM;
+ struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
- rc = seq_open(file, &fib_trie_seq_ops);
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &fib_route_seq_ops);
if (rc)
goto out_kfree;
- seq = file->private_data;
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
out:
return rc;
out_kfree:
+ kfree(s);
goto out;
}
-static struct file_operations fib_trie_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_trie_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release= seq_release_private,
+static struct file_operations fib_route_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_route_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
};
int __init fib_proc_init(void)
{
- if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
- return -ENOMEM;
+ if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
+ goto out1;
+
+ if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
+ goto out2;
+
+ if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
+ goto out3;
+
return 0;
+
+out3:
+ proc_net_remove("fib_triestat");
+out2:
+ proc_net_remove("fib_trie");
+out1:
+ return -ENOMEM;
}
void __init fib_proc_exit(void)
{
proc_net_remove("fib_trie");
+ proc_net_remove("fib_triestat");
+ proc_net_remove("route");
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 279f57abfecb..175e093ec564 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
/*
* Statistics
*/
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -188,7 +188,7 @@ struct icmp_err icmp_err_convert[] = {
/* Control parameters for ECHO replies. */
int sysctl_icmp_echo_ignore_all;
-int sysctl_icmp_echo_ignore_broadcasts;
+int sysctl_icmp_echo_ignore_broadcasts = 1;
/* Control parameter - ignore bogus broadcast responses? */
int sysctl_icmp_ignore_bogus_error_responses;
@@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
{
struct sk_buff *skb;
- ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
- icmp_param->data_len+icmp_param->head_len,
- icmp_param->head_len,
- ipc, rt, MSG_DONTWAIT);
-
- if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+ if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT) < 0)
+ ip_flush_pending_frames(icmp_socket->sk);
+ else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = skb->h.icmph;
unsigned int csum = 0;
struct sk_buff *skb1;
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
break;
case ICMP_FRAG_NEEDED:
if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(
- printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+ LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
"fragmentation needed "
"and DF set.\n",
- NIPQUAD(iph->daddr)));
+ NIPQUAD(iph->daddr));
} else {
info = ip_rt_frag_needed(iph,
ntohs(icmph->un.frag.mtu));
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(
- printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+ LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
"Route Failed.\n",
- NIPQUAD(iph->daddr)));
+ NIPQUAD(iph->daddr));
break;
default:
break;
@@ -936,8 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
case CHECKSUM_NONE:
if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
goto error;
@@ -1111,12 +1108,9 @@ void __init icmp_init(struct net_proto_family *ops)
struct inet_sock *inet;
int i;
- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
int err;
- if (!cpu_possible(i))
- continue;
-
err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
&per_cpu(__icmp_socket, i));
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5088f90835ae..c6247fc84060 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
case IGMP_MTRACE_RESP:
break;
default:
- NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+ NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
}
in_dev_put(in_dev);
kfree_skb(skb);
@@ -1323,7 +1323,7 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
- idev = __in_dev_get(dev);
+ idev = __in_dev_get_rtnl(dev);
}
return idev;
}
@@ -1603,7 +1603,7 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
}
pmc->sources = NULL;
pmc->sfmode = MCAST_EXCLUDE;
- pmc->sfcount[MCAST_EXCLUDE] = 0;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
pmc->sfcount[MCAST_EXCLUDE] = 1;
}
@@ -1908,8 +1908,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
goto done;
}
- } else
+ } else {
newpsl = NULL;
+ (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, 0, NULL, 0);
+ }
psl = pmc->sflist;
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000000..94468a76c5b4
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET connection oriented protocols.
+ *
+ * Authors: See the TCP sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+
+static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+{
+ const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
+ struct sock *sk2;
+ struct hlist_node *node;
+ int reuse = sk->sk_reuse;
+
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
+ !inet_v6_ipv6only(sk2) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if (!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) {
+ const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
+ if (!sk2_rcv_saddr || !sk_rcv_saddr ||
+ sk2_rcv_saddr == sk_rcv_saddr)
+ break;
+ }
+ }
+ }
+ return node != NULL;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+ struct sock *sk, unsigned short snum)
+{
+ struct inet_bind_hashbucket *head;
+ struct hlist_node *node;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ local_bh_disable();
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ int rover;
+
+ spin_lock(&hashinfo->portalloc_lock);
+ if (hashinfo->port_rover < low)
+ rover = low;
+ else
+ rover = hashinfo->port_rover;
+ do {
+ rover++;
+ if (rover > high)
+ rover = low;
+ head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == rover)
+ goto next;
+ break;
+ next:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ hashinfo->port_rover = rover;
+ spin_unlock(&hashinfo->portalloc_lock);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
+ ret = 1;
+ if (remaining <= 0)
+ goto fail;
+
+ /* OK, here is the one we will use. HEAD is
+ * non-NULL and we hold it's mutex.
+ */
+ snum = rover;
+ } else {
+ head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == snum)
+ goto tb_found;
+ }
+ tb = NULL;
+ goto tb_not_found;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse > 1)
+ goto success;
+ if (tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+ goto success;
+ } else {
+ ret = 1;
+ if (inet_csk_bind_conflict(sk, tb))
+ goto fail_unlock;
+ }
+ }
+tb_not_found:
+ ret = 1;
+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+ goto fail_unlock;
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ } else if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+success:
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+fail:
+ local_bh_enable();
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ err = 0;
+ if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sock *newsk;
+ int error;
+
+ lock_sock(sk);
+
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+ error = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out_err;
+
+ /* Find already established connection */
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* If this is a non blocking socket don't sleep */
+ error = -EAGAIN;
+ if (!timeo)
+ goto out_err;
+
+ error = inet_csk_wait_for_connect(sk, timeo);
+ if (error)
+ goto out_err;
+ }
+
+ newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+ BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
+out:
+ release_sock(sk);
+ return newsk;
+out_err:
+ newsk = NULL;
+ *err = error;
+ goto out;
+}
+
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish use just one timer maintaining a list of expire jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+ void (*retransmit_handler)(unsigned long),
+ void (*delack_handler)(unsigned long),
+ void (*keepalive_handler)(unsigned long))
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ init_timer(&icsk->icsk_retransmit_timer);
+ init_timer(&icsk->icsk_delack_timer);
+ init_timer(&sk->sk_timer);
+
+ icsk->icsk_retransmit_timer.function = retransmit_handler;
+ icsk->icsk_delack_timer.function = delack_handler;
+ sk->sk_timer.function = keepalive_handler;
+
+ icsk->icsk_retransmit_timer.data =
+ icsk->icsk_delack_timer.data =
+ sk->sk_timer.data = (unsigned long)sk;
+
+ icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry* inet_csk_route_req(struct sock *sk,
+ const struct request_sock *req)
+{
+ struct rtable *rt;
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct ip_options *opt = inet_rsk(req)->opt;
+ struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .nl_u = { .ip4_u =
+ { .daddr = ((opt && opt->srr) ?
+ opt->faddr :
+ ireq->rmt_addr),
+ .saddr = ireq->loc_addr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = sk->sk_protocol,
+ .uli_u = { .ports =
+ { .sport = inet_sk(sk)->sport,
+ .dport = ireq->rmt_port } } };
+
+ if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+ ip_rt_put(rt);
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+ return &rt->u.dst;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
+ const u32 rnd, const u16 synq_hsize)
+{
+ return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) 1
+#endif
+
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __u16 rport, const __u32 raddr,
+ const __u32 laddr)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req, **prev;
+
+ for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries)];
+ (req = *prev) != NULL;
+ prev = &req->dl_next) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (ireq->rmt_port == rport &&
+ ireq->rmt_addr == raddr &&
+ ireq->loc_addr == laddr &&
+ AF_INET_FAMILY(req->rsk_ops->family)) {
+ BUG_TRAP(!req->sk);
+ *prevp = prev;
+ break;
+ }
+ }
+
+ return req;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ const unsigned timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+ const unsigned long interval,
+ const unsigned long timeout,
+ const unsigned long max_rto)
+{
+ struct inet_connection_sock *icsk = inet_csk(parent);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct listen_sock *lopt = queue->listen_opt;
+ int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
+ unsigned long now = jiffies;
+ struct request_sock **reqp, *req;
+ int i, budget;
+
+ if (lopt == NULL || lopt->qlen == 0)
+ return;
+
+ /* Normally all the openreqs are young and become mature
+ * (i.e. converted to established socket) for first timeout.
+ * If synack was not acknowledged for 3 seconds, it means
+ * one of the following things: synack was lost, ack was lost,
+ * rtt is high or nobody planned to ack (i.e. synflood).
+ * When server is a bit loaded, queue is populated with old
+ * open requests, reducing effective size of queue.
+ * When server is well loaded, queue size reduces to zero
+ * after several minutes of work. It is not synflood,
+ * it is normal operation. The solution is pruning
+ * too old entries overriding normal timeout, when
+ * situation becomes dangerous.
+ *
+ * Essentially, we reserve half of room for young
+ * embrions; and abort old ones without pity, if old
+ * ones are about to clog our table.
+ */
+ if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+ int young = (lopt->qlen_young<<1);
+
+ while (thresh > 2) {
+ if (lopt->qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
+ }
+ }
+
+ if (queue->rskq_defer_accept)
+ max_retries = queue->rskq_defer_accept;
+
+ budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+ i = lopt->clock_hand;
+
+ do {
+ reqp=&lopt->syn_table[i];
+ while ((req = *reqp) != NULL) {
+ if (time_after_eq(now, req->expires)) {
+ if ((req->retrans < thresh ||
+ (inet_rsk(req)->acked && req->retrans < max_retries))
+ && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+ unsigned long timeo;
+
+ if (req->retrans++ == 0)
+ lopt->qlen_young--;
+ timeo = min((timeout << req->retrans), max_rto);
+ req->expires = now + timeo;
+ reqp = &req->dl_next;
+ continue;
+ }
+
+ /* Drop this request */
+ inet_csk_reqsk_queue_unlink(parent, req, reqp);
+ reqsk_queue_removed(queue, req);
+ reqsk_free(req);
+ continue;
+ }
+ reqp = &req->dl_next;
+ }
+
+ i = (i + 1) & (lopt->nr_table_entries - 1);
+
+ } while (--budget > 0);
+
+ lopt->clock_hand = i;
+
+ if (lopt->qlen)
+ inet_csk_reset_keepalive_timer(parent, interval);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+ const gfp_t priority)
+{
+ struct sock *newsk = sk_clone(sk, priority);
+
+ if (newsk != NULL) {
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+ newsk->sk_state = TCP_SYN_RECV;
+ newicsk->icsk_bind_hash = NULL;
+
+ inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+ newsk->sk_write_space = sk_stream_write_space;
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ }
+ return newsk;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+ BUG_TRAP(sk->sk_state == TCP_CLOSE);
+ BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+ /* It cannot be in hash table! */
+ BUG_TRAP(sk_unhashed(sk));
+
+ /* If it has not 0 inet_sk(sk)->num, it must be bound */
+ BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+
+ sk->sk_prot->destroy(sk);
+
+ sk_stream_kill_queues(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ atomic_dec(sk->sk_prot->orphan_count);
+ sock_put(sk);
+}
+
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+
+ if (rc != 0)
+ return rc;
+
+ sk->sk_max_ack_backlog = 0;
+ sk->sk_ack_backlog = 0;
+ inet_csk_delack_init(sk);
+
+ /* There is race window here: we announce ourselves listening,
+ * but this transition is still not validated by get_port().
+ * It is OK, because this socket enters to hash table only
+ * after validation is complete.
+ */
+ sk->sk_state = TCP_LISTEN;
+ if (!sk->sk_prot->get_port(sk, inet->num)) {
+ inet->sport = htons(inet->num);
+
+ sk_dst_reset(sk);
+ sk->sk_prot->hash(sk);
+
+ return 0;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ __reqsk_queue_destroy(&icsk->icsk_accept_queue);
+ return -EADDRINUSE;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *acc_req;
+ struct request_sock *req;
+
+ inet_csk_delete_keepalive_timer(sk);
+
+ /* make all the listen_opt local to us */
+ acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+
+ /* Following specs, it would be better either to send FIN
+ * (and enter FIN-WAIT-1, it is normal close)
+ * or to send active reset (abort).
+ * Certainly, it is pretty dangerous while synflood, but it is
+ * bad justification for our negligence 8)
+ * To be honest, we are not able to make either
+ * of the variants now. --ANK
+ */
+ reqsk_queue_destroy(&icsk->icsk_accept_queue);
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ BUG_TRAP(!sock_owned_by_user(child));
+ sock_hold(child);
+
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ atomic_inc(sk->sk_prot->orphan_count);
+
+ inet_csk_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+ __reqsk_free(req);
+ }
+ BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 000000000000..71f3c7350c6e
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
+/*
+ * inet_diag.c Module for monitoring INET transport protocols sockets.
+ *
+ * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+struct inet_diag_entry {
+ u32 *saddr;
+ u32 *daddr;
+ u16 sport;
+ u16 dport;
+ u16 family;
+ u16 userlocks;
+};
+
+static struct sock *idiagnl;
+
+#define INET_DIAG_PUT(skb, attrtype, attrlen) \
+ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+
+static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
+ int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ void *info = NULL;
+ struct inet_diag_meminfo *minfo = NULL;
+ unsigned char *b = skb->tail;
+ const struct inet_diag_handler *handler;
+
+ handler = inet_diag_table[unlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+
+ nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+ nlh->nlmsg_flags = nlmsg_flags;
+
+ r = NLMSG_DATA(nlh);
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
+ minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
+ sizeof(*minfo));
+ if (ext & (1 << (INET_DIAG_INFO - 1)))
+ info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
+ handler->idiag_info_size);
+
+ if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+ size_t len = strlen(icsk->icsk_ca_ops->name);
+ strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+ icsk->icsk_ca_ops->name);
+ }
+ }
+ r->idiag_family = sk->sk_family;
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
+ r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+ if (r->idiag_state == TCP_TIME_WAIT) {
+ const struct inet_timewait_sock *tw = inet_twsk(sk);
+ long tmo = tw->tw_ttd - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.idiag_sport = tw->tw_sport;
+ r->id.idiag_dport = tw->tw_dport;
+ r->id.idiag_src[0] = tw->tw_rcv_saddr;
+ r->id.idiag_dst[0] = tw->tw_daddr;
+ r->idiag_state = tw->tw_substate;
+ r->idiag_timer = 3;
+ r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &tcp6tw->tw_v6_rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &tcp6tw->tw_v6_daddr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+ }
+
+ r->id.idiag_sport = inet->sport;
+ r->id.idiag_dport = inet->dport;
+ r->id.idiag_src[0] = inet->rcv_saddr;
+ r->id.idiag_dst[0] = inet->daddr;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &np->rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &np->daddr);
+ }
+#endif
+
+#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ r->idiag_timer = 1;
+ r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ r->idiag_timer = 4;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (timer_pending(&sk->sk_timer)) {
+ r->idiag_timer = 2;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ r->idiag_uid = sock_i_uid(sk);
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (minfo) {
+ minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+ minfo->idiag_wmem = sk->sk_wmem_queued;
+ minfo->idiag_fmem = sk->sk_forward_alloc;
+ minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+ }
+
+ handler->idiag_get_info(sk, r, info);
+
+ if (sk->sk_state < TCP_TIME_WAIT &&
+ icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
+ icsk->icsk_ca_ops->get_info(sk, ext, skb);
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+rtattr_failure:
+nlmsg_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock *sk;
+ struct inet_diag_req *req = NLMSG_DATA(nlh);
+ struct sk_buff *rep;
+ struct inet_hashinfo *hashinfo;
+ const struct inet_diag_handler *handler;
+
+ handler = inet_diag_table[nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+
+ if (req->idiag_family == AF_INET) {
+ sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+ }
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ else if (req->idiag_family == AF_INET6) {
+ sk = inet6_lookup(hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+ }
+#endif
+ else {
+ return -EINVAL;
+ }
+
+ if (sk == NULL)
+ return -ENOENT;
+
+ err = -ESTALE;
+ if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
+ req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+ goto out;
+
+ err = -ENOMEM;
+ rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) +
+ handler->idiag_info_size + 64)),
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ if (inet_diag_fill(rep, sk, req->idiag_ext,
+ NETLINK_CB(in_skb).pid,
+ nlh->nlmsg_seq, 0, nlh) <= 0)
+ BUG();
+
+ err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+
+out:
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ else
+ sock_put(sk);
+ }
+ return err;
+}
+
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+ int words = bits >> 5;
+
+ bits &= 0x1f;
+
+ if (words) {
+ if (memcmp(a1, a2, words << 2))
+ return 0;
+ }
+ if (bits) {
+ __u32 w1, w2;
+ __u32 mask;
+
+ w1 = a1[words];
+ w2 = a2[words];
+
+ mask = htonl((0xffffffff) << (32 - bits));
+
+ if ((w1 ^ w2) & mask)
+ return 0;
+ }
+
+ return 1;
+}
+
+
+static int inet_diag_bc_run(const void *bc, int len,
+ const struct inet_diag_entry *entry)
+{
+ while (len > 0) {
+ int yes = 1;
+ const struct inet_diag_bc_op *op = bc;
+
+ switch (op->code) {
+ case INET_DIAG_BC_NOP:
+ break;
+ case INET_DIAG_BC_JMP:
+ yes = 0;
+ break;
+ case INET_DIAG_BC_S_GE:
+ yes = entry->sport >= op[1].no;
+ break;
+ case INET_DIAG_BC_S_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case INET_DIAG_BC_D_GE:
+ yes = entry->dport >= op[1].no;
+ break;
+ case INET_DIAG_BC_D_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case INET_DIAG_BC_AUTO:
+ yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+ break;
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND: {
+ struct inet_diag_hostcond *cond;
+ u32 *addr;
+
+ cond = (struct inet_diag_hostcond *)(op + 1);
+ if (cond->port != -1 &&
+ cond->port != (op->code == INET_DIAG_BC_S_COND ?
+ entry->sport : entry->dport)) {
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
+
+ if (op->code == INET_DIAG_BC_S_COND)
+ addr = entry->saddr;
+ else
+ addr = entry->daddr;
+
+ if (bitstring_match(addr, cond->addr, cond->prefix_len))
+ break;
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3, cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+ }
+
+ if (yes) {
+ len -= op->yes;
+ bc += op->yes;
+ } else {
+ len -= op->no;
+ bc += op->no;
+ }
+ }
+ return (len == 0);
+}
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+ while (len >= 0) {
+ const struct inet_diag_bc_op *op = bc;
+
+ if (cc > len)
+ return 0;
+ if (cc == len)
+ return 1;
+ if (op->yes < 4)
+ return 0;
+ len -= op->yes;
+ bc += op->yes;
+ }
+ return 0;
+}
+
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+ const unsigned char *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+ switch (op->code) {
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND:
+ case INET_DIAG_BC_S_GE:
+ case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_GE:
+ case INET_DIAG_BC_D_LE:
+ if (op->yes < 4 || op->yes > len + 4)
+ return -EINVAL;
+ case INET_DIAG_BC_JMP:
+ if (op->no < 4 || op->no > len + 4)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_NOP:
+ if (op->yes < 4 || op->yes > len + 4)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ bc += op->yes;
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL;
+}
+
+static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ struct inet_diag_entry entry;
+ struct rtattr *bc = (struct rtattr *)(r + 1);
+ struct inet_sock *inet = inet_sk(sk);
+
+ entry.family = sk->sk_family;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (entry.family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ entry.saddr = np->rcv_saddr.s6_addr32;
+ entry.daddr = np->daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry.saddr = &inet->rcv_saddr;
+ entry.daddr = &inet->daddr;
+ }
+ entry.sport = inet->num;
+ entry.dport = ntohs(inet->dport);
+ entry.userlocks = sk->sk_userlocks;
+
+ if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+ return 0;
+ }
+
+ return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
+ struct request_sock *req,
+ u32 pid, u32 seq,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned char *b = skb->tail;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+ nlh->nlmsg_flags = NLM_F_MULTI;
+ r = NLMSG_DATA(nlh);
+
+ r->idiag_family = sk->sk_family;
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = req->retrans;
+
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_cookie[0] = (u32)(unsigned long)req;
+ r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+ tmo = req->expires - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.idiag_sport = inet->sport;
+ r->id.idiag_dport = ireq->rmt_port;
+ r->id.idiag_src[0] = ireq->loc_addr;
+ r->id.idiag_dst[0] = ireq->rmt_addr;
+ r->idiag_expires = jiffies_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = sock_i_uid(sk);
+ r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &tcp6_rsk(req)->loc_addr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &tcp6_rsk(req)->rmt_addr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+
+ return skb->len;
+
+nlmsg_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_entry entry;
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt;
+ struct rtattr *bc = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ int j, s_j;
+ int reqnum, s_reqnum;
+ int err = 0;
+
+ s_j = cb->args[3];
+ s_reqnum = cb->args[4];
+
+ if (s_j > 0)
+ s_j--;
+
+ entry.family = sk->sk_family;
+
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ lopt = icsk->icsk_accept_queue.listen_opt;
+ if (!lopt || !lopt->qlen)
+ goto out;
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ bc = (struct rtattr *)(r + 1);
+ entry.sport = inet->num;
+ entry.userlocks = sk->sk_userlocks;
+ }
+
+ for (j = s_j; j < lopt->nr_table_entries; j++) {
+ struct request_sock *req, *head = lopt->syn_table[j];
+
+ reqnum = 0;
+ for (req = head; req; reqnum++, req = req->dl_next) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (reqnum < s_reqnum)
+ continue;
+ if (r->id.idiag_dport != ireq->rmt_port &&
+ r->id.idiag_dport)
+ continue;
+
+ if (bc) {
+ entry.saddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ (entry.family == AF_INET6) ?
+ tcp6_rsk(req)->loc_addr.s6_addr32 :
+#endif
+ &ireq->loc_addr;
+ entry.daddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ (entry.family == AF_INET6) ?
+ tcp6_rsk(req)->rmt_addr.s6_addr32 :
+#endif
+ &ireq->rmt_addr;
+ entry.dport = ntohs(ireq->rmt_port);
+
+ if (!inet_diag_bc_run(RTA_DATA(bc),
+ RTA_PAYLOAD(bc), &entry))
+ continue;
+ }
+
+ err = inet_diag_fill_req(skb, sk, req,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, cb->nlh);
+ if (err < 0) {
+ cb->args[3] = j + 1;
+ cb->args[4] = reqnum;
+ goto out;
+ }
+ }
+
+ s_reqnum = 0;
+ }
+
+out:
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return err;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int i, num;
+ int s_i, s_num;
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ const struct inet_diag_handler *handler;
+ struct inet_hashinfo *hashinfo;
+
+ handler = inet_diag_table[cb->nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+
+ inet_listen_lock(hashinfo);
+ for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+ struct sock *sk;
+ struct hlist_node *node;
+
+ num = 0;
+ sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!(r->idiag_states & TCPF_LISTEN) ||
+ r->id.idiag_dport ||
+ cb->args[3] > 0)
+ goto syn_recv;
+
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ inet_listen_unlock(hashinfo);
+ goto done;
+ }
+
+syn_recv:
+ if (!(r->idiag_states & TCPF_SYN_RECV))
+ goto next_listen;
+
+ if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+ inet_listen_unlock(hashinfo);
+ goto done;
+ }
+
+next_listen:
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ ++num;
+ }
+
+ s_num = 0;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+ inet_listen_unlock(hashinfo);
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ return skb->len;
+
+ for (i = s_i; i < hashinfo->ehash_size; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ struct sock *sk;
+ struct hlist_node *node;
+
+ if (i > s_i)
+ s_num = 0;
+
+ read_lock_bh(&head->lock);
+
+ num = 0;
+ sk_for_each(sk, node, &head->chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_normal;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next_normal;
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
+ goto next_normal;
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_normal:
+ ++num;
+ }
+
+ if (r->idiag_states & TCPF_TIME_WAIT) {
+ sk_for_each(sk, node,
+ &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_dying;
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_dying;
+ if (r->id.idiag_dport != inet->dport &&
+ r->id.idiag_dport)
+ goto next_dying;
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_dying:
+ ++num;
+ }
+ }
+ read_unlock_bh(&head->lock);
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+ return skb->len;
+}
+
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+
+static __inline__ int
+inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+ return 0;
+
+ if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
+ goto err_inval;
+
+ if (inet_diag_table[nlh->nlmsg_type] == NULL)
+ return -ENOENT;
+
+ if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
+ goto err_inval;
+
+ if (nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (nlh->nlmsg_len >
+ (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
+ struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
+ sizeof(struct inet_diag_req));
+ if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
+ rta->rta_len < 8 ||
+ rta->rta_len >
+ (nlh->nlmsg_len -
+ NLMSG_SPACE(sizeof(struct inet_diag_req))))
+ goto err_inval;
+ if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+ goto err_inval;
+ }
+ return netlink_dump_start(idiagnl, skb, nlh,
+ inet_diag_dump,
+ inet_diag_dump_done);
+ } else {
+ return inet_diag_get_exact(skb, nlh);
+ }
+
+err_inval:
+ return -EINVAL;
+}
+
+
+static inline void inet_diag_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr * nlh;
+
+ if (skb->len >= NLMSG_SPACE(0)) {
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return;
+ err = inet_diag_rcv_msg(skb, nlh);
+ if (err || nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, err);
+ }
+}
+
+static void inet_diag_rcv(struct sock *sk, int len)
+{
+ struct sk_buff *skb;
+ unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+
+ while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
+ inet_diag_rcv_skb(skb);
+ kfree_skb(skb);
+ }
+}
+
+static DEFINE_SPINLOCK(inet_diag_register_lock);
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+ int err = -EINVAL;
+
+ if (type >= INET_DIAG_GETSOCK_MAX)
+ goto out;
+
+ spin_lock(&inet_diag_register_lock);
+ err = -EEXIST;
+ if (inet_diag_table[type] == NULL) {
+ inet_diag_table[type] = h;
+ err = 0;
+ }
+ spin_unlock(&inet_diag_register_lock);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+
+ if (type >= INET_DIAG_GETSOCK_MAX)
+ return;
+
+ spin_lock(&inet_diag_register_lock);
+ inet_diag_table[type] = NULL;
+ spin_unlock(&inet_diag_register_lock);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+ const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+ sizeof(struct inet_diag_handler *));
+ int err = -ENOMEM;
+
+ inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL);
+ if (!inet_diag_table)
+ goto out;
+
+ memset(inet_diag_table, 0, inet_diag_table_size);
+ idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
+ THIS_MODULE);
+ if (idiagnl == NULL)
+ goto out_free_table;
+ err = 0;
+out:
+ return err;
+out_free_table:
+ kfree(inet_diag_table);
+ goto out;
+}
+
+static void __exit inet_diag_exit(void)
+{
+ sock_release(idiagnl->sk_socket);
+ kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 000000000000..e8d29fe736d2
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+ struct inet_bind_hashbucket *head,
+ const unsigned short snum)
+{
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+
+ if (tb != NULL) {
+ tb->port = snum;
+ tb->fastreuse = 0;
+ INIT_HLIST_HEAD(&tb->owners);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+}
+
+EXPORT_SYMBOL(inet_bind_bucket_create);
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+{
+ if (hlist_empty(&tb->owners)) {
+ __hlist_del(&tb->node);
+ kmem_cache_free(cachep, tb);
+ }
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ const unsigned short snum)
+{
+ inet_sk(sk)->num = snum;
+ sk_add_bind_node(sk, &tb->owners);
+ inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+EXPORT_SYMBOL(inet_bind_hash);
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ */
+static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+ struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ __sk_del_bind_node(sk);
+ inet_csk(sk)->icsk_bind_hash = NULL;
+ inet_sk(sk)->num = 0;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ local_bh_disable();
+ __inet_put_port(hashinfo, sk);
+ local_bh_enable();
+}
+
+EXPORT_SYMBOL(inet_put_port);
+
+/*
+ * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (wake up each
+ * exclusive lock release). It should be ifdefed really.
+ */
+void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+{
+ write_lock(&hashinfo->lhash_lock);
+
+ if (atomic_read(&hashinfo->lhash_users)) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&hashinfo->lhash_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&hashinfo->lhash_users))
+ break;
+ write_unlock_bh(&hashinfo->lhash_lock);
+ schedule();
+ write_lock_bh(&hashinfo->lhash_lock);
+ }
+
+ finish_wait(&hashinfo->lhash_wait, &wait);
+ }
+}
+
+EXPORT_SYMBOL(inet_listen_wlock);
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
+ const unsigned short hnum, const int dif)
+{
+ struct sock *result = NULL, *sk;
+ const struct hlist_node *node;
+ int hiscore = -1;
+
+ sk_for_each(sk, node, head) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (inet->num == hnum && !ipv6_only_sock(sk)) {
+ const __u32 rcv_saddr = inet->rcv_saddr;
+ int score = sk->sk_family == PF_INET ? 1 : 0;
+
+ if (rcv_saddr) {
+ if (rcv_saddr != daddr)
+ continue;
+ score += 2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score += 2;
+ }
+ if (score == 5)
+ return sk;
+ if (score > hiscore) {
+ hiscore = score;
+ result = sk;
+ }
+ }
+ }
+ return result;
+}
+
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 000000000000..a010e9a68811
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,385 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic TIME_WAIT sockets functions
+ *
+ * From code orinally in TCP
+ */
+
+#include <linux/config.h>
+
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+/* Must be called with locally disabled BHs. */
+void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_hashbucket *bhead;
+ struct inet_bind_bucket *tb;
+ /* Unlink from established hashes. */
+ struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+
+ write_lock(&ehead->lock);
+ if (hlist_unhashed(&tw->tw_node)) {
+ write_unlock(&ehead->lock);
+ return;
+ }
+ __hlist_del(&tw->tw_node);
+ sk_node_init(&tw->tw_node);
+ write_unlock(&ehead->lock);
+
+ /* Disassociate with bind bucket. */
+ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tb = tw->tw_tb;
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&bhead->lock);
+#ifdef SOCK_REFCNT_DEBUG
+ if (atomic_read(&tw->tw_refcnt) != 1) {
+ printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
+ tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+ }
+#endif
+ inet_twsk_put(tw);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_kill);
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+ struct inet_hashinfo *hashinfo)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ struct inet_bind_hashbucket *bhead;
+ /* Step 1: Put TW into bind hash. Original socket stays there too.
+ Note, that any socket with inet->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = icsk->icsk_bind_hash;
+ BUG_TRAP(icsk->icsk_bind_hash);
+ inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ spin_unlock(&bhead->lock);
+
+ write_lock(&ehead->lock);
+
+ /* Step 2: Remove SK from established hash. */
+ if (__sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+
+ /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+ inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
+ atomic_inc(&tw->tw_refcnt);
+
+ write_unlock(&ehead->lock);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+{
+ struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
+ SLAB_ATOMIC);
+ if (tw != NULL) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ /* Give us an identity. */
+ tw->tw_daddr = inet->daddr;
+ tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_num = inet->num;
+ tw->tw_state = TCP_TIME_WAIT;
+ tw->tw_substate = state;
+ tw->tw_sport = inet->sport;
+ tw->tw_dport = inet->dport;
+ tw->tw_family = sk->sk_family;
+ tw->tw_reuse = sk->sk_reuse;
+ tw->tw_hash = sk->sk_hash;
+ tw->tw_ipv6only = 0;
+ tw->tw_prot = sk->sk_prot_creator;
+ atomic_set(&tw->tw_refcnt, 1);
+ inet_twsk_dead_node_init(tw);
+ __module_get(tw->tw_prot->owner);
+ }
+
+ return tw;
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* Returns non-zero if quota exceeded. */
+static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+ const int slot)
+{
+ struct inet_timewait_sock *tw;
+ struct hlist_node *node;
+ unsigned int killed;
+ int ret;
+
+ /* NOTE: compare this to previous version where lock
+ * was released after detaching chain. It was racy,
+ * because tw buckets are scheduled in not serialized context
+ * in 2.3 (with netfilter), and with softnet it is common, because
+ * soft irqs are not sequenced.
+ */
+ killed = 0;
+ ret = 0;
+rescan:
+ inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ __inet_twsk_del_dead_node(tw);
+ spin_unlock(&twdr->death_lock);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+ inet_twsk_put(tw);
+ killed++;
+ spin_lock(&twdr->death_lock);
+ if (killed > INET_TWDR_TWKILL_QUOTA) {
+ ret = 1;
+ break;
+ }
+
+ /* While we dropped twdr->death_lock, another cpu may have
+ * killed off the next TW bucket in the list, therefore
+ * do a fresh re-read of the hlist head node with the
+ * lock reacquired. We still use the hlist traversal
+ * macro in order to get the prefetches.
+ */
+ goto rescan;
+ }
+
+ twdr->tw_count -= killed;
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+ return ret;
+}
+
+void inet_twdr_hangman(unsigned long data)
+{
+ struct inet_timewait_death_row *twdr;
+ int unsigned need_timer;
+
+ twdr = (struct inet_timewait_death_row *)data;
+ spin_lock(&twdr->death_lock);
+
+ if (twdr->tw_count == 0)
+ goto out;
+
+ need_timer = 0;
+ if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
+ twdr->thread_slots |= (1 << twdr->slot);
+ mb();
+ schedule_work(&twdr->twkill_work);
+ need_timer = 1;
+ } else {
+ /* We purged the entire slot, anything left? */
+ if (twdr->tw_count)
+ need_timer = 1;
+ }
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
+ if (need_timer)
+ mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+out:
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_hangman);
+
+extern void twkill_slots_invalid(void);
+
+void inet_twdr_twkill_work(void *data)
+{
+ struct inet_timewait_death_row *twdr = data;
+ int i;
+
+ if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
+ twkill_slots_invalid();
+
+ while (twdr->thread_slots) {
+ spin_lock_bh(&twdr->death_lock);
+ for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
+ if (!(twdr->thread_slots & (1 << i)))
+ continue;
+
+ while (inet_twdr_do_twkill_work(twdr, i) != 0) {
+ if (need_resched()) {
+ spin_unlock_bh(&twdr->death_lock);
+ schedule();
+ spin_lock_bh(&twdr->death_lock);
+ }
+ }
+
+ twdr->thread_slots &= ~(1 << i);
+ }
+ spin_unlock_bh(&twdr->death_lock);
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+ struct inet_timewait_death_row *twdr)
+{
+ spin_lock(&twdr->death_lock);
+ if (inet_twsk_del_dead_node(tw)) {
+ inet_twsk_put(tw);
+ if (--twdr->tw_count == 0)
+ del_timer(&twdr->tw_timer);
+ }
+ spin_unlock(&twdr->death_lock);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+}
+
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw,
+ struct inet_timewait_death_row *twdr,
+ const int timeo, const int timewait_len)
+{
+ struct hlist_head *list;
+ int slot;
+
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous seqments) are lost (probability of such event
+ * is p^(N+1), where p is probability to lose single packet and
+ * time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). Normal timewait length is calculated so, that we
+ * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux. following BSD, violates this requirement waiting
+ * only for 60sec, we should wait at least for 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced to catch old duplicate and
+ * responces to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to bounds required
+ * by RTO, rather than MSL. So, if peer understands PAWS, we
+ * kill tw bucket after 3.5*RTO (it is important that this number
+ * is greater than TS tick!) and detect old duplicates with help
+ * of PAWS.
+ */
+ slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
+
+ spin_lock(&twdr->death_lock);
+
+ /* Unlink it, if it was scheduled */
+ if (inet_twsk_del_dead_node(tw))
+ twdr->tw_count--;
+ else
+ atomic_inc(&tw->tw_refcnt);
+
+ if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+ /* Schedule to slow timer */
+ if (timeo >= timewait_len) {
+ slot = INET_TWDR_TWKILL_SLOTS - 1;
+ } else {
+ slot = (timeo + twdr->period - 1) / twdr->period;
+ if (slot >= INET_TWDR_TWKILL_SLOTS)
+ slot = INET_TWDR_TWKILL_SLOTS - 1;
+ }
+ tw->tw_ttd = jiffies + timeo;
+ slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
+ list = &twdr->cells[slot];
+ } else {
+ tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+
+ if (twdr->twcal_hand < 0) {
+ twdr->twcal_hand = 0;
+ twdr->twcal_jiffie = jiffies;
+ twdr->twcal_timer.expires = twdr->twcal_jiffie +
+ (slot << INET_TWDR_RECYCLE_TICK);
+ add_timer(&twdr->twcal_timer);
+ } else {
+ if (time_after(twdr->twcal_timer.expires,
+ jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
+ mod_timer(&twdr->twcal_timer,
+ jiffies + (slot << INET_TWDR_RECYCLE_TICK));
+ slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ }
+ list = &twdr->twcal_row[slot];
+ }
+
+ hlist_add_head(&tw->tw_death_node, list);
+
+ if (twdr->tw_count++ == 0)
+ mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twdr_twcal_tick(unsigned long data)
+{
+ struct inet_timewait_death_row *twdr;
+ int n, slot;
+ unsigned long j;
+ unsigned long now = jiffies;
+ int killed = 0;
+ int adv = 0;
+
+ twdr = (struct inet_timewait_death_row *)data;
+
+ spin_lock(&twdr->death_lock);
+ if (twdr->twcal_hand < 0)
+ goto out;
+
+ slot = twdr->twcal_hand;
+ j = twdr->twcal_jiffie;
+
+ for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
+ if (time_before_eq(j, now)) {
+ struct hlist_node *node, *safe;
+ struct inet_timewait_sock *tw;
+
+ inet_twsk_for_each_inmate_safe(tw, node, safe,
+ &twdr->twcal_row[slot]) {
+ __inet_twsk_del_dead_node(tw);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+ inet_twsk_put(tw);
+ killed++;
+ }
+ } else {
+ if (!adv) {
+ adv = 1;
+ twdr->twcal_jiffie = j;
+ twdr->twcal_hand = slot;
+ }
+
+ if (!hlist_empty(&twdr->twcal_row[slot])) {
+ mod_timer(&twdr->twcal_timer, j);
+ goto out;
+ }
+ }
+ j += 1 << INET_TWDR_RECYCLE_TICK;
+ slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ }
+ twdr->twcal_hand = -1;
+
+out:
+ if ((twdr->tw_count -= killed) == 0)
+ del_timer(&twdr->tw_timer);
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 95473953c406..2fc3fd38924f 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <net/ip.h>
#include <net/inetpeer.h>
/*
@@ -72,7 +73,7 @@
/* Exported for inet_getid inline function. */
DEFINE_SPINLOCK(inet_peer_idlock);
-static kmem_cache_t *peer_cachep;
+static kmem_cache_t *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
static struct inet_peer peer_fake_node = {
@@ -99,8 +100,7 @@ DEFINE_SPINLOCK(inet_peer_unused_lock);
#define PEER_MAX_CLEANUP_WORK 30
static void peer_check_expire(unsigned long dummy);
-static struct timer_list peer_periodic_timer =
- TIMER_INITIALIZER(peer_check_expire, 0, 0);
+static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
/* Exported for sysctl_net_ipv4. */
int inet_peer_gc_mintime = 10 * HZ,
@@ -450,11 +450,12 @@ static void peer_check_expire(unsigned long dummy)
/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
* interval depending on the total number of entries (more entries,
* less interval). */
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ if (peer_total >= inet_peer_threshold)
+ peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
+ else
+ peer_periodic_timer.expires = jiffies
+ + inet_peer_gc_maxtime
+ - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+ peer_total / inet_peer_threshold * HZ;
add_timer(&peer_periodic_timer);
}
-
-EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aac6c28..0923add122b4 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
-
- iph = skb->nh.iph;
-
- if (iph->ttl <= 1)
+ if (skb->nh.iph->ttl <= 1)
goto too_many_hops;
if (!xfrm4_route_forward(skb))
goto drop;
- iph = skb->nh.iph;
rt = (struct rtable*)skb->dst;
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27eb4ea..e7d26d9943c2 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
return ip_frag_intern(hash, qp);
out_nomem:
- NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
return NULL;
}
@@ -457,7 +457,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (pskb_pull(skb, ihl) == NULL)
goto err;
- if (pskb_trim(skb, end-offset))
+ if (pskb_trim_rcsum(skb, end-offset))
goto err;
/* Find out which fragments are in front and at the back of us
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (skb->dev)
qp->iif = skb->dev->ifindex;
skb->dev = NULL;
- qp->stamp = skb->stamp;
+ skb_get_timestamp(skb, &qp->stamp);
qp->meat += skb->len;
atomic_add(skb->truesize, &ip_frag_mem);
if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
head->next = NULL;
head->dev = dev;
- head->stamp = qp->stamp;
+ skb_set_timestamp(head, &qp->stamp);
iph = head->nh.iph;
iph->frag_off = 0;
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
return head;
out_nomem:
- NETDEBUG(if (net_ratelimit())
- printk(KERN_ERR
- "IP: queue_glue: no memory for gluing queue %p\n",
- qp));
+ LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
+ "queue %p\n", qp);
goto out_fail;
out_oversize:
if (net_ratelimit())
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f0d5740d7e22..896ce3f8f53a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1104,10 +1104,10 @@ static int ipgre_open(struct net_device *dev)
return -EADDRNOTAVAIL;
dev = rt->u.dst.dev;
ip_rt_put(rt);
- if (__in_dev_get(dev) == NULL)
+ if (__in_dev_get_rtnl(dev) == NULL)
return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
- ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
+ ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
}
return 0;
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c703528e0bcd..473d0f2b2e0d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
* SNMP management statistics
*/
-DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
/*
* Process Router Attention IP option
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
/* If there maybe a raw socket we must check - if not we
* don't care less
*/
- if (raw_sk)
- raw_v4_input(skb, skb->nh.iph, hash);
+ if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
+ raw_sk = NULL;
if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
int ret;
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb)
ip_local_deliver_finish);
}
-static inline int ip_rcv_finish(struct sk_buff *skb)
+static inline int ip_rcv_options(struct sk_buff *skb)
{
+ struct ip_options *opt;
+ struct iphdr *iph;
struct net_device *dev = skb->dev;
+
+ /* It looks as overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest for now, especially taking
+ into account that combination of IP options
+ and running sniffer is extremely rare condition.
+ --ANK (980813)
+ */
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ iph = skb->nh.iph;
+
+ if (ip_options_compile(NULL, skb)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+
+ opt = &(IPCB(skb)->opt);
+ if (unlikely(opt->srr)) {
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
+ net_ratelimit())
+ printk(KERN_INFO "source route option "
+ "%u.%u.%u.%u -> %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ in_dev_put(in_dev);
+ goto drop;
+ }
+
+ in_dev_put(in_dev);
+ }
+
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+
+ return 0;
+drop:
+ return -1;
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
struct iphdr *iph = skb->nh.iph;
- int err;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (skb->dst == NULL) {
- if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+ if (likely(skb->dst == NULL)) {
+ int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+ skb->dev);
+ if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
goto drop;
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
}
#ifdef CONFIG_NET_CLS_ROUTE
- if (skb->dst->tclassid) {
+ if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
}
#endif
- if (iph->ihl > 5) {
- struct ip_options *opt;
-
- /* It looks as overkill, because not all
- IP options require packet mangling.
- But it is the easiest for now, especially taking
- into account that combination of IP options
- and running sniffer is extremely rare condition.
- --ANK (980813)
- */
-
- if (skb_cow(skb, skb_headroom(skb))) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
- iph = skb->nh.iph;
-
- if (ip_options_compile(NULL, skb))
- goto inhdr_error;
-
- opt = &(IPCB(skb)->opt);
- if (opt->srr) {
- struct in_device *in_dev = in_dev_get(dev);
- if (in_dev) {
- if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
- in_dev_put(in_dev);
- goto drop;
- }
- in_dev_put(in_dev);
- }
- if (ip_options_rcv_srr(skb))
- goto drop;
- }
- }
+ if (iph->ihl > 5 && ip_rcv_options(skb))
+ goto drop;
return dst_input(skb);
-inhdr_error:
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
return NET_RX_DROP;
@@ -358,9 +373,10 @@ drop:
/*
* Main IP Receive routine.
*/
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
+ u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
*/
if (iph->ihl < 5 || iph->version != 4)
- goto inhdr_error;
+ goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = skb->nh.iph;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto inhdr_error;
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto inhdr_error;
- {
- __u32 len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl<<2))
- goto inhdr_error;
+ len = ntohs(iph->tot_len);
+ if (skb->len < len || len < (iph->ihl*4))
+ goto inhdr_error;
- /* Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
- if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
}
return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -428,5 +442,4 @@ out:
return NET_RX_DROP;
}
-EXPORT_SYMBOL(ip_rcv);
EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f3e701..bce4e875193b 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
}
}
-int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+static struct ip_options *ip_options_get_alloc(const int optlen)
{
- struct ip_options *opt;
+ struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
+ GFP_KERNEL);
+ if (opt)
+ memset(opt, 0, sizeof(*opt));
+ return opt;
+}
- opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
- if (!opt)
- return -ENOMEM;
- memset(opt, 0, sizeof(struct ip_options));
- if (optlen) {
- if (user) {
- if (copy_from_user(opt->__data, data, optlen)) {
- kfree(opt);
- return -EFAULT;
- }
- } else
- memcpy(opt->__data, data, optlen);
- }
+static int ip_options_get_finish(struct ip_options **optp,
+ struct ip_options *opt, int optlen)
+{
while (optlen & 3)
opt->__data[optlen++] = IPOPT_END;
opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
return 0;
}
+int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
+{
+ struct ip_options *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen && copy_from_user(opt->__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ return ip_options_get_finish(optp, opt, optlen);
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
+{
+ struct ip_options *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen)
+ memcpy(opt->__data, data, optlen);
+ return ip_options_get_finish(optp, opt, optlen);
+}
+
void ip_forward_options(struct sk_buff *skb)
{
struct ip_options * opt = &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
return 0;
}
-
-EXPORT_SYMBOL(ip_options_compile);
-EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 80d13103b2b0..17758234a3e3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
-#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/checksum.h>
@@ -84,12 +81,8 @@
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <linux/tcp.h>
-/*
- * Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;
/* Generate a checksum for an outgoing IP datagram. */
@@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
dst_output);
}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
@@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
-int ip_finish_output(struct sk_buff *skb)
+static inline int ip_finish_output(struct sk_buff *skb)
{
struct net_device *dev = skb->dst->dev;
@@ -280,7 +275,8 @@ int ip_output(struct sk_buff *skb)
{
IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
- if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
+ if (skb->len > dst_mtu(skb->dst) &&
+ !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
return ip_fragment(skb, ip_finish_output);
else
return ip_finish_output(skb);
@@ -329,8 +325,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
if (ip_route_output_flow(&rt, &fl, sk, 0))
goto no_route;
}
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->u.dst);
}
skb->dst = dst_clone(&rt->u.dst);
@@ -392,12 +387,14 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
#endif
#ifdef CONFIG_NETFILTER
to->nfmark = from->nfmark;
- to->nfcache = from->nfcache;
/* Connection association is same as pre-frag packet */
nf_conntrack_put(to->nfct);
to->nfct = from->nfct;
nf_conntrack_get(to->nfct);
to->nfctinfo = from->nfctinfo;
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ to->ipvs_property = from->ipvs_property;
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(to->nf_bridge);
to->nf_bridge = from->nf_bridge;
@@ -580,7 +577,7 @@ slow_path:
*/
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
- NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
err = -ENOMEM;
goto fail;
}
@@ -692,6 +689,60 @@ csum_page(struct page *page, int offset, int copy)
return csum;
}
+inline int ip_ufo_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int hh_len, int fragheaderlen,
+ int transhdrlen, int mtu,unsigned int flags)
+{
+ struct sk_buff *skb;
+ int err;
+
+ /* There is support for UDP fragmentation offload by network
+ * device, so create one single skb packet containing complete
+ * udp datagram
+ */
+ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+ skb = sock_alloc_send_skb(sk,
+ hh_len + fragheaderlen + transhdrlen + 20,
+ (flags & MSG_DONTWAIT), &err);
+
+ if (skb == NULL)
+ return err;
+
+ /* reserve space for Hardware header */
+ skb_reserve(skb, hh_len);
+
+ /* create space for UDP/IP header */
+ skb_put(skb,fragheaderlen + transhdrlen);
+
+ /* initialize network header pointer */
+ skb->nh.raw = skb->data;
+
+ /* initialize protocol header pointer */
+ skb->h.raw = skb->data + fragheaderlen;
+
+ skb->ip_summed = CHECKSUM_HW;
+ skb->csum = 0;
+ sk->sk_sndmsg_off = 0;
+ }
+
+ err = skb_append_datato_frags(sk,skb, getfrag, from,
+ (length - transhdrlen));
+ if (!err) {
+ /* specify the length of each IP datagram fragment*/
+ skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+
+ return 0;
+ }
+ /* There is not enough support do UFO ,
+ * so follow normal path
+ */
+ kfree_skb(skb);
+ return err;
+}
+
/*
* ip_append_data() and ip_append_page() can make one large IP datagram
* from many pieces of data. Each pieces will be holded on the socket
@@ -781,6 +832,15 @@ int ip_append_data(struct sock *sk,
csummode = CHECKSUM_HW;
inet->cork.length += length;
+ if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
+ (rt->u.dst.dev->features & NETIF_F_UFO)) {
+
+ if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
+ fragheaderlen, transhdrlen, mtu, flags))
+ goto error;
+
+ return 0;
+ }
/* So, what's going on in the loop below?
*
@@ -1012,14 +1072,23 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
return -EINVAL;
inet->cork.length += size;
+ if ((sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->u.dst.dev->features & NETIF_F_UFO))
+ skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+
while (size > 0) {
int i;
- /* Check if the remaining data fits into current packet. */
- len = mtu - skb->len;
- if (len < size)
- len = maxfraglen - skb->len;
+ if (skb_shinfo(skb)->ufo_size)
+ len = size;
+ else {
+
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
+ }
if (len <= 0) {
struct sk_buff *skb_prev;
char *data;
@@ -1027,10 +1096,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
int alloclen;
skb_prev = skb;
- if (skb_prev)
- fraggap = skb_prev->len - maxfraglen;
- else
- fraggap = 0;
+ fraggap = skb_prev->len - maxfraglen;
alloclen = fragheaderlen + hh_len + fraggap + 15;
skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
@@ -1329,12 +1395,7 @@ void __init ip_init(void)
#endif
}
-EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);
-
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc7c481d0d79..2f0b47da5b37 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
- err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
if (err)
return err;
break;
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
struct ip_options * opt = NULL;
if (optlen > 40 || optlen < 0)
goto e_inval;
- err = ip_options_get(&opt, optval, optlen, 1);
+ err = ip_options_get_from_user(&opt, optval, optlen);
if (err)
break;
if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
}
case IP_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_igmp_max_msf;
struct ip_msfilter *msf;
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
}
case MCAST_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_igmp_max_msf;
struct sockaddr_in *psin;
struct ip_msfilter *msf = NULL;
@@ -848,6 +846,9 @@ mc_msf_out:
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
@@ -1087,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
EXPORT_SYMBOL(ip_cmsg_recv);
-#ifdef CONFIG_IP_SCTP_MODULE
EXPORT_SYMBOL(ip_getsockopt);
EXPORT_SYMBOL(ip_setsockopt);
-#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2065944fd9e5..fc718df17b40 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
spi, IPPROTO_COMP, AF_INET);
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
- spi, NIPQUAD(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+ spi, NIPQUAD(iph->daddr));
xfrm_state_put(x);
}
@@ -345,8 +345,7 @@ static void ipcomp_free_tfms(struct crypto_tfm **tfms)
for_each_cpu(cpu) {
struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
}
free_percpu(tfms);
}
@@ -358,7 +357,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
int cpu;
/* This can be any valid CPU ID so we don't need locking. */
- cpu = smp_processor_id();
+ cpu = raw_smp_processor_id();
list_for_each_entry(pos, &ipcomp_tfms_list, list) {
struct crypto_tfm *tfm;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d2bf8e1930a3..e8674baaa8d9 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,6 +54,7 @@
#include <linux/major.h>
#include <linux/root_dev.h>
#include <linux/delay.h>
+#include <linux/nfs_fs.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
@@ -393,7 +394,7 @@ static int __init ic_defaults(void)
#ifdef IPCONFIG_RARP
-static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type rarp_packet_type __initdata = {
.type = __constant_htons(ETH_P_RARP),
@@ -414,7 +415,7 @@ static inline void ic_rarp_cleanup(void)
* Process received RARP packet.
*/
static int __init
-ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct arphdr *rarp;
unsigned char *rarp_ptr;
@@ -555,7 +556,7 @@ struct bootp_pkt { /* BOOTP packet format */
#define DHCPRELEASE 7
#define DHCPINFORM 8
-static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type bootp_packet_type __initdata = {
.type = __constant_htons(ETH_P_IP),
@@ -823,7 +824,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
/*
* Receive BOOTP reply.
*/
-static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct bootp_pkt *b;
struct iphdr *h;
@@ -1102,10 +1103,8 @@ static int __init ic_dynamic(void)
#endif
jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
- while (time_before(jiffies, jiff) && !ic_got_reply) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
- }
+ while (time_before(jiffies, jiff) && !ic_got_reply)
+ schedule_timeout_uninterruptible(1);
#ifdef IPCONFIG_DHCP
/* DHCP isn't done until we get a DHCPACK. */
if ((ic_got_reply & IC_BOOTP)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index dc806b578427..302b7eb507c9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
In this case data path is free of exclusive locks at all.
*/
-static kmem_cache_t *mrt_cachep;
+static kmem_cache_t *mrt_cachep __read_mostly;
static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
@@ -149,7 +149,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
dev->flags |= IFF_MULTICAST;
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rtnl(dev);
if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
goto failure;
in_dev->cnf.rp_filter = 0;
@@ -278,7 +278,7 @@ static int vif_delete(int vifi)
dev_set_allmulti(dev, -1);
- if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
in_dev->cnf.mc_forwarding--;
ip_rt_multicast_event(in_dev);
}
@@ -421,7 +421,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
return -EINVAL;
}
- if ((in_dev = __in_dev_get(dev)) == NULL)
+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
return -EADDRNOTAVAIL;
in_dev->cnf.mc_forwarding++;
dev_set_allmulti(dev, +1);
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212addd193..fc6f95aaa969 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <net/protocol.h>
+#include <net/tcp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
@@ -603,7 +604,7 @@ static struct file_operations ip_vs_app_fops = {
/*
* Replace a segment of data with a new segment
*/
-int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
char *o_buf, int o_len, char *n_buf, int n_len)
{
struct iphdr *iph;
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d0145a8b1551..f828fa2eb7de 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
static struct list_head *ip_vs_conn_tab;
/* SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep;
+static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
@@ -196,6 +196,7 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
if (s_addr==cp->caddr && s_port==cp->cport &&
d_port==cp->vport && d_addr==cp->vaddr &&
+ ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
protocol==cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
@@ -227,6 +228,40 @@ struct ip_vs_conn *ip_vs_conn_in_get
return cp;
}
+/* Get reference to connection template */
+struct ip_vs_conn *ip_vs_ct_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+
+ ct_read_lock(hash);
+
+ list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (s_addr==cp->caddr && s_port==cp->cport &&
+ d_port==cp->vport && d_addr==cp->vaddr &&
+ cp->flags & IP_VS_CONN_F_TEMPLATE &&
+ protocol==cp->protocol) {
+ /* HIT */
+ atomic_inc(&cp->refcnt);
+ goto out;
+ }
+ }
+ cp = NULL;
+
+ out:
+ ct_read_unlock(hash);
+
+ IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ ip_vs_proto_name(protocol),
+ NIPQUAD(s_addr), ntohs(s_port),
+ NIPQUAD(d_addr), ntohs(d_port),
+ cp?"hit":"not hit");
+
+ return cp;
+}
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
@@ -367,7 +402,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
/* It is a normal connection, so increase the inactive
connection counter because it is in TCP SYNRECV
state (inactive) or other protocol inacive state */
@@ -406,7 +441,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
/* It is a normal connection, so decrease the inactconns
or activeconns counter */
if (cp->flags & IP_VS_CONN_F_INACTIVE) {
@@ -467,7 +502,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
/*
* Invalidate the connection template
*/
- if (ct->cport) {
+ if (ct->vport != 65535) {
if (ip_vs_conn_unhash(ct)) {
ct->dport = 65535;
ct->vport = 65535;
@@ -776,7 +811,7 @@ void ip_vs_random_dropentry(void)
ct_write_lock_bh(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257dd07cb..981cc3244ef2 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
*
* Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
+ * Harald Welte don't use nfcache
*
*/
@@ -242,10 +243,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
if (ports[1] == svc->port) {
/* Check if a template already exists */
if (svc->port != FTPPORT)
- ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
iph->daddr, ports[1]);
else
- ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
if (!ct || !ip_vs_check_template(ct)) {
@@ -271,14 +272,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
iph->daddr,
ports[1],
dest->addr, dest->port,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
else
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, 0,
dest->addr, 0,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
if (ct == NULL)
return NULL;
@@ -297,10 +298,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
*/
if (svc->fwmark)
- ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
+ ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
htonl(svc->fwmark), 0);
else
- ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
if (!ct || !ip_vs_check_template(ct)) {
@@ -325,14 +326,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
snet, 0,
htonl(svc->fwmark), 0,
dest->addr, 0,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
else
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, 0,
dest->addr, 0,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
if (ct == NULL)
return NULL;
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+ if (!((*pskb)->ipvs_property))
return NF_ACCEPT;
/* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
- skb->nfcache |= NFC_IPVS_PROPERTY;
+ skb->ipvs_property = 1;
verdict = NF_ACCEPT;
out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
EnterFunction(11);
- if (skb->nfcache & NFC_IPVS_PROPERTY)
+ if (skb->ipvs_property)
return NF_ACCEPT;
iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
ip_vs_conn_put(cp);
- skb->nfcache |= NFC_IPVS_PROPERTY;
+ skb->ipvs_property = 1;
LeaveFunction(11);
return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 7d99ede2ef79..2d66848e7aa0 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table,
+ .child = ipvs_ipv4_table,
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838b780a..561cda326fa8 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table
+ .child = ipvs_ipv4_table
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd55d271..ce456dbf09a5 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table
+ .child = ipvs_ipv4_table
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de675da74..c19408973c09 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
}
-static void tcp_init(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(tcp_apps);
pp->timeout_table = tcp_timeouts;
}
-static void tcp_exit(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
}
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.dont_defrag = 0,
.appcnt = ATOMIC_INIT(0),
- .init = tcp_init,
- .exit = tcp_exit,
+ .init = ip_vs_tcp_init,
+ .exit = ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 574d1f509b46..2e5ced3d8062 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -297,16 +297,24 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
for (i=0; i<m->nr_conns; i++) {
+ unsigned flags;
+
s = (struct ip_vs_sync_conn *)p;
- cp = ip_vs_conn_in_get(s->protocol,
- s->caddr, s->cport,
- s->vaddr, s->vport);
+ flags = ntohs(s->flags);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport);
+ else
+ cp = ip_vs_ct_in_get(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport);
if (!cp) {
cp = ip_vs_conn_new(s->protocol,
s->caddr, s->cport,
s->vaddr, s->vport,
s->daddr, s->dport,
- ntohs(s->flags), NULL);
+ flags, NULL);
if (!cp) {
IP_VS_ERR("ip_vs_conn_new failed\n");
return;
@@ -315,11 +323,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
} else if (!cp->dest) {
/* it is an entry created by the synchronization */
cp->state = ntohs(s->state);
- cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
+ cp->flags = flags | IP_VS_CONN_F_HASHED;
} /* Note that we don't touch its state and flags
if it is a normal entry. */
- if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
+ if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
memcpy(&cp->in_seq, opt, sizeof(*opt));
p += FULL_CONN_SIZE;
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index a8512a3fd08a..3b87482049cf 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
#define IP_VS_XMIT(skb, rt) \
do { \
- (skb)->nfcache |= NFC_IPVS_PROPERTY; \
+ (skb)->ipvs_property = 1; \
(skb)->ip_summed = CHECKSUM_NONE; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
(rt)->u.dst.dev, dst_output); \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index c9cf8726051d..db67373f9b34 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
return NOTIFY_DONE;
}
-struct notifier_block drr_dev_notifier = {
+static struct notifier_block drr_dev_notifier = {
.notifier_call = drr_dev_event,
};
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 000000000000..ae0779d82c5d
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
+/* IPv4 specific functions of netfilter core */
+
+#include <linux/config.h>
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff **pskb)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct rtable *rt;
+ struct flowi fl = {};
+ struct dst_entry *odst;
+ unsigned int hh_len;
+
+ /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+ * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+ */
+ if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
+ fl.nl_u.ip4_u.daddr = iph->daddr;
+ fl.nl_u.ip4_u.saddr = iph->saddr;
+ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+ fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
+#endif
+ fl.proto = iph->protocol;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ /* Drop old route. */
+ dst_release((*pskb)->dst);
+ (*pskb)->dst = &rt->u.dst;
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+ fl.nl_u.ip4_u.daddr = iph->saddr;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ odst = (*pskb)->dst;
+ if (ip_route_input(*pskb, iph->daddr, iph->saddr,
+ RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+ dst_release(&rt->u.dst);
+ return -1;
+ }
+ dst_release(&rt->u.dst);
+ dst_release(odst);
+ }
+
+ if ((*pskb)->dst->error)
+ return -1;
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = (*pskb)->dst->dev->hard_header_len;
+ if (skb_headroom(*pskb) < hh_len) {
+ struct sk_buff *nskb;
+
+ nskb = skb_realloc_headroom(*pskb, hh_len);
+ if (!nskb)
+ return -1;
+ if ((*pskb)->sk)
+ skb_set_owner_w(nskb, (*pskb)->sk);
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {
+ u_int32_t daddr;
+ u_int32_t saddr;
+ u_int8_t tos;
+};
+
+static void queue_save(const struct sk_buff *skb, struct nf_info *info)
+{
+ struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP_LOCAL_OUT) {
+ const struct iphdr *iph = skb->nh.iph;
+
+ rt_info->tos = iph->tos;
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ }
+}
+
+static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+ const struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP_LOCAL_OUT) {
+ struct iphdr *iph = (*pskb)->nh.iph;
+
+ if (!(iph->tos == rt_info->tos
+ && iph->daddr == rt_info->daddr
+ && iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(pskb);
+ }
+ return 0;
+}
+
+static struct nf_queue_rerouter ip_reroute = {
+ .rer_size = sizeof(struct ip_rt_info),
+ .save = queue_save,
+ .reroute = queue_reroute,
+};
+
+static int init(void)
+{
+ return nf_register_queue_rerouter(PF_INET, &ip_reroute);
+}
+
+static void fini(void)
+{
+ nf_unregister_queue_rerouter(PF_INET);
+}
+
+module_init(init);
+module_exit(fini);
+
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1c06f0..7d917e4ce1d9 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -34,12 +34,31 @@ config IP_NF_CT_ACCT
config IP_NF_CONNTRACK_MARK
bool 'Connection mark tracking support'
+ depends on IP_NF_CONNTRACK
help
This option enables support for connection marks, used by the
`CONNMARK' target and `connmark' match. Similar to the mark value
of packets, but this mark value is kept in the conntrack session
instead of the individual packets.
+config IP_NF_CONNTRACK_EVENTS
+ bool "Connection tracking events"
+ depends on IP_NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ provide a notifier chain that can be used by other kernel code
+ to get notified about changes in the connection tracking state.
+
+ IF unsure, say `N'.
+
+config IP_NF_CONNTRACK_NETLINK
+ tristate 'Connection tracking netlink interface'
+ depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+ depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
+ help
+ This option enables support for a netlink-based userspace interface
+
+
config IP_NF_CT_PROTO_SCTP
tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -75,6 +94,25 @@ config IP_NF_IRC
To compile it as a module, choose M here. If unsure, say Y.
+config IP_NF_NETBIOS_NS
+ tristate "NetBIOS name service protocol support (EXPERIMENTAL)"
+ depends on IP_NF_CONNTRACK && EXPERIMENTAL
+ help
+ NetBIOS name service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating NetBIOS name service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address. When properly configured, the output
+ of "ip address show" should look similar to this:
+
+ $ ip -4 address show eth0
+ 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+ inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_NF_TFTP
tristate "TFTP protocol support"
depends on IP_NF_CONNTRACK
@@ -99,12 +137,33 @@ config IP_NF_AMANDA
To compile it as a module, choose M here. If unsure, say Y.
+config IP_NF_PPTP
+ tristate 'PPTP protocol support'
+ depends on IP_NF_CONNTRACK
+ help
+ This module adds support for PPTP (Point to Point Tunnelling
+ Protocol, RFC2637) connection tracking and NAT.
+
+ If you are running PPTP sessions over a stateful firewall or NAT
+ box, you may want to enable this feature.
+
+ Please note that not all PPTP modes of operation are supported yet.
+ For more info, read top of the file
+ net/ipv4/netfilter/ip_conntrack_pptp.c
+
+ If you want to compile it as a module, say M here and read
+ Documentation/modules.txt. If unsure, say `N'.
+
config IP_NF_QUEUE
- tristate "Userspace queueing via NETLINK"
+ tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
help
Netfilter has the ability to queue packets to user space: the
netlink device can be used to access them using this driver.
+ This option enables the old IPv4-only "ip_queue" implementation
+ which has been obsoleted by the new "nfnetlink_queue" code (see
+ CONFIG_NETFILTER_NETLINK_QUEUE).
+
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_IPTABLES
@@ -340,6 +399,17 @@ config IP_NF_MATCH_SCTP
If you want to compile it as a module, say M here and read
<file:Documentation/modules.txt>. If unsure, say `N'.
+config IP_NF_MATCH_DCCP
+ tristate 'DCCP protocol match support'
+ depends on IP_NF_IPTABLES
+ help
+ With this option enabled, you will be able to use the iptables
+ `dccp' match in order to match on DCCP source/destination ports
+ and DCCP flags.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
config IP_NF_MATCH_COMMENT
tristate 'comment match support'
depends on IP_NF_IPTABLES
@@ -361,6 +431,16 @@ config IP_NF_MATCH_CONNMARK
<file:Documentation/modules.txt>. The module will be called
ipt_connmark.o. If unsure, say `N'.
+config IP_NF_MATCH_CONNBYTES
+ tristate 'Connection byte/packet counter match support'
+ depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
+ help
+ This option adds a `connbytes' match, which allows you to match the
+ number of bytes and/or packets for each direction within a connection.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
config IP_NF_MATCH_HASHLIMIT
tristate 'hashlimit match support'
depends on IP_NF_IPTABLES
@@ -375,6 +455,19 @@ config IP_NF_MATCH_HASHLIMIT
destination IP' or `500pps from any given source IP' with a single
IPtables rule.
+config IP_NF_MATCH_STRING
+ tristate 'string match support'
+ depends on IP_NF_IPTABLES
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ help
+ This option adds a `string' match, which allows you to look for
+ pattern matchings in packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# `filter', generic and specific targets
config IP_NF_FILTER
tristate "Packet filtering"
@@ -406,9 +499,14 @@ config IP_NF_TARGET_LOG
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ULOG
- tristate "ULOG target support"
+ tristate "ULOG target support (OBSOLETE)"
depends on IP_NF_IPTABLES
---help---
+
+ This option enables the old IPv4-only "ipt_ULOG" implementation
+ which has been obsoleted by the new "nfnetlink_log" code (see
+ CONFIG_NETFILTER_NETLINK_LOG).
+
This option adds a `ULOG' target, which allows you to create rules in
any iptables table. The packet is passed to a userspace logging
daemon using netlink multicast sockets; unlike the LOG target
@@ -445,6 +543,17 @@ config IP_NF_TARGET_TCPMSS
To compile it as a module, choose M here. If unsure, say N.
+config IP_NF_TARGET_NFQUEUE
+ tristate "NFQUEUE Target Support"
+ depends on IP_NF_IPTABLES
+ help
+ This Target replaced the old obsolete QUEUE target.
+
+ As opposed to QUEUE, it supports 65535 different queues,
+ not just one.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# NAT + specific targets
config IP_NF_NAT
tristate "Full NAT"
@@ -545,6 +654,12 @@ config IP_NF_NAT_AMANDA
default IP_NF_NAT if IP_NF_AMANDA=y
default m if IP_NF_AMANDA=m
+config IP_NF_NAT_PPTP
+ tristate
+ depends on IP_NF_NAT!=n && IP_NF_PPTP!=n
+ default IP_NF_NAT if IP_NF_PPTP=y
+ default m if IP_NF_PPTP=m
+
# mangle + specific targets
config IP_NF_MANGLE
tristate "Packet mangling"
@@ -616,6 +731,20 @@ config IP_NF_TARGET_CLASSIFY
To compile it as a module, choose M here. If unsure, say N.
+config IP_NF_TARGET_TTL
+ tristate 'TTL target support'
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `TTL' target, which enables the user to modify
+ the TTL value of the IP header.
+
+ While it is safe to decrement/lower the TTL, this target also enables
+ functionality to increment and set the TTL value of the IP header to
+ arbitrary values. This is EXTREMELY DANGEROUS since you can easily
+ create immortal packets that loop forever on the network.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_NF_TARGET_CONNMARK
tristate 'CONNMARK target support'
depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5924dd..dab4b58dd31e 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -4,21 +4,32 @@
# objects for the standalone - connection tracking / NAT
ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
-iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+ip_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+iptable_nat-objs := ip_nat_rule.o ip_nat_standalone.o
+
+ip_conntrack_pptp-objs := ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
+ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o
# connection tracking
obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+# conntrack netlink interface
+obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+
+
# SCTP protocol connection tracking
obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
# connection tracking helpers
+obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
# NAT helpers
+obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
@@ -30,7 +41,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
# matches
@@ -38,6 +49,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +66,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
# targets
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +92,8 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
+obj-$(CONFIG_IP_NF_TARGET_NFQUEUE) += ipt_NFQUEUE.o
# generic ARP tables
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fa1634256680..3c2e9639bba6 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -347,58 +347,106 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
return verdict;
}
-static inline void *find_inlist_lock_noload(struct list_head *head,
- const char *name,
- int *error,
- struct semaphore *mutex)
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+static inline struct arpt_table *find_table_lock(const char *name)
{
- void *ret;
+ struct arpt_table *t;
- *error = down_interruptible(mutex);
- if (*error != 0)
- return NULL;
+ if (down_interruptible(&arpt_mutex) != 0)
+ return ERR_PTR(-EINTR);
- ret = list_named_find(head, name);
- if (!ret) {
- *error = -ENOENT;
- up(mutex);
- }
- return ret;
+ list_for_each_entry(t, &arpt_tables, list)
+ if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+ return t;
+ up(&arpt_mutex);
+ return NULL;
}
-#ifndef CONFIG_KMOD
-#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
-#else
-static void *
-find_inlist_lock(struct list_head *head,
- const char *name,
- const char *prefix,
- int *error,
- struct semaphore *mutex)
+
+/* Find target, grabs ref. Returns ERR_PTR() on error. */
+static inline struct arpt_target *find_target(const char *name, u8 revision)
{
- void *ret;
+ struct arpt_target *t;
+ int err = 0;
- ret = find_inlist_lock_noload(head, name, error, mutex);
- if (!ret) {
- duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
- request_module("%s%s", prefix, name);
- ret = find_inlist_lock_noload(head, name, error, mutex);
+ if (down_interruptible(&arpt_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(t, &arpt_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision == revision) {
+ if (try_module_get(t->me)) {
+ up(&arpt_mutex);
+ return t;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
}
+ up(&arpt_mutex);
+ return ERR_PTR(err);
+}
- return ret;
+struct arpt_target *arpt_find_target(const char *name, u8 revision)
+{
+ struct arpt_target *target;
+
+ target = try_then_request_module(find_target(name, revision),
+ "arpt_%s", name);
+ if (IS_ERR(target) || !target)
+ return NULL;
+ return target;
}
-#endif
-static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex)
+static int target_revfn(const char *name, u8 revision, int *bestp)
{
- return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
+ struct arpt_target *t;
+ int have_rev = 0;
+
+ list_for_each_entry(t, &arpt_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision > *bestp)
+ *bestp = t->revision;
+ if (t->revision == revision)
+ have_rev =1;
+ }
+ }
+ return have_rev;
}
-static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex)
+/* Returns true or false (if no such extension at all) */
+static inline int find_revision(const char *name, u8 revision,
+ int (*revfn)(const char *, u8, int *),
+ int *err)
{
- return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
+ int have_rev, best = -1;
+
+ if (down_interruptible(&arpt_mutex) != 0) {
+ *err = -EINTR;
+ return 1;
+ }
+ have_rev = revfn(name, revision, &best);
+ up(&arpt_mutex);
+
+ /* Nothing at all? Return 0 to try loading module. */
+ if (best == -1) {
+ *err = -ENOENT;
+ return 0;
+ }
+
+ *err = best;
+ if (!have_rev)
+ *err = -EPROTONOSUPPORT;
+ return 1;
}
+
/* All zeroes == unconditional rule. */
static inline int unconditional(const struct arpt_arp *arp)
{
@@ -544,17 +592,15 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
}
t = arpt_get_target(e);
- target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex);
- if (!target) {
+ target = try_then_request_module(find_target(t->u.user.name,
+ t->u.user.revision),
+ "arpt_%s", t->u.user.name);
+ if (IS_ERR(target) || !target) {
duprintf("check_entry: `%s' not found\n", t->u.user.name);
+ ret = target ? PTR_ERR(target) : -ENOENT;
goto out;
}
- if (!try_module_get((target->me))) {
- ret = -ENOENT;
- goto out_unlock;
- }
t->u.kernel.target = target;
- up(&arpt_mutex);
if (t->u.kernel.target == &arpt_standard_target) {
if (!standard_check(t, size)) {
@@ -576,8 +622,6 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
(*i)++;
return 0;
-out_unlock:
- up(&arpt_mutex);
out:
return ret;
}
@@ -716,8 +760,10 @@ static int translate_table(const char *name,
}
/* And one copy for every other CPU */
- for (i = 1; i < num_possible_cpus(); i++) {
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ for_each_cpu(i) {
+ if (i == 0)
+ continue;
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
newinfo->entries,
SMP_ALIGN(newinfo->size));
}
@@ -767,7 +813,7 @@ static void get_counters(const struct arpt_table_info *t,
unsigned int cpu;
unsigned int i;
- for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
i = 0;
ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
t->size,
@@ -844,8 +890,8 @@ static int get_entries(const struct arpt_get_entries *entries,
int ret;
struct arpt_table *t;
- t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex);
- if (t) {
+ t = find_table_lock(entries->name);
+ if (t || !IS_ERR(t)) {
duprintf("t->private->number = %u\n",
t->private->number);
if (entries->size == t->private->size)
@@ -857,10 +903,10 @@ static int get_entries(const struct arpt_get_entries *entries,
entries->size);
ret = -EINVAL;
}
+ module_put(t->me);
up(&arpt_mutex);
} else
- duprintf("get_entries: Can't find %s!\n",
- entries->name);
+ ret = t ? PTR_ERR(t) : -ENOENT;
return ret;
}
@@ -885,7 +931,8 @@ static int do_replace(void __user *user, unsigned int len)
return -ENOMEM;
newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ + SMP_ALIGN(tmp.size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
@@ -910,22 +957,19 @@ static int do_replace(void __user *user, unsigned int len)
duprintf("arp_tables: Translated table\n");
- t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
- if (!t)
+ t = try_then_request_module(find_table_lock(tmp.name),
+ "arptable_%s", tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
+ }
/* You lied! */
if (tmp.valid_hooks != t->valid_hooks) {
duprintf("Valid hook crap: %08X vs %08X\n",
tmp.valid_hooks, t->valid_hooks);
ret = -EINVAL;
- goto free_newinfo_counters_untrans_unlock;
- }
-
- /* Get a reference in advance, we're not allowed fail later */
- if (!try_module_get(t->me)) {
- ret = -EBUSY;
- goto free_newinfo_counters_untrans_unlock;
+ goto put_module;
}
oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
@@ -956,7 +1000,6 @@ static int do_replace(void __user *user, unsigned int len)
put_module:
module_put(t->me);
- free_newinfo_counters_untrans_unlock:
up(&arpt_mutex);
free_newinfo_counters_untrans:
ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
@@ -986,7 +1029,7 @@ static int do_add_counters(void __user *user, unsigned int len)
unsigned int i;
struct arpt_counters_info tmp, *paddc;
struct arpt_table *t;
- int ret;
+ int ret = 0;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1003,9 +1046,11 @@ static int do_add_counters(void __user *user, unsigned int len)
goto free;
}
- t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
- if (!t)
+ t = find_table_lock(tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
+ }
write_lock_bh(&t->lock);
if (t->private->number != paddc->num_counters) {
@@ -1022,6 +1067,7 @@ static int do_add_counters(void __user *user, unsigned int len)
unlock_up_free:
write_unlock_bh(&t->lock);
up(&arpt_mutex);
+ module_put(t->me);
free:
vfree(paddc);
@@ -1076,8 +1122,10 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
break;
}
name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
- t = arpt_find_table_lock(name, &ret, &arpt_mutex);
- if (t) {
+
+ t = try_then_request_module(find_table_lock(name),
+ "arptable_%s", name);
+ if (t && !IS_ERR(t)) {
struct arpt_getinfo info;
info.valid_hooks = t->valid_hooks;
@@ -1093,9 +1141,10 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
ret = -EFAULT;
else
ret = 0;
-
up(&arpt_mutex);
- }
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
}
break;
@@ -1116,6 +1165,24 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
break;
}
+ case ARPT_SO_GET_REVISION_TARGET: {
+ struct arpt_get_revision rev;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+
+ try_then_request_module(find_revision(rev.name, rev.revision,
+ target_revfn, &ret),
+ "arpt_%s", rev.name);
+ break;
+ }
+
default:
duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
@@ -1133,12 +1200,9 @@ int arpt_register_target(struct arpt_target *target)
if (ret != 0)
return ret;
- if (!list_named_insert(&arpt_target, target)) {
- duprintf("arpt_register_target: `%s' already in list!\n",
- target->name);
- ret = -EINVAL;
- }
+ list_add(&target->list, &arpt_target);
up(&arpt_mutex);
+
return ret;
}
@@ -1158,7 +1222,8 @@ int arpt_register_table(struct arpt_table *table,
= { 0, 0, 0, { 0 }, { 0 }, { } };
newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(repl->size) * num_possible_cpus());
+ + SMP_ALIGN(repl->size) *
+ (highest_possible_processor_id()+1));
if (!newinfo) {
ret = -ENOMEM;
return ret;
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 01e1b58322a9..fa3f914117ec 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
static char *conns[] = { "DATA ", "MESG ", "INDEX " };
/* This is slow, but it's simple. --RR */
-static char amanda_buffer[65536];
+static char *amanda_buffer;
static DEFINE_SPINLOCK(amanda_buffer_lock);
unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
@@ -65,7 +65,7 @@ static int help(struct sk_buff **pskb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
+ ip_ct_refresh(ct, *pskb, master_timeout * HZ);
/* No data? */
dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
@@ -108,6 +108,7 @@ static int help(struct sk_buff **pskb,
}
exp->expectfn = NULL;
+ exp->flags = 0;
exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
exp->tuple.src.u.tcp.port = 0;
@@ -153,11 +154,25 @@ static struct ip_conntrack_helper amanda_helper = {
static void __exit fini(void)
{
ip_conntrack_helper_unregister(&amanda_helper);
+ kfree(amanda_buffer);
}
static int __init init(void)
{
- return ip_conntrack_helper_register(&amanda_helper);
+ int ret;
+
+ amanda_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!amanda_buffer)
+ return -ENOMEM;
+
+ ret = ip_conntrack_helper_register(&amanda_helper);
+ if (ret < 0) {
+ kfree(amanda_buffer);
+ return ret;
+ }
+ return 0;
+
+
}
module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a7f0c821a9b2..422ab68ee7fb 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
+#include <linux/notifier.h>
/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
registrations, conntrack timers*/
@@ -49,7 +50,7 @@
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>
-#define IP_CONNTRACK_VERSION "2.1"
+#define IP_CONNTRACK_VERSION "2.4"
#if 0
#define DEBUGP printk
@@ -69,35 +70,98 @@ static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
+{
+ DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+ if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+ notifier_call_chain(&ip_conntrack_chain, ecache->events,
+ ecache->ct);
+ ecache->events = 0;
+ ip_conntrack_put(ecache->ct);
+ ecache->ct = NULL;
+}
+
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
+{
+ struct ip_conntrack_ecache *ecache;
+
+ local_bh_disable();
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ if (ecache->ct == ct)
+ __ip_ct_deliver_cached_events(ecache);
+ local_bh_enable();
+}
-void
-ip_conntrack_put(struct ip_conntrack *ct)
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
- IP_NF_ASSERT(ct);
- nf_conntrack_put(&ct->ct_general);
+ struct ip_conntrack_ecache *ecache;
+
+ /* take care of delivering potentially old events */
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ BUG_ON(ecache->ct == ct);
+ if (ecache->ct)
+ __ip_ct_deliver_cached_events(ecache);
+ /* initialize for this conntrack/packet */
+ ecache->ct = ct;
+ nf_conntrack_get(&ct->ct_general);
+}
+
+/* flush the event cache - touches other CPU's data and must not be called while
+ * packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
+{
+ struct ip_conntrack_ecache *ecache;
+ int cpu;
+
+ for_each_cpu(cpu) {
+ ecache = &per_cpu(ip_conntrack_ecache, cpu);
+ if (ecache->ct)
+ ip_conntrack_put(ecache->ct);
+ }
}
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+ unsigned int size, unsigned int rnd)
{
-#if 0
- dump_tuple(tuple);
-#endif
return (jhash_3words(tuple->src.ip,
(tuple->dst.ip ^ tuple->dst.protonum),
(tuple->src.u.all | (tuple->dst.u.all << 16)),
- ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+ rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+ return __hash_conntrack(tuple, ip_conntrack_htable_size,
+ ip_conntrack_hash_rnd);
}
int
@@ -137,13 +201,14 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
/* ip_conntrack_expect helper functions */
-static void unlink_expect(struct ip_conntrack_expect *exp)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
ASSERT_WRITE_LOCK(&ip_conntrack_lock);
IP_NF_ASSERT(!timer_pending(&exp->timeout));
list_del(&exp->list);
CONNTRACK_STAT_INC(expect_delete);
exp->master->expecting--;
+ ip_conntrack_expect_put(exp);
}
static void expectation_timed_out(unsigned long ul_expect)
@@ -151,11 +216,38 @@ static void expectation_timed_out(unsigned long ul_expect)
struct ip_conntrack_expect *exp = (void *)ul_expect;
write_lock_bh(&ip_conntrack_lock);
- unlink_expect(exp);
+ ip_ct_unlink_expect(exp);
write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_expect_put(exp);
}
+struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+ atomic_inc(&i->use);
+ return i;
+ }
+ }
+ return NULL;
+}
+
+/* Just find a expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ read_lock_bh(&ip_conntrack_lock);
+ i = __ip_conntrack_expect_find(tuple);
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return i;
+}
+
/* If an expectation for this connection is found, it gets delete from
* global list then returned. */
static struct ip_conntrack_expect *
@@ -170,17 +262,21 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
master ct never got confirmed, we'd hold a reference to it
and weird things would happen to future packets). */
if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
- && is_confirmed(i->master)
- && del_timer(&i->timeout)) {
- unlink_expect(i);
- return i;
+ && is_confirmed(i->master)) {
+ if (i->flags & IP_CT_EXPECT_PERMANENT) {
+ atomic_inc(&i->use);
+ return i;
+ } else if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ return i;
+ }
}
}
return NULL;
}
/* delete all expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct)
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
struct ip_conntrack_expect *i, *tmp;
@@ -190,7 +286,7 @@ static void remove_expectations(struct ip_conntrack *ct)
list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
if (i->master == ct && del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
ip_conntrack_expect_put(i);
}
}
@@ -210,7 +306,7 @@ clean_from_lists(struct ip_conntrack *ct)
LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
/* Destroy all pending expectations */
- remove_expectations(ct);
+ ip_ct_remove_expectations(ct);
}
static void
@@ -223,10 +319,13 @@ destroy_conntrack(struct nf_conntrack *nfct)
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
+ ip_conntrack_event(IPCT_DESTROY, ct);
+ set_bit(IPS_DYING_BIT, &ct->status);
+
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to ip_conntrack_lock!!! -HW */
- proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
if (proto && proto->destroy)
proto->destroy(ct);
@@ -238,7 +337,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
* too. */
- remove_expectations(ct);
+ ip_ct_remove_expectations(ct);
/* We overload first tuple to link into unconfirmed list. */
if (!is_confirmed(ct)) {
@@ -253,8 +352,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
ip_conntrack_put(ct->master);
DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
- kmem_cache_free(ip_conntrack_cachep, ct);
- atomic_dec(&ip_conntrack_count);
+ ip_conntrack_free(ct);
}
static void death_by_timeout(unsigned long ul_conntrack)
@@ -280,7 +378,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
&& ip_ct_tuple_equal(tuple, &i->tuple);
}
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *ignored_conntrack)
{
@@ -315,6 +413,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
return h;
}
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+ unsigned int hash,
+ unsigned int repl_hash)
+{
+ ct->id = ++ip_conntrack_next_id;
+ list_prepend(&ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
+{
+ unsigned int hash, repl_hash;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ write_lock_bh(&ip_conntrack_lock);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
+ write_unlock_bh(&ip_conntrack_lock);
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
@@ -361,10 +482,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
/* Remove from unconfirmed list */
list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- list_prepend(&ip_conntrack_hash[hash],
- &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
- list_prepend(&ip_conntrack_hash[repl_hash],
- &ct->tuplehash[IP_CT_DIR_REPLY]);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
@@ -374,6 +492,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
set_bit(IPS_CONFIRMED_BIT, &ct->status);
CONNTRACK_STAT_INC(insert);
write_unlock_bh(&ip_conntrack_lock);
+ if (ct->helper)
+ ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+ if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+ test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+ ip_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, *pskb);
+
return NF_ACCEPT;
}
@@ -438,34 +566,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}
-static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
{
return LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
tuple);
}
-/* Allocate a new conntrack: we return -ENOMEM if classification
- failed due to stress. Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *protocol,
- struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_helper *helper;
+
+ /* need ip_conntrack_lock to assure that helper exists until
+ * try_module_get() is called */
+ read_lock_bh(&ip_conntrack_lock);
+
+ helper = __ip_conntrack_helper_find(tuple);
+ if (helper) {
+ /* need to increase module usage count to assure helper will
+ * not go away while the caller is e.g. busy putting a
+ * conntrack in the hash that uses the helper */
+ if (!try_module_get(helper->me))
+ helper = NULL;
+ }
+
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+ module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+ return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *p;
+
+ preempt_disable();
+ p = __ip_conntrack_proto_find(protocol);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_conntrack_generic_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+ module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+ struct ip_conntrack_tuple *repl)
{
struct ip_conntrack *conntrack;
- struct ip_conntrack_tuple repl_tuple;
- size_t hash;
- struct ip_conntrack_expect *exp;
if (!ip_conntrack_hash_rnd_initted) {
get_random_bytes(&ip_conntrack_hash_rnd, 4);
ip_conntrack_hash_rnd_initted = 1;
}
- hash = hash_conntrack(tuple);
-
if (ip_conntrack_max
&& atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+ unsigned int hash = hash_conntrack(orig);
/* Try dropping from this hash chain. */
if (!early_drop(&ip_conntrack_hash[hash])) {
if (net_ratelimit())
@@ -476,11 +654,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
}
}
- if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
- DEBUGP("Can't invert tuple.\n");
- return NULL;
- }
-
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
if (!conntrack) {
DEBUGP("Can't allocate conntrack.\n");
@@ -490,17 +663,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
memset(conntrack, 0, sizeof(*conntrack));
atomic_set(&conntrack->ct_general.use, 1);
conntrack->ct_general.destroy = destroy_conntrack;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
- if (!protocol->new(conntrack, skb)) {
- kmem_cache_free(ip_conntrack_cachep, conntrack);
- return NULL;
- }
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
/* Don't set timer yet: wait for confirmation */
init_timer(&conntrack->timeout);
conntrack->timeout.data = (unsigned long)conntrack;
conntrack->timeout.function = death_by_timeout;
+ atomic_inc(&ip_conntrack_count);
+
+ return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+ atomic_dec(&ip_conntrack_count);
+ kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress. Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ struct ip_conntrack_expect *exp;
+
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+ if (conntrack == NULL || IS_ERR(conntrack))
+ return (struct ip_conntrack_tuple_hash *)conntrack;
+
+ if (!protocol->new(conntrack, skb)) {
+ ip_conntrack_free(conntrack);
+ return NULL;
+ }
+
write_lock_bh(&ip_conntrack_lock);
exp = find_expectation(tuple);
@@ -521,7 +727,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
nf_conntrack_get(&conntrack->master->ct_general);
CONNTRACK_STAT_INC(expect_new);
} else {
- conntrack->helper = ip_ct_find_helper(&repl_tuple);
+ conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
CONNTRACK_STAT_INC(new);
}
@@ -529,7 +735,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
/* Overload tuple linked list to put us in unconfirmed list. */
list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
- atomic_inc(&ip_conntrack_count);
write_unlock_bh(&ip_conntrack_lock);
if (exp) {
@@ -607,7 +812,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
struct ip_conntrack_protocol *proto;
- int set_reply;
+ int set_reply = 0;
int ret;
/* Previously seen (loopback or untracked)? Ignore. */
@@ -625,9 +830,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
return NF_DROP;
}
- /* FIXME: Do this right please. --RR */
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
/* Ignore broadcast: no `connection'. */
@@ -643,7 +845,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
}
#endif
- proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+ proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
/* It may be an special packet, error, unclean...
* inverse of the return code tells to the netfilter
@@ -679,8 +881,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
return -ret;
}
- if (set_reply)
- set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_STATUS, *pskb);
return ret;
}
@@ -689,7 +891,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
const struct ip_conntrack_tuple *orig)
{
return ip_ct_invert_tuple(inverse, orig,
- ip_ct_find_proto(orig->dst.protonum));
+ __ip_conntrack_proto_find(orig->dst.protonum));
}
/* Would two expected things clash? */
@@ -725,7 +927,7 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
/* choose the the oldest expectation to evict */
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (expect_matches(i, exp) && del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_expect_put(i);
return;
@@ -734,6 +936,9 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
write_unlock_bh(&ip_conntrack_lock);
}
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
struct ip_conntrack_expect *new;
@@ -744,17 +949,14 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
return NULL;
}
new->master = me;
- atomic_inc(&new->master->ct_general.use);
atomic_set(&new->use, 1);
return new;
}
void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
- if (atomic_dec_and_test(&exp->use)) {
- ip_conntrack_put(exp->master);
+ if (atomic_dec_and_test(&exp->use))
kmem_cache_free(ip_conntrack_expect_cachep, exp);
- }
}
static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
@@ -769,6 +971,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
add_timer(&exp->timeout);
+ exp->id = ++ip_conntrack_expect_next_id;
+ atomic_inc(&exp->use);
CONNTRACK_STAT_INC(expect_create);
}
@@ -780,7 +984,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (i->master == master) {
if (del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
ip_conntrack_expect_put(i);
}
break;
@@ -827,6 +1031,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
evict_oldest_expect(expect->master);
ip_conntrack_expect_insert(expect);
+ ip_conntrack_expect_event(IPEXP_NEW, expect);
ret = 0;
out:
write_unlock_bh(&ip_conntrack_lock);
@@ -847,7 +1052,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
if (!conntrack->master && conntrack->expecting == 0)
- conntrack->helper = ip_ct_find_helper(newreply);
+ conntrack->helper = __ip_conntrack_helper_find(newreply);
write_unlock_bh(&ip_conntrack_lock);
}
@@ -861,11 +1066,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
return 0;
}
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+ struct ip_conntrack_helper *h;
+
+ list_for_each_entry(h, &helpers, list) {
+ if (!strcmp(h->name, name))
+ return h;
+ }
+
+ return NULL;
+}
+
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_helper *me)
{
- if (tuplehash_to_ctrack(i)->helper == me)
+ if (tuplehash_to_ctrack(i)->helper == me) {
+ ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
tuplehash_to_ctrack(i)->helper = NULL;
+ }
return 0;
}
@@ -881,7 +1101,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
/* Get rid of expectations */
list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
if (exp->master->helper == me && del_timer(&exp->timeout)) {
- unlink_expect(exp);
+ ip_ct_unlink_expect(exp);
ip_conntrack_expect_put(exp);
}
}
@@ -896,42 +1116,83 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
synchronize_net();
}
-static inline void ct_add_counters(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb)
-{
-#ifdef CONFIG_IP_NF_CT_ACCT
- if (skb) {
- ct->counters[CTINFO2DIR(ctinfo)].packets++;
- ct->counters[CTINFO2DIR(ctinfo)].bytes +=
- ntohs(skb->nh.iph->tot_len);
- }
-#endif
-}
-
-/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
-void ip_ct_refresh_acct(struct ip_conntrack *ct,
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __ip_ct_refresh_acct(struct ip_conntrack *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
- unsigned long extra_jiffies)
+ unsigned long extra_jiffies,
+ int do_acct)
{
+ int event = 0;
+
IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+ IP_NF_ASSERT(skb);
+
+ write_lock_bh(&ip_conntrack_lock);
/* If not in hash table, timer will not be active yet */
if (!is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- ct_add_counters(ct, ctinfo, skb);
+ event = IPCT_REFRESH;
} else {
- write_lock_bh(&ip_conntrack_lock);
/* Need del_timer for race avoidance (may already be dying). */
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
+ event = IPCT_REFRESH;
}
- ct_add_counters(ct, ctinfo, skb);
- write_unlock_bh(&ip_conntrack_lock);
}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if (do_acct) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ ntohs(skb->nh.iph->tot_len);
+ if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+ || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+ event |= IPCT_COUNTER_FILLING;
+ }
+#endif
+
+ write_unlock_bh(&ip_conntrack_lock);
+
+ /* must be unlocked when calling event cache */
+ if (event)
+ ip_conntrack_event_cache(event, skb);
+}
+
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+ &tuple->src.u.tcp.port);
+ NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+ &tuple->dst.u.tcp.port);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+ struct ip_conntrack_tuple *t)
+{
+ if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+ return -EINVAL;
+
+ t->src.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+ t->dst.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+ return 0;
}
+#endif
/* Returns new sk_buff, or NULL */
struct sk_buff *
@@ -943,10 +1204,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
skb = ip_defrag(skb, user);
local_bh_enable();
- if (skb) {
+ if (skb)
ip_send_check(skb->nh.iph);
- skb->nfcache |= NFC_ALTERED;
- }
return skb;
}
@@ -1086,26 +1345,23 @@ static int kill_all(struct ip_conntrack *i, void *data)
return 1;
}
-static void free_conntrack_hash(void)
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
{
- if (ip_conntrack_vmalloc)
- vfree(ip_conntrack_hash);
+ if (vmalloced)
+ vfree(hash);
else
- free_pages((unsigned long)ip_conntrack_hash,
- get_order(sizeof(struct list_head)
- * ip_conntrack_htable_size));
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct list_head) * size));
}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
+void ip_conntrack_flush()
{
- ip_ct_attach = NULL;
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
synchronize_net();
-
+
+ ip_ct_event_cache_flush();
i_see_dead_people:
ip_ct_iterate_cleanup(kill_all, NULL);
if (atomic_read(&ip_conntrack_count) != 0) {
@@ -1115,15 +1371,93 @@ void ip_conntrack_cleanup(void)
/* wait until all references to ip_conntrack_untracked are dropped */
while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
schedule();
+}
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+ ip_ct_attach = NULL;
+ ip_conntrack_flush();
kmem_cache_destroy(ip_conntrack_cachep);
kmem_cache_destroy(ip_conntrack_expect_cachep);
- free_conntrack_hash();
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
nf_unregister_sockopt(&so_getorigdst);
}
-static int hashsize;
-module_param(hashsize, int, 0400);
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+ struct list_head *hash;
+ unsigned int i;
+
+ *vmalloced = 0;
+ hash = (void*)__get_free_pages(GFP_KERNEL,
+ get_order(sizeof(struct list_head)
+ * size));
+ if (!hash) {
+ *vmalloced = 1;
+ printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+ hash = vmalloc(sizeof(struct list_head) * size);
+ }
+
+ if (hash)
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+int set_hashsize(const char *val, struct kernel_param *kp)
+{
+ int i, bucket, hashsize, vmalloced;
+ int old_vmalloced, old_size;
+ int rnd;
+ struct list_head *hash, *old_hash;
+ struct ip_conntrack_tuple_hash *h;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!ip_conntrack_htable_size)
+ return param_set_int(val, kp);
+
+ hashsize = simple_strtol(val, NULL, 0);
+ if (!hashsize)
+ return -EINVAL;
+
+ hash = alloc_hashtable(hashsize, &vmalloced);
+ if (!hash)
+ return -ENOMEM;
+
+ /* We have to rehash for the new table anyway, so we also can
+ * use a new random seed */
+ get_random_bytes(&rnd, 4);
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+ while (!list_empty(&ip_conntrack_hash[i])) {
+ h = list_entry(ip_conntrack_hash[i].next,
+ struct ip_conntrack_tuple_hash, list);
+ list_del(&h->list);
+ bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+ list_add_tail(&h->list, &hash[bucket]);
+ }
+ }
+ old_size = ip_conntrack_htable_size;
+ old_vmalloced = ip_conntrack_vmalloc;
+ old_hash = ip_conntrack_hash;
+
+ ip_conntrack_htable_size = hashsize;
+ ip_conntrack_vmalloc = vmalloced;
+ ip_conntrack_hash = hash;
+ ip_conntrack_hash_rnd = rnd;
+ write_unlock_bh(&ip_conntrack_lock);
+
+ free_conntrack_hash(old_hash, old_vmalloced, old_size);
+ return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+ &ip_conntrack_htable_size, 0600);
int __init ip_conntrack_init(void)
{
@@ -1132,9 +1466,7 @@ int __init ip_conntrack_init(void)
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 256 buckets. >= 1GB machines have 8192 buckets. */
- if (hashsize) {
- ip_conntrack_htable_size = hashsize;
- } else {
+ if (!ip_conntrack_htable_size) {
ip_conntrack_htable_size
= (((num_physpages << PAGE_SHIFT) / 16384)
/ sizeof(struct list_head));
@@ -1156,20 +1488,8 @@ int __init ip_conntrack_init(void)
return ret;
}
- /* AK: the hash table is twice as big than needed because it
- uses list_head. it would be much nicer to caches to use a
- single pointer list head here. */
- ip_conntrack_vmalloc = 0;
- ip_conntrack_hash
- =(void*)__get_free_pages(GFP_KERNEL,
- get_order(sizeof(struct list_head)
- *ip_conntrack_htable_size));
- if (!ip_conntrack_hash) {
- ip_conntrack_vmalloc = 1;
- printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
- ip_conntrack_hash = vmalloc(sizeof(struct list_head)
- * ip_conntrack_htable_size);
- }
+ ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+ &ip_conntrack_vmalloc);
if (!ip_conntrack_hash) {
printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
goto err_unreg_sockopt;
@@ -1201,9 +1521,6 @@ int __init ip_conntrack_init(void)
ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
write_unlock_bh(&ip_conntrack_lock);
- for (i = 0; i < ip_conntrack_htable_size; i++)
- INIT_LIST_HEAD(&ip_conntrack_hash[i]);
-
/* For use by ipt_REJECT */
ip_ct_attach = ip_conntrack_attach;
@@ -1218,7 +1535,8 @@ int __init ip_conntrack_init(void)
err_free_conntrack_slab:
kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
- free_conntrack_hash();
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
err_unreg_sockopt:
nf_unregister_sockopt(&so_getorigdst);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 7a3b773be3f9..d77d6b3f5f80 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -25,14 +25,13 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
/* This is slow, but it's simple. --RR */
-static char ftp_buffer[65536];
-
+static char *ftp_buffer;
static DEFINE_SPINLOCK(ip_ftp_lock);
#define MAX_PORTS 8
-static int ports[MAX_PORTS];
+static short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, int, &ports_c, 0400);
+module_param_array(ports, short, &ports_c, 0400);
static int loose;
module_param(loose, int, 0600);
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
}
/* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+ struct sk_buff *skb)
{
unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
oldest = i;
}
- if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
- else if (oldest != NUM_SEQ_TO_REMEMBER)
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ } else if (oldest != NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][oldest] = nl_seq;
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ }
}
static int help(struct sk_buff **pskb,
@@ -418,6 +421,7 @@ static int help(struct sk_buff **pskb,
{ 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
exp->expectfn = NULL;
+ exp->flags = 0;
/* Now, NAT might want to mangle the packet, and register the
* (possibly changed) expectation itself. */
@@ -439,14 +443,14 @@ out_update_nl:
/* Now if this ends in \n, update ftp info. Seq may have been
* adjusted by NAT code. */
if (ends_in_nl)
- update_nl_seq(seq, ct_ftp_info,dir);
+ update_nl_seq(seq, ct_ftp_info,dir, *pskb);
out:
spin_unlock_bh(&ip_ftp_lock);
return ret;
}
static struct ip_conntrack_helper ftp[MAX_PORTS];
-static char ftp_names[MAX_PORTS][10];
+static char ftp_names[MAX_PORTS][sizeof("ftp-65535")];
/* Not __exit: called from init() */
static void fini(void)
@@ -457,6 +461,8 @@ static void fini(void)
ports[i]);
ip_conntrack_helper_unregister(&ftp[i]);
}
+
+ kfree(ftp_buffer);
}
static int __init init(void)
@@ -464,6 +470,10 @@ static int __init init(void)
int i, ret;
char *tmpname;
+ ftp_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!ftp_buffer)
+ return -ENOMEM;
+
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
new file mode 100644
index 000000000000..926a6684643d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -0,0 +1,806 @@
+/*
+ * ip_conntrack_pptp.c - Version 3.0
+ *
+ * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * Limitations:
+ * - We blindly assume that control connections are always
+ * established in PNS->PAC direction. This is a violation
+ * of RFFC2673
+ * - We can only support one single call within each session
+ *
+ * TODO:
+ * - testing of incoming PPTP calls
+ *
+ * Changes:
+ * 2002-02-05 - Version 1.3
+ * - Call ip_conntrack_unexpect_related() from
+ * pptp_destroy_siblings() to destroy expectations in case
+ * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen
+ * (Philip Craig <philipc@snapgear.com>)
+ * - Add Version information at module loadtime
+ * 2002-02-10 - Version 1.6
+ * - move to C99 style initializers
+ * - remove second expectation if first arrives
+ * 2004-10-22 - Version 2.0
+ * - merge Mandrake's 2.6.x port with recent 2.6.x API changes
+ * - fix lots of linear skb assumptions from Mandrake's port
+ * 2005-06-10 - Version 2.1
+ * - use ip_conntrack_expect_free() instead of kfree() on the
+ * expect's (which are from the slab for quite some time)
+ * 2005-06-10 - Version 3.0
+ * - port helper to post-2.6.11 API changes,
+ * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
+ * 2005-07-30 - Version 3.1
+ * - port helper to 2.6.13 API changes
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
+
+#define IP_CT_PPTP_VERSION "3.1"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
+
+static DEFINE_SPINLOCK(ip_pptp_lock);
+
+int
+(*ip_nat_pptp_hook_outbound)(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq);
+
+int
+(*ip_nat_pptp_hook_inbound)(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq);
+
+int
+(*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig,
+ struct ip_conntrack_expect *expect_reply);
+
+void
+(*ip_nat_pptp_hook_expectfn)(struct ip_conntrack *ct,
+ struct ip_conntrack_expect *exp);
+
+#if 0
+/* PptpControlMessageType names */
+const char *pptp_msg_name[] = {
+ "UNKNOWN_MESSAGE",
+ "START_SESSION_REQUEST",
+ "START_SESSION_REPLY",
+ "STOP_SESSION_REQUEST",
+ "STOP_SESSION_REPLY",
+ "ECHO_REQUEST",
+ "ECHO_REPLY",
+ "OUT_CALL_REQUEST",
+ "OUT_CALL_REPLY",
+ "IN_CALL_REQUEST",
+ "IN_CALL_REPLY",
+ "IN_CALL_CONNECT",
+ "CALL_CLEAR_REQUEST",
+ "CALL_DISCONNECT_NOTIFY",
+ "WAN_ERROR_NOTIFY",
+ "SET_LINK_INFO"
+};
+EXPORT_SYMBOL(pptp_msg_name);
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define SECS *HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+
+#define PPTP_GRE_TIMEOUT (10 MINS)
+#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS)
+
+static void pptp_expectfn(struct ip_conntrack *ct,
+ struct ip_conntrack_expect *exp)
+{
+ DEBUGP("increasing timeouts\n");
+
+ /* increase timeout of GRE data channel conntrack entry */
+ ct->proto.gre.timeout = PPTP_GRE_TIMEOUT;
+ ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
+
+ /* Can you see how rusty this code is, compared with the pre-2.6.11
+ * one? That's what happened to my shiny newnat of 2002 ;( -HW */
+
+ if (!ip_nat_pptp_hook_expectfn) {
+ struct ip_conntrack_tuple inv_t;
+ struct ip_conntrack_expect *exp_other;
+
+ /* obviously this tuple inversion only works until you do NAT */
+ invert_tuplepr(&inv_t, &exp->tuple);
+ DEBUGP("trying to unexpect other dir: ");
+ DUMP_TUPLE(&inv_t);
+
+ exp_other = ip_conntrack_expect_find(&inv_t);
+ if (exp_other) {
+ /* delete other expectation. */
+ DEBUGP("found\n");
+ ip_conntrack_unexpect_related(exp_other);
+ ip_conntrack_expect_put(exp_other);
+ } else {
+ DEBUGP("not found\n");
+ }
+ } else {
+ /* we need more than simple inversion */
+ ip_nat_pptp_hook_expectfn(ct, exp);
+ }
+}
+
+static int destroy_sibling_or_exp(const struct ip_conntrack_tuple *t)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_expect *exp;
+
+ DEBUGP("trying to timeout ct or exp for tuple ");
+ DUMP_TUPLE(t);
+
+ h = ip_conntrack_find_get(t, NULL);
+ if (h) {
+ struct ip_conntrack *sibling = tuplehash_to_ctrack(h);
+ DEBUGP("setting timeout of conntrack %p to 0\n", sibling);
+ sibling->proto.gre.timeout = 0;
+ sibling->proto.gre.stream_timeout = 0;
+ if (del_timer(&sibling->timeout))
+ sibling->timeout.function((unsigned long)sibling);
+ ip_conntrack_put(sibling);
+ return 1;
+ } else {
+ exp = ip_conntrack_expect_find(t);
+ if (exp) {
+ DEBUGP("unexpect_related of expect %p\n", exp);
+ ip_conntrack_unexpect_related(exp);
+ ip_conntrack_expect_put(exp);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+
+/* timeout GRE data connections */
+static void pptp_destroy_siblings(struct ip_conntrack *ct)
+{
+ struct ip_conntrack_tuple t;
+
+ /* Since ct->sibling_list has literally rusted away in 2.6.11,
+ * we now need another way to find out about our sibling
+ * contrack and expects... -HW */
+
+ /* try original (pns->pac) tuple */
+ memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
+ t.dst.protonum = IPPROTO_GRE;
+ t.src.u.gre.key = htons(ct->help.ct_pptp_info.pns_call_id);
+ t.dst.u.gre.key = htons(ct->help.ct_pptp_info.pac_call_id);
+
+ if (!destroy_sibling_or_exp(&t))
+ DEBUGP("failed to timeout original pns->pac ct/exp\n");
+
+ /* try reply (pac->pns) tuple */
+ memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
+ t.dst.protonum = IPPROTO_GRE;
+ t.src.u.gre.key = htons(ct->help.ct_pptp_info.pac_call_id);
+ t.dst.u.gre.key = htons(ct->help.ct_pptp_info.pns_call_id);
+
+ if (!destroy_sibling_or_exp(&t))
+ DEBUGP("failed to timeout reply pac->pns ct/exp\n");
+}
+
+/* expect GRE connections (PNS->PAC and PAC->PNS direction) */
+static inline int
+exp_gre(struct ip_conntrack *master,
+ u_int32_t seq,
+ __be16 callid,
+ __be16 peer_callid)
+{
+ struct ip_conntrack_tuple inv_tuple;
+ struct ip_conntrack_tuple exp_tuples[] = {
+ /* tuple in original direction, PNS->PAC */
+ { .src = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip,
+ .u = { .gre = { .key = peer_callid } }
+ },
+ .dst = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip,
+ .u = { .gre = { .key = callid } },
+ .protonum = IPPROTO_GRE
+ },
+ },
+ /* tuple in reply direction, PAC->PNS */
+ { .src = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
+ .u = { .gre = { .key = callid } }
+ },
+ .dst = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
+ .u = { .gre = { .key = peer_callid } },
+ .protonum = IPPROTO_GRE
+ },
+ }
+ };
+ struct ip_conntrack_expect *exp_orig, *exp_reply;
+ int ret = 1;
+
+ exp_orig = ip_conntrack_expect_alloc(master);
+ if (exp_orig == NULL)
+ goto out;
+
+ exp_reply = ip_conntrack_expect_alloc(master);
+ if (exp_reply == NULL)
+ goto out_put_orig;
+
+ memcpy(&exp_orig->tuple, &exp_tuples[0], sizeof(exp_orig->tuple));
+
+ exp_orig->mask.src.ip = 0xffffffff;
+ exp_orig->mask.src.u.all = 0;
+ exp_orig->mask.dst.u.all = 0;
+ exp_orig->mask.dst.u.gre.key = htons(0xffff);
+ exp_orig->mask.dst.ip = 0xffffffff;
+ exp_orig->mask.dst.protonum = 0xff;
+
+ exp_orig->master = master;
+ exp_orig->expectfn = pptp_expectfn;
+ exp_orig->flags = 0;
+
+ exp_orig->dir = IP_CT_DIR_ORIGINAL;
+
+ /* both expectations are identical apart from tuple */
+ memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
+ memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple));
+
+ exp_reply->dir = !exp_orig->dir;
+
+ if (ip_nat_pptp_hook_exp_gre)
+ ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
+ else {
+
+ DEBUGP("calling expect_related PNS->PAC");
+ DUMP_TUPLE(&exp_orig->tuple);
+
+ if (ip_conntrack_expect_related(exp_orig) != 0) {
+ DEBUGP("cannot expect_related()\n");
+ goto out_put_both;
+ }
+
+ DEBUGP("calling expect_related PAC->PNS");
+ DUMP_TUPLE(&exp_reply->tuple);
+
+ if (ip_conntrack_expect_related(exp_reply) != 0) {
+ DEBUGP("cannot expect_related()\n");
+ goto out_unexpect_orig;
+ }
+
+ /* Add GRE keymap entries */
+ if (ip_ct_gre_keymap_add(master, &exp_reply->tuple, 0) != 0) {
+ DEBUGP("cannot keymap_add() exp\n");
+ goto out_unexpect_both;
+ }
+
+ invert_tuplepr(&inv_tuple, &exp_reply->tuple);
+ if (ip_ct_gre_keymap_add(master, &inv_tuple, 1) != 0) {
+ ip_ct_gre_keymap_destroy(master);
+ DEBUGP("cannot keymap_add() exp_inv\n");
+ goto out_unexpect_both;
+ }
+ ret = 0;
+ }
+
+out_put_both:
+ ip_conntrack_expect_put(exp_reply);
+out_put_orig:
+ ip_conntrack_expect_put(exp_orig);
+out:
+ return ret;
+
+out_unexpect_both:
+ ip_conntrack_unexpect_related(exp_reply);
+out_unexpect_orig:
+ ip_conntrack_unexpect_related(exp_orig);
+ goto out_put_both;
+}
+
+static inline int
+pptp_inbound_pkt(struct sk_buff **pskb,
+ struct tcphdr *tcph,
+ unsigned int nexthdr_off,
+ unsigned int datalen,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct PptpControlHeader _ctlh, *ctlh;
+ unsigned int reqlen;
+ union pptp_ctrl_union _pptpReq, *pptpReq;
+ struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
+ u_int16_t msg;
+ __be16 *cid, *pcid;
+ u_int32_t seq;
+
+ ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+ if (!ctlh) {
+ DEBUGP("error during skb_header_pointer\n");
+ return NF_ACCEPT;
+ }
+ nexthdr_off += sizeof(_ctlh);
+ datalen -= sizeof(_ctlh);
+
+ reqlen = datalen;
+ if (reqlen > sizeof(*pptpReq))
+ reqlen = sizeof(*pptpReq);
+ pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq);
+ if (!pptpReq) {
+ DEBUGP("error during skb_header_pointer\n");
+ return NF_ACCEPT;
+ }
+
+ msg = ntohs(ctlh->messageType);
+ DEBUGP("inbound control message %s\n", pptp_msg_name[msg]);
+
+ switch (msg) {
+ case PPTP_START_SESSION_REPLY:
+ if (reqlen < sizeof(_pptpReq.srep)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* server confirms new control session */
+ if (info->sstate < PPTP_SESSION_REQUESTED) {
+ DEBUGP("%s without START_SESS_REQUEST\n",
+ pptp_msg_name[msg]);
+ break;
+ }
+ if (pptpReq->srep.resultCode == PPTP_START_OK)
+ info->sstate = PPTP_SESSION_CONFIRMED;
+ else
+ info->sstate = PPTP_SESSION_ERROR;
+ break;
+
+ case PPTP_STOP_SESSION_REPLY:
+ if (reqlen < sizeof(_pptpReq.strep)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* server confirms end of control session */
+ if (info->sstate > PPTP_SESSION_STOPREQ) {
+ DEBUGP("%s without STOP_SESS_REQUEST\n",
+ pptp_msg_name[msg]);
+ break;
+ }
+ if (pptpReq->strep.resultCode == PPTP_STOP_OK)
+ info->sstate = PPTP_SESSION_NONE;
+ else
+ info->sstate = PPTP_SESSION_ERROR;
+ break;
+
+ case PPTP_OUT_CALL_REPLY:
+ if (reqlen < sizeof(_pptpReq.ocack)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* server accepted call, we now expect GRE frames */
+ if (info->sstate != PPTP_SESSION_CONFIRMED) {
+ DEBUGP("%s but no session\n", pptp_msg_name[msg]);
+ break;
+ }
+ if (info->cstate != PPTP_CALL_OUT_REQ &&
+ info->cstate != PPTP_CALL_OUT_CONF) {
+ DEBUGP("%s without OUTCALL_REQ\n", pptp_msg_name[msg]);
+ break;
+ }
+ if (pptpReq->ocack.resultCode != PPTP_OUTCALL_CONNECT) {
+ info->cstate = PPTP_CALL_NONE;
+ break;
+ }
+
+ cid = &pptpReq->ocack.callID;
+ pcid = &pptpReq->ocack.peersCallID;
+
+ info->pac_call_id = ntohs(*cid);
+
+ if (htons(info->pns_call_id) != *pcid) {
+ DEBUGP("%s for unknown callid %u\n",
+ pptp_msg_name[msg], ntohs(*pcid));
+ break;
+ }
+
+ DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+ ntohs(*cid), ntohs(*pcid));
+
+ info->cstate = PPTP_CALL_OUT_CONF;
+
+ seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr)
+ + sizeof(struct PptpControlHeader)
+ + ((void *)pcid - (void *)pptpReq);
+
+ if (exp_gre(ct, seq, *cid, *pcid) != 0)
+ printk("ip_conntrack_pptp: error during exp_gre\n");
+ break;
+
+ case PPTP_IN_CALL_REQUEST:
+ if (reqlen < sizeof(_pptpReq.icack)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* server tells us about incoming call request */
+ if (info->sstate != PPTP_SESSION_CONFIRMED) {
+ DEBUGP("%s but no session\n", pptp_msg_name[msg]);
+ break;
+ }
+ pcid = &pptpReq->icack.peersCallID;
+ DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
+ info->cstate = PPTP_CALL_IN_REQ;
+ info->pac_call_id = ntohs(*pcid);
+ break;
+
+ case PPTP_IN_CALL_CONNECT:
+ if (reqlen < sizeof(_pptpReq.iccon)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* server tells us about incoming call established */
+ if (info->sstate != PPTP_SESSION_CONFIRMED) {
+ DEBUGP("%s but no session\n", pptp_msg_name[msg]);
+ break;
+ }
+ if (info->sstate != PPTP_CALL_IN_REP
+ && info->sstate != PPTP_CALL_IN_CONF) {
+ DEBUGP("%s but never sent IN_CALL_REPLY\n",
+ pptp_msg_name[msg]);
+ break;
+ }
+
+ pcid = &pptpReq->iccon.peersCallID;
+ cid = &info->pac_call_id;
+
+ if (info->pns_call_id != ntohs(*pcid)) {
+ DEBUGP("%s for unknown CallID %u\n",
+ pptp_msg_name[msg], ntohs(*pcid));
+ break;
+ }
+
+ DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
+ info->cstate = PPTP_CALL_IN_CONF;
+
+ /* we expect a GRE connection from PAC to PNS */
+ seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr)
+ + sizeof(struct PptpControlHeader)
+ + ((void *)pcid - (void *)pptpReq);
+
+ if (exp_gre(ct, seq, *cid, *pcid) != 0)
+ printk("ip_conntrack_pptp: error during exp_gre\n");
+
+ break;
+
+ case PPTP_CALL_DISCONNECT_NOTIFY:
+ if (reqlen < sizeof(_pptpReq.disc)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* server confirms disconnect */
+ cid = &pptpReq->disc.callID;
+ DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*cid));
+ info->cstate = PPTP_CALL_NONE;
+
+ /* untrack this call id, unexpect GRE packets */
+ pptp_destroy_siblings(ct);
+ break;
+
+ case PPTP_WAN_ERROR_NOTIFY:
+ break;
+
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* I don't have to explain these ;) */
+ break;
+ default:
+ DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)
+ ? pptp_msg_name[msg]:pptp_msg_name[0], msg);
+ break;
+ }
+
+
+ if (ip_nat_pptp_hook_inbound)
+ return ip_nat_pptp_hook_inbound(pskb, ct, ctinfo, ctlh,
+ pptpReq);
+
+ return NF_ACCEPT;
+
+}
+
+static inline int
+pptp_outbound_pkt(struct sk_buff **pskb,
+ struct tcphdr *tcph,
+ unsigned int nexthdr_off,
+ unsigned int datalen,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct PptpControlHeader _ctlh, *ctlh;
+ unsigned int reqlen;
+ union pptp_ctrl_union _pptpReq, *pptpReq;
+ struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
+ u_int16_t msg;
+ __be16 *cid, *pcid;
+
+ ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+ if (!ctlh)
+ return NF_ACCEPT;
+ nexthdr_off += sizeof(_ctlh);
+ datalen -= sizeof(_ctlh);
+
+ reqlen = datalen;
+ if (reqlen > sizeof(*pptpReq))
+ reqlen = sizeof(*pptpReq);
+ pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq);
+ if (!pptpReq)
+ return NF_ACCEPT;
+
+ msg = ntohs(ctlh->messageType);
+ DEBUGP("outbound control message %s\n", pptp_msg_name[msg]);
+
+ switch (msg) {
+ case PPTP_START_SESSION_REQUEST:
+ /* client requests for new control session */
+ if (info->sstate != PPTP_SESSION_NONE) {
+ DEBUGP("%s but we already have one",
+ pptp_msg_name[msg]);
+ }
+ info->sstate = PPTP_SESSION_REQUESTED;
+ break;
+ case PPTP_STOP_SESSION_REQUEST:
+ /* client requests end of control session */
+ info->sstate = PPTP_SESSION_STOPREQ;
+ break;
+
+ case PPTP_OUT_CALL_REQUEST:
+ if (reqlen < sizeof(_pptpReq.ocreq)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ /* FIXME: break; */
+ }
+
+ /* client initiating connection to server */
+ if (info->sstate != PPTP_SESSION_CONFIRMED) {
+ DEBUGP("%s but no session\n",
+ pptp_msg_name[msg]);
+ break;
+ }
+ info->cstate = PPTP_CALL_OUT_REQ;
+ /* track PNS call id */
+ cid = &pptpReq->ocreq.callID;
+ DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*cid));
+ info->pns_call_id = ntohs(*cid);
+ break;
+ case PPTP_IN_CALL_REPLY:
+ if (reqlen < sizeof(_pptpReq.icack)) {
+ DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+ break;
+ }
+
+ /* client answers incoming call */
+ if (info->cstate != PPTP_CALL_IN_REQ
+ && info->cstate != PPTP_CALL_IN_REP) {
+ DEBUGP("%s without incall_req\n",
+ pptp_msg_name[msg]);
+ break;
+ }
+ if (pptpReq->icack.resultCode != PPTP_INCALL_ACCEPT) {
+ info->cstate = PPTP_CALL_NONE;
+ break;
+ }
+ pcid = &pptpReq->icack.peersCallID;
+ if (info->pac_call_id != ntohs(*pcid)) {
+ DEBUGP("%s for unknown call %u\n",
+ pptp_msg_name[msg], ntohs(*pcid));
+ break;
+ }
+ DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
+ /* part two of the three-way handshake */
+ info->cstate = PPTP_CALL_IN_REP;
+ info->pns_call_id = ntohs(pptpReq->icack.callID);
+ break;
+
+ case PPTP_CALL_CLEAR_REQUEST:
+ /* client requests hangup of call */
+ if (info->sstate != PPTP_SESSION_CONFIRMED) {
+ DEBUGP("CLEAR_CALL but no session\n");
+ break;
+ }
+ /* FUTURE: iterate over all calls and check if
+ * call ID is valid. We don't do this without newnat,
+ * because we only know about last call */
+ info->cstate = PPTP_CALL_CLEAR_REQ;
+ break;
+ case PPTP_SET_LINK_INFO:
+ break;
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* I don't have to explain these ;) */
+ break;
+ default:
+ DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)?
+ pptp_msg_name[msg]:pptp_msg_name[0], msg);
+ /* unknown: no need to create GRE masq table entry */
+ break;
+ }
+
+ if (ip_nat_pptp_hook_outbound)
+ return ip_nat_pptp_hook_outbound(pskb, ct, ctinfo, ctlh,
+ pptpReq);
+
+ return NF_ACCEPT;
+}
+
+
+/* track caller id inside control connection, call expect_related */
+static int
+conntrack_pptp_help(struct sk_buff **pskb,
+ struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+
+{
+ struct pptp_pkt_hdr _pptph, *pptph;
+ struct tcphdr _tcph, *tcph;
+ u_int32_t tcplen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4;
+ u_int32_t datalen;
+ int dir = CTINFO2DIR(ctinfo);
+ struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
+ unsigned int nexthdr_off;
+
+ int oldsstate, oldcstate;
+ int ret;
+
+ /* don't do any tracking before tcp handshake complete */
+ if (ctinfo != IP_CT_ESTABLISHED
+ && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
+ DEBUGP("ctinfo = %u, skipping\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ nexthdr_off = (*pskb)->nh.iph->ihl*4;
+ tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph);
+ BUG_ON(!tcph);
+ nexthdr_off += tcph->doff * 4;
+ datalen = tcplen - tcph->doff * 4;
+
+ if (tcph->fin || tcph->rst) {
+ DEBUGP("RST/FIN received, timeouting GRE\n");
+ /* can't do this after real newnat */
+ info->cstate = PPTP_CALL_NONE;
+
+ /* untrack this call id, unexpect GRE packets */
+ pptp_destroy_siblings(ct);
+ }
+
+ pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph);
+ if (!pptph) {
+ DEBUGP("no full PPTP header, can't track\n");
+ return NF_ACCEPT;
+ }
+ nexthdr_off += sizeof(_pptph);
+ datalen -= sizeof(_pptph);
+
+ /* if it's not a control message we can't do anything with it */
+ if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
+ ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
+ DEBUGP("not a control packet\n");
+ return NF_ACCEPT;
+ }
+
+ oldsstate = info->sstate;
+ oldcstate = info->cstate;
+
+ spin_lock_bh(&ip_pptp_lock);
+
+ /* FIXME: We just blindly assume that the control connection is always
+ * established from PNS->PAC. However, RFC makes no guarantee */
+ if (dir == IP_CT_DIR_ORIGINAL)
+ /* client -> server (PNS -> PAC) */
+ ret = pptp_outbound_pkt(pskb, tcph, nexthdr_off, datalen, ct,
+ ctinfo);
+ else
+ /* server -> client (PAC -> PNS) */
+ ret = pptp_inbound_pkt(pskb, tcph, nexthdr_off, datalen, ct,
+ ctinfo);
+ DEBUGP("sstate: %d->%d, cstate: %d->%d\n",
+ oldsstate, info->sstate, oldcstate, info->cstate);
+ spin_unlock_bh(&ip_pptp_lock);
+
+ return ret;
+}
+
+/* control protocol helper */
+static struct ip_conntrack_helper pptp = {
+ .list = { NULL, NULL },
+ .name = "pptp",
+ .me = THIS_MODULE,
+ .max_expected = 2,
+ .timeout = 5 * 60,
+ .tuple = { .src = { .ip = 0,
+ .u = { .tcp = { .port =
+ __constant_htons(PPTP_CONTROL_PORT) } }
+ },
+ .dst = { .ip = 0,
+ .u = { .all = 0 },
+ .protonum = IPPROTO_TCP
+ }
+ },
+ .mask = { .src = { .ip = 0,
+ .u = { .tcp = { .port = __constant_htons(0xffff) } }
+ },
+ .dst = { .ip = 0,
+ .u = { .all = 0 },
+ .protonum = 0xff
+ }
+ },
+ .help = conntrack_pptp_help
+};
+
+extern void __exit ip_ct_proto_gre_fini(void);
+extern int __init ip_ct_proto_gre_init(void);
+
+/* ip_conntrack_pptp initialization */
+static int __init init(void)
+{
+ int retcode;
+
+ retcode = ip_ct_proto_gre_init();
+ if (retcode < 0)
+ return retcode;
+
+ DEBUGP(" registering helper\n");
+ if ((retcode = ip_conntrack_helper_register(&pptp))) {
+ printk(KERN_ERR "Unable to register conntrack application "
+ "helper for pptp: %d\n", retcode);
+ ip_ct_proto_gre_fini();
+ return retcode;
+ }
+
+ printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION);
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&pptp);
+ ip_ct_proto_gre_fini();
+ printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION);
+}
+
+module_init(init);
+module_exit(fini);
+
+EXPORT_SYMBOL(ip_nat_pptp_hook_outbound);
+EXPORT_SYMBOL(ip_nat_pptp_hook_inbound);
+EXPORT_SYMBOL(ip_nat_pptp_hook_exp_gre);
+EXPORT_SYMBOL(ip_nat_pptp_hook_expectfn);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 4a28f297d502..15457415a4f3 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -34,12 +34,12 @@
#include <linux/moduleparam.h>
#define MAX_PORTS 8
-static int ports[MAX_PORTS];
+static short ports[MAX_PORTS];
static int ports_c;
static int max_dcc_channels = 8;
static unsigned int dcc_timeout = 300;
/* This is slow, but it's simple. --RR */
-static char irc_buffer[65536];
+static char *irc_buffer;
static DEFINE_SPINLOCK(irc_buffer_lock);
unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
@@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
MODULE_LICENSE("GPL");
-module_param_array(ports, int, &ports_c, 0400);
+module_param_array(ports, short, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of IRC servers");
module_param(max_dcc_channels, int, 0400);
MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
@@ -221,6 +221,7 @@ static int help(struct sk_buff **pskb,
{ { 0, { 0 } },
{ 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
exp->expectfn = NULL;
+ exp->flags = 0;
if (ip_nat_irc_hook)
ret = ip_nat_irc_hook(pskb, ctinfo,
addr_beg_p - ib_ptr,
@@ -239,7 +240,7 @@ static int help(struct sk_buff **pskb,
}
static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
-static char irc_names[MAX_PORTS][10];
+static char irc_names[MAX_PORTS][sizeof("irc-65535")];
static void fini(void);
@@ -257,6 +258,10 @@ static int __init init(void)
printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
return -EBUSY;
}
+
+ irc_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!irc_buffer)
+ return -ENOMEM;
/* If no port given, default to standard irc port */
if (ports_c == 0)
@@ -304,6 +309,7 @@ static void fini(void)
ports[i]);
ip_conntrack_helper_unregister(&irc_helpers[i]);
}
+ kfree(irc_buffer);
}
module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
new file mode 100644
index 000000000000..186646eb249f
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
@@ -0,0 +1,142 @@
+/*
+ * NetBIOS name service broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/*
+ * This helper tracks locally originating NetBIOS name service
+ * requests by issuing permanent expectations (valid until
+ * timing out) matching all reply connections from the
+ * destination network. The only NetBIOS specific thing is
+ * actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+#define NMBD_PORT 137
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int timeout = 3;
+module_param(timeout, int, 0600);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+static int help(struct sk_buff **pskb,
+ struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+ struct ip_conntrack_expect *exp;
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct rtable *rt = (struct rtable *)(*pskb)->dst;
+ struct in_device *in_dev;
+ u_int32_t mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if ((*pskb)->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (mask == 0)
+ goto out;
+
+ exp = ip_conntrack_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->tuple.src.u.udp.port = ntohs(NMBD_PORT);
+
+ exp->mask.src.ip = mask;
+ exp->mask.src.u.udp.port = 0xFFFF;
+ exp->mask.dst.ip = 0xFFFFFFFF;
+ exp->mask.dst.u.udp.port = 0xFFFF;
+ exp->mask.dst.protonum = 0xFF;
+
+ exp->expectfn = NULL;
+ exp->flags = IP_CT_EXPECT_PERMANENT;
+
+ ip_conntrack_expect_related(exp);
+ ip_conntrack_expect_put(exp);
+
+ ip_ct_refresh(ct, *pskb, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+
+static struct ip_conntrack_helper helper = {
+ .name = "netbios-ns",
+ .tuple = {
+ .src = {
+ .u = {
+ .udp = {
+ .port = __constant_htons(NMBD_PORT),
+ }
+ }
+ },
+ .dst = {
+ .protonum = IPPROTO_UDP,
+ },
+ },
+ .mask = {
+ .src = {
+ .u = {
+ .udp = {
+ .port = 0xFFFF,
+ }
+ }
+ },
+ .dst = {
+ .protonum = 0xFF,
+ },
+ },
+ .max_expected = 1,
+ .me = THIS_MODULE,
+ .help = help,
+};
+
+static int __init init(void)
+{
+ helper.timeout = timeout;
+ return ip_conntrack_helper_register(&helper);
+}
+
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&helper);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 000000000000..166e6069f121
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1622 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.90";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_protocol *proto;
+
+ NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+
+ proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+ if (proto && proto->tuple_to_nfattr)
+ return proto->tuple_to_nfattr(skb, tuple);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *nest_parms;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+ NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
+ NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+ ctnetlink_dump_tuples_proto(skb, tuple);
+ NFA_NEST_END(skb, nest_parms);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t status = htonl((u_int32_t) ct->status);
+ NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ long timeout_l = ct->timeout.expires - jiffies;
+ u_int32_t timeout;
+
+ if (timeout_l < 0)
+ timeout = 0;
+ else
+ timeout = htonl(timeout_l / HZ);
+
+ NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+
+ struct nfattr *nest_proto;
+ int ret;
+
+ if (!proto || !proto->to_nfattr)
+ return 0;
+
+ nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+
+ ret = proto->to_nfattr(skb, nest_proto, ct);
+
+ ip_conntrack_proto_put(proto);
+
+ NFA_NEST_END(skb, nest_proto);
+
+ return ret;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ struct nfattr *nest_helper;
+
+ if (!ct->helper)
+ return 0;
+
+ nest_helper = NFA_NEST(skb, CTA_HELP);
+ NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name);
+
+ if (ct->helper->to_nfattr)
+ ct->helper->to_nfattr(skb, ct);
+
+ NFA_NEST_END(skb, nest_helper);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
+ enum ip_conntrack_dir dir)
+{
+ enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nfattr *nest_count = NFA_NEST(skb, type);
+ u_int64_t tmp;
+
+ tmp = htonl(ct->counters[dir].packets);
+ NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
+
+ tmp = htonl(ct->counters[dir].bytes);
+ NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
+
+ NFA_NEST_END(skb, nest_count);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t mark = htonl(ct->mark);
+
+ NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t id = htonl(ct->id);
+ NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ unsigned int use = htonl(atomic_read(&ct->ct_general.use));
+
+ NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event, int nowait,
+ const struct ip_conntrack *ct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nfattr *nest_parms;
+ unsigned char *b;
+
+ b = skb->tail;
+
+ event |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ if (ctnetlink_dump_status(skb, ct) < 0 ||
+ ctnetlink_dump_timeout(skb, ct) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+ ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_id(skb, ct) < 0 ||
+ ctnetlink_dump_use(skb, ct) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+ unsigned long events, void *ptr)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nfattr *nest_parms;
+ struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
+ struct sk_buff *skb;
+ unsigned int type;
+ unsigned char *b;
+ unsigned int flags = 0, group;
+
+ /* ignore our fake conntrack entry */
+ if (ct == &ip_conntrack_untracked)
+ return NOTIFY_DONE;
+
+ if (events & IPCT_DESTROY) {
+ type = IPCTNL_MSG_CT_DELETE;
+ group = NFNLGRP_CONNTRACK_DESTROY;
+ goto alloc_skb;
+ }
+ if (events & (IPCT_NEW | IPCT_RELATED)) {
+ type = IPCTNL_MSG_CT_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ /* dump everything */
+ events = ~0UL;
+ group = NFNLGRP_CONNTRACK_NEW;
+ goto alloc_skb;
+ }
+ if (events & (IPCT_STATUS |
+ IPCT_PROTOINFO |
+ IPCT_HELPER |
+ IPCT_HELPINFO |
+ IPCT_NATINFO)) {
+ type = IPCTNL_MSG_CT_NEW;
+ group = NFNLGRP_CONNTRACK_UPDATE;
+ goto alloc_skb;
+ }
+
+ return NOTIFY_DONE;
+
+alloc_skb:
+ /* FIXME: Check if there are any listeners before, don't hurt performance */
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ return NOTIFY_DONE;
+
+ b = skb->tail;
+
+ type |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = flags;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ /* NAT stuff is now a status flag */
+ if ((events & IPCT_STATUS || events & IPCT_NATINFO)
+ && ctnetlink_dump_status(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_REFRESH
+ && ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_PROTOINFO
+ && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_HELPINFO
+ && ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nfattr_failure;
+
+ if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ nfnetlink_send(skb, 0, group, 0);
+ return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+ kfree_skb(skb);
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+ DEBUGP("entered %s\n", __FUNCTION__);
+ return 0;
+}
+
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack *ct = NULL;
+ struct ip_conntrack_tuple_hash *h;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+ DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
+ cb->args[0], *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+ list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = tuplehash_to_ctrack(h);
+ if (ct->id <= *id)
+ continue;
+ if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW,
+ 1, ct) < 0)
+ goto out;
+ *id = ct->id;
+ }
+ }
+out:
+ read_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+ return skb->len;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static int
+ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack *ct = NULL;
+ struct ip_conntrack_tuple_hash *h;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+ DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__,
+ cb->args[0], *id);
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+ list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = tuplehash_to_ctrack(h);
+ if (ct->id <= *id)
+ continue;
+ if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW,
+ 1, ct) < 0)
+ goto out;
+ *id = ct->id;
+
+ memset(&ct->counters, 0, sizeof(ct->counters));
+ }
+ }
+out:
+ write_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+ return skb->len;
+}
+#endif
+
+static const int cta_min_ip[CTA_IP_MAX] = {
+ [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
+ [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *tb[CTA_IP_MAX];
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+
+ if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+ return -EINVAL;
+
+ if (!tb[CTA_IP_V4_SRC-1])
+ return -EINVAL;
+ tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
+
+ if (!tb[CTA_IP_V4_DST-1])
+ return -EINVAL;
+ tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+
+ DEBUGP("leaving\n");
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static const int cta_min_proto[CTA_PROTO_MAX] = {
+ [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
+ [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
+ [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
+ [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
+ [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
+ [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_proto(struct nfattr *attr,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *tb[CTA_PROTO_MAX];
+ struct ip_conntrack_protocol *proto;
+ int ret = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+ return -EINVAL;
+
+ if (!tb[CTA_PROTO_NUM-1])
+ return -EINVAL;
+ tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+
+ proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+
+ if (likely(proto && proto->nfattr_to_tuple)) {
+ ret = proto->nfattr_to_tuple(tb, tuple);
+ ip_conntrack_proto_put(proto);
+ }
+
+ return ret;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
+ enum ctattr_tuple type)
+{
+ struct nfattr *tb[CTA_TUPLE_MAX];
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_TUPLE_IP-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_TUPLE_PROTO-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+ if (err < 0)
+ return err;
+
+ /* orig and expect tuples get DIR_ORIGINAL */
+ if (type == CTA_TUPLE_REPLY)
+ tuple->dst.dir = IP_CT_DIR_REPLY;
+ else
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ DUMP_TUPLE(tuple);
+
+ DEBUGP("leaving\n");
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+ [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
+ [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
+};
+
+static int ctnetlink_parse_nat_proto(struct nfattr *attr,
+ const struct ip_conntrack *ct,
+ struct ip_nat_range *range)
+{
+ struct nfattr *tb[CTA_PROTONAT_MAX];
+ struct ip_nat_protocol *npt;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
+ goto nfattr_failure;
+
+ npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+ if (!npt)
+ return 0;
+
+ if (!npt->nfattr_to_range) {
+ ip_nat_proto_put(npt);
+ return 0;
+ }
+
+ /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
+ if (npt->nfattr_to_range(tb, range) > 0)
+ range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+
+ ip_nat_proto_put(npt);
+
+ DEBUGP("leaving\n");
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_parse_nat(struct nfattr *cda[],
+ const struct ip_conntrack *ct, struct ip_nat_range *range)
+{
+ struct nfattr *tb[CTA_NAT_MAX];
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ memset(range, 0, sizeof(*range));
+
+ if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
+ goto nfattr_failure;
+
+ if (tb[CTA_NAT_MINIP-1])
+ range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
+
+ if (!tb[CTA_NAT_MAXIP-1])
+ range->max_ip = range->min_ip;
+ else
+ range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
+
+ if (range->min_ip)
+ range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+ if (!tb[CTA_NAT_PROTO-1])
+ return 0;
+
+ err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+ if (err < 0)
+ return err;
+
+ DEBUGP("leaving\n");
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#endif
+
+static inline int
+ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
+{
+ struct nfattr *tb[CTA_HELP_MAX];
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_HELP_NAME-1])
+ return -EINVAL;
+
+ *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack *ct;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (cda[CTA_TUPLE_ORIG-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+ else if (cda[CTA_TUPLE_REPLY-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+ else {
+ /* Flush the whole table */
+ ip_conntrack_flush();
+ return 0;
+ }
+
+ if (err < 0)
+ return err;
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ DEBUGP("tuple not found in conntrack hash\n");
+ return -ENOENT;
+ }
+
+ ct = tuplehash_to_ctrack(h);
+
+ if (cda[CTA_ID-1]) {
+ u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
+ if (ct->id != id) {
+ ip_conntrack_put(ct);
+ return -ENOENT;
+ }
+ }
+ if (del_timer(&ct->timeout)) {
+ ip_conntrack_put(ct);
+ ct->timeout.function((unsigned long)ct);
+ return 0;
+ }
+ ip_conntrack_put(ct);
+ DEBUGP("leaving\n");
+
+ return 0;
+}
+
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack *ct;
+ struct sk_buff *skb2 = NULL;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct nfgenmsg *msg = NLMSG_DATA(nlh);
+ u32 rlen;
+
+ if (msg->nfgen_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+ IPCTNL_MSG_CT_GET_CTRZERO) {
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+ ctnetlink_dump_table_w,
+ ctnetlink_done)) != 0)
+ return -EINVAL;
+#else
+ return -ENOTSUPP;
+#endif
+ } else {
+ if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+ ctnetlink_dump_table,
+ ctnetlink_done)) != 0)
+ return -EINVAL;
+ }
+
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ skb_pull(skb, rlen);
+ return 0;
+ }
+
+ if (cda[CTA_TUPLE_ORIG-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+ else if (cda[CTA_TUPLE_REPLY-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+ else
+ return -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ DEBUGP("tuple not found in conntrack hash");
+ return -ENOENT;
+ }
+ DEBUGP("tuple found\n");
+ ct = tuplehash_to_ctrack(h);
+
+ err = -ENOMEM;
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb2) {
+ ip_conntrack_put(ct);
+ return -ENOMEM;
+ }
+ NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW, 1, ct);
+ ip_conntrack_put(ct);
+ if (err <= 0)
+ goto out;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ DEBUGP("leaving\n");
+ return 0;
+
+out:
+ if (skb2)
+ kfree_skb(skb2);
+ return -1;
+}
+
+static inline int
+ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ unsigned long d;
+ unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]));
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EINVAL;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EINVAL;
+
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EINVAL;
+
+ if (cda[CTA_NAT-1]) {
+#ifndef CONFIG_IP_NF_NAT_NEEDED
+ return -EINVAL;
+#else
+ unsigned int hooknum;
+ struct ip_nat_range range;
+
+ if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+ return -EINVAL;
+
+ DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n",
+ NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+ htons(range.min.all), htons(range.max.all));
+
+ /* This is tricky but it works. ip_nat_setup_info needs the
+ * hook number as parameter, so let's do the correct
+ * conversion and run away */
+ if (status & IPS_SRC_NAT_DONE)
+ hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+ else if (status & IPS_DST_NAT_DONE)
+ hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */
+ else
+ return -EINVAL; /* Missing NAT flags */
+
+ DEBUGP("NAT status: %lu\n",
+ status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+
+ if (ip_nat_initialized(ct, hooknum))
+ return -EEXIST;
+ ip_nat_setup_info(ct, &range, hooknum);
+
+ DEBUGP("NAT status after setup_info: %lu\n",
+ ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+#endif
+ }
+
+ /* Be careful here, modifying NAT bits can screw up things,
+ * so don't let users modify them directly if they don't pass
+ * ip_nat_range. */
+ ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+ return 0;
+}
+
+
+static inline int
+ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ struct ip_conntrack_helper *helper;
+ char *helpname;
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ /* don't change helper of sibling connections */
+ if (ct->master)
+ return -EINVAL;
+
+ err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
+ if (err < 0)
+ return err;
+
+ helper = __ip_conntrack_helper_find_byname(helpname);
+ if (!helper) {
+ if (!strcmp(helpname, ""))
+ helper = NULL;
+ else
+ return -EINVAL;
+ }
+
+ if (ct->helper) {
+ if (!helper) {
+ /* we had a helper before ... */
+ ip_ct_remove_expectations(ct);
+ ct->helper = NULL;
+ } else {
+ /* need to zero data of old helper */
+ memset(&ct->help, 0, sizeof(ct->help));
+ }
+ }
+
+ ct->helper = helper;
+
+ return 0;
+}
+
+static inline int
+ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+ if (!del_timer(&ct->timeout))
+ return -ETIME;
+
+ ct->timeout.expires = jiffies + timeout * HZ;
+ add_timer(&ct->timeout);
+
+ return 0;
+}
+
+static inline int
+ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
+ struct ip_conntrack_protocol *proto;
+ u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ int err = 0;
+
+ if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ proto = ip_conntrack_proto_find_get(npt);
+ if (!proto)
+ return -EINVAL;
+
+ if (proto->from_nfattr)
+ err = proto->from_nfattr(tb, ct);
+ ip_conntrack_proto_put(proto);
+
+ return err;
+
+nfattr_failure:
+ return -ENOMEM;
+}
+
+static int
+ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (cda[CTA_HELP-1]) {
+ err = ctnetlink_change_helper(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TIMEOUT-1]) {
+ err = ctnetlink_change_timeout(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_STATUS-1]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_PROTOINFO-1]) {
+ err = ctnetlink_change_protoinfo(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ DEBUGP("all done\n");
+ return 0;
+}
+
+static int
+ctnetlink_create_conntrack(struct nfattr *cda[],
+ struct ip_conntrack_tuple *otuple,
+ struct ip_conntrack_tuple *rtuple)
+{
+ struct ip_conntrack *ct;
+ int err = -EINVAL;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ ct = ip_conntrack_alloc(otuple, rtuple);
+ if (ct == NULL || IS_ERR(ct))
+ return -ENOMEM;
+
+ if (!cda[CTA_TIMEOUT-1])
+ goto err;
+ ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+ ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+ ct->status |= IPS_CONFIRMED;
+
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ goto err;
+
+ if (cda[CTA_PROTOINFO-1]) {
+ err = ctnetlink_change_protoinfo(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ ct->helper = ip_conntrack_helper_find_get(rtuple);
+
+ add_timer(&ct->timeout);
+ ip_conntrack_hash_insert(ct);
+
+ if (ct->helper)
+ ip_conntrack_helper_put(ct->helper);
+
+ DEBUGP("conntrack with id %u inserted\n", ct->id);
+ return 0;
+
+err:
+ ip_conntrack_free(ct);
+ return err;
+}
+
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple otuple, rtuple;
+ struct ip_conntrack_tuple_hash *h = NULL;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (cda[CTA_TUPLE_ORIG-1]) {
+ err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TUPLE_REPLY-1]) {
+ err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
+ if (err < 0)
+ return err;
+ }
+
+ write_lock_bh(&ip_conntrack_lock);
+ if (cda[CTA_TUPLE_ORIG-1])
+ h = __ip_conntrack_find(&otuple, NULL);
+ else if (cda[CTA_TUPLE_REPLY-1])
+ h = __ip_conntrack_find(&rtuple, NULL);
+
+ if (h == NULL) {
+ write_unlock_bh(&ip_conntrack_lock);
+ DEBUGP("no such conntrack, create new\n");
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE)
+ err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+ return err;
+ }
+ /* implicit 'else' */
+
+ /* we only allow nat config for new conntracks */
+ if (cda[CTA_NAT-1]) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* We manipulate the conntrack inside the global conntrack table lock,
+ * so there's no need to increase the refcount */
+ DEBUGP("conntrack found\n");
+ err = -EEXIST;
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
+
+out_unlock:
+ write_unlock_bh(&ip_conntrack_lock);
+ return err;
+}
+
+/***********************************************************************
+ * EXPECT
+ ***********************************************************************/
+
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple,
+ enum ctattr_expect type)
+{
+ struct nfattr *nest_parms = NFA_NEST(skb, type);
+
+ if (ctnetlink_dump_tuples(skb, tuple) < 0)
+ goto nfattr_failure;
+
+ NFA_NEST_END(skb, nest_parms);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+ const struct ip_conntrack_expect *exp)
+{
+ struct ip_conntrack *master = exp->master;
+ u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
+ u_int32_t id = htonl(exp->id);
+
+ if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+ goto nfattr_failure;
+ if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
+ goto nfattr_failure;
+ if (ctnetlink_exp_dump_tuple(skb,
+ &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ CTA_EXPECT_MASTER) < 0)
+ goto nfattr_failure;
+
+ NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
+ NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event,
+ int nowait,
+ const struct ip_conntrack_expect *exp)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned char *b;
+
+ b = skb->tail;
+
+ event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static int ctnetlink_expect_event(struct notifier_block *this,
+ unsigned long events, void *ptr)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
+ struct sk_buff *skb;
+ unsigned int type;
+ unsigned char *b;
+ int flags = 0;
+ u16 proto;
+
+ if (events & IPEXP_NEW) {
+ type = IPCTNL_MSG_EXP_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ } else
+ return NOTIFY_DONE;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ return NOTIFY_DONE;
+
+ b = skb->tail;
+
+ type |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = flags;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ proto = exp->tuple.dst.protonum;
+ nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
+ return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+ kfree_skb(skb);
+ return NOTIFY_DONE;
+}
+#endif
+
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack_expect *exp = NULL;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[0];
+
+ DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+ list_for_each_prev(i, &ip_conntrack_expect_list) {
+ exp = (struct ip_conntrack_expect *) i;
+ if (exp->id <= *id)
+ continue;
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ 1, exp) < 0)
+ goto out;
+ *id = exp->id;
+ }
+out:
+ read_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last id=%llu\n", *id);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_expect *exp;
+ struct sk_buff *skb2;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct nfgenmsg *msg = NLMSG_DATA(nlh);
+ u32 rlen;
+
+ if (msg->nfgen_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+ ctnetlink_exp_dump_table,
+ ctnetlink_done)) != 0)
+ return -EINVAL;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ skb_pull(skb, rlen);
+ return 0;
+ }
+
+ if (cda[CTA_EXPECT_MASTER-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
+ else
+ return -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ exp = ip_conntrack_expect_find(&tuple);
+ if (!exp)
+ return -ENOENT;
+
+ err = -ENOMEM;
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ goto out;
+ NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+ err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
+ nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+ 1, exp);
+ if (err <= 0)
+ goto out;
+
+ ip_conntrack_expect_put(exp);
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (err < 0)
+ goto free;
+
+ return err;
+
+out:
+ ip_conntrack_expect_put(exp);
+free:
+ if (skb2)
+ kfree_skb(skb2);
+ return err;
+}
+
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_expect *exp, *tmp;
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_helper *h;
+ int err;
+
+ if (cda[CTA_EXPECT_TUPLE-1]) {
+ /* delete a single expect by tuple */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+
+ /* bump usage count to 2 */
+ exp = ip_conntrack_expect_find(&tuple);
+ if (!exp)
+ return -ENOENT;
+
+ if (cda[CTA_EXPECT_ID-1]) {
+ u_int32_t id =
+ *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+ if (exp->id != ntohl(id)) {
+ ip_conntrack_expect_put(exp);
+ return -ENOENT;
+ }
+ }
+
+ /* after list removal, usage count == 1 */
+ ip_conntrack_unexpect_related(exp);
+ /* have to put what we 'get' above.
+ * after this line usage count == 0 */
+ ip_conntrack_expect_put(exp);
+ } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+ char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+
+ /* delete all expectations for this helper */
+ write_lock_bh(&ip_conntrack_lock);
+ h = __ip_conntrack_helper_find_byname(name);
+ if (!h) {
+ write_unlock_bh(&ip_conntrack_lock);
+ return -EINVAL;
+ }
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+ list) {
+ if (exp->master->helper == h
+ && del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
+ }
+ }
+ write_unlock(&ip_conntrack_lock);
+ } else {
+ /* This basically means we have to flush everything*/
+ write_lock_bh(&ip_conntrack_lock);
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+ list) {
+ if (del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
+ }
+ }
+ write_unlock_bh(&ip_conntrack_lock);
+ }
+
+ return 0;
+}
+static int
+ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
+{
+ return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_create_expect(struct nfattr *cda[])
+{
+ struct ip_conntrack_tuple tuple, mask, master_tuple;
+ struct ip_conntrack_tuple_hash *h = NULL;
+ struct ip_conntrack_expect *exp;
+ struct ip_conntrack *ct;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ /* caller guarantees that those three CTA_EXPECT_* exist */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
+ if (err < 0)
+ return err;
+
+ /* Look for master conntrack of this expectation */
+ h = ip_conntrack_find_get(&master_tuple, NULL);
+ if (!h)
+ return -ENOENT;
+ ct = tuplehash_to_ctrack(h);
+
+ if (!ct->helper) {
+ /* such conntrack hasn't got any helper, abort */
+ err = -EINVAL;
+ goto out;
+ }
+
+ exp = ip_conntrack_expect_alloc(ct);
+ if (!exp) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ exp->expectfn = NULL;
+ exp->flags = 0;
+ exp->master = ct;
+ memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
+ memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
+
+ err = ip_conntrack_expect_related(exp);
+ ip_conntrack_expect_put(exp);
+
+out:
+ ip_conntrack_put(tuplehash_to_ctrack(h));
+ return err;
+}
+
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_expect *exp;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (!cda[CTA_EXPECT_TUPLE-1]
+ || !cda[CTA_EXPECT_MASK-1]
+ || !cda[CTA_EXPECT_MASTER-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+
+ write_lock_bh(&ip_conntrack_lock);
+ exp = __ip_conntrack_expect_find(&tuple);
+
+ if (!exp) {
+ write_unlock_bh(&ip_conntrack_lock);
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE)
+ err = ctnetlink_create_expect(cda);
+ return err;
+ }
+
+ err = -EEXIST;
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ err = ctnetlink_change_expect(exp, cda);
+ write_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving\n");
+
+ return err;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static struct notifier_block ctnl_notifier = {
+ .notifier_call = ctnetlink_conntrack_event,
+};
+
+static struct notifier_block ctnl_notifier_exp = {
+ .notifier_call = ctnetlink_expect_event,
+};
+#endif
+
+static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+ [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+ [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem ctnl_subsys = {
+ .name = "conntrack",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK,
+ .cb_count = IPCTNL_MSG_MAX,
+ .cb = ctnl_cb,
+};
+
+static struct nfnetlink_subsystem ctnl_exp_subsys = {
+ .name = "conntrack_expect",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
+ .cb_count = IPCTNL_MSG_EXP_MAX,
+ .cb = ctnl_exp_cb,
+};
+
+static int __init ctnetlink_init(void)
+{
+ int ret;
+
+ printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+ ret = nfnetlink_subsys_register(&ctnl_subsys);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+
+ ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+ goto err_unreg_subsys;
+ }
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ ret = ip_conntrack_register_notifier(&ctnl_notifier);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register notifier.\n");
+ goto err_unreg_exp_subsys;
+ }
+
+ ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot expect register notifier.\n");
+ goto err_unreg_notifier;
+ }
+#endif
+
+ return 0;
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+ ip_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+ return ret;
+}
+
+static void __exit ctnetlink_exit(void)
+{
+ printk("ctnetlink: unregistering from nfnetlink.\n");
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ ip_conntrack_unregister_notifier(&ctnl_notifier_exp);
+ ip_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+ return;
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
new file mode 100644
index 000000000000..744abb9d377a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -0,0 +1,328 @@
+/*
+ * ip_conntrack_proto_gre.c - Version 3.0
+ *
+ * Connection tracking protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/list.h>
+
+static DEFINE_RWLOCK(ip_ct_gre_lock);
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
+
+#include <linux/netfilter_ipv4/listhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE");
+
+/* shamelessly stolen from ip_conntrack_proto_udp.c */
+#define GRE_TIMEOUT (30*HZ)
+#define GRE_STREAM_TIMEOUT (180*HZ)
+
+#if 0
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
+#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \
+ NIPQUAD((x)->src.ip), ntohs((x)->src.u.gre.key), \
+ NIPQUAD((x)->dst.ip), ntohs((x)->dst.u.gre.key))
+#else
+#define DEBUGP(x, args...)
+#define DUMP_TUPLE_GRE(x)
+#endif
+
+/* GRE KEYMAP HANDLING FUNCTIONS */
+static LIST_HEAD(gre_keymap_list);
+
+static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
+ const struct ip_conntrack_tuple *t)
+{
+ return ((km->tuple.src.ip == t->src.ip) &&
+ (km->tuple.dst.ip == t->dst.ip) &&
+ (km->tuple.dst.protonum == t->dst.protonum) &&
+ (km->tuple.dst.u.all == t->dst.u.all));
+}
+
+/* look up the source key for a given tuple */
+static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t)
+{
+ struct ip_ct_gre_keymap *km;
+ u_int32_t key = 0;
+
+ read_lock_bh(&ip_ct_gre_lock);
+ km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
+ struct ip_ct_gre_keymap *, t);
+ if (km)
+ key = km->tuple.src.u.gre.key;
+ read_unlock_bh(&ip_ct_gre_lock);
+
+ DEBUGP("lookup src key 0x%x up key for ", key);
+ DUMP_TUPLE_GRE(t);
+
+ return key;
+}
+
+/* add a single keymap entry, associate with specified master ct */
+int
+ip_ct_gre_keymap_add(struct ip_conntrack *ct,
+ struct ip_conntrack_tuple *t, int reply)
+{
+ struct ip_ct_gre_keymap **exist_km, *km, *old;
+
+ if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
+ DEBUGP("refusing to add GRE keymap to non-pptp session\n");
+ return -1;
+ }
+
+ if (!reply)
+ exist_km = &ct->help.ct_pptp_info.keymap_orig;
+ else
+ exist_km = &ct->help.ct_pptp_info.keymap_reply;
+
+ if (*exist_km) {
+ /* check whether it's a retransmission */
+ old = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
+ struct ip_ct_gre_keymap *, t);
+ if (old == *exist_km) {
+ DEBUGP("retransmission\n");
+ return 0;
+ }
+
+ DEBUGP("trying to override keymap_%s for ct %p\n",
+ reply? "reply":"orig", ct);
+ return -EEXIST;
+ }
+
+ km = kmalloc(sizeof(*km), GFP_ATOMIC);
+ if (!km)
+ return -ENOMEM;
+
+ memcpy(&km->tuple, t, sizeof(*t));
+ *exist_km = km;
+
+ DEBUGP("adding new entry %p: ", km);
+ DUMP_TUPLE_GRE(&km->tuple);
+
+ write_lock_bh(&ip_ct_gre_lock);
+ list_append(&gre_keymap_list, km);
+ write_unlock_bh(&ip_ct_gre_lock);
+
+ return 0;
+}
+
+/* destroy the keymap entries associated with specified master ct */
+void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct)
+{
+ DEBUGP("entering for ct %p\n", ct);
+
+ if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
+ DEBUGP("refusing to destroy GRE keymap to non-pptp session\n");
+ return;
+ }
+
+ write_lock_bh(&ip_ct_gre_lock);
+ if (ct->help.ct_pptp_info.keymap_orig) {
+ DEBUGP("removing %p from list\n",
+ ct->help.ct_pptp_info.keymap_orig);
+ list_del(&ct->help.ct_pptp_info.keymap_orig->list);
+ kfree(ct->help.ct_pptp_info.keymap_orig);
+ ct->help.ct_pptp_info.keymap_orig = NULL;
+ }
+ if (ct->help.ct_pptp_info.keymap_reply) {
+ DEBUGP("removing %p from list\n",
+ ct->help.ct_pptp_info.keymap_reply);
+ list_del(&ct->help.ct_pptp_info.keymap_reply->list);
+ kfree(ct->help.ct_pptp_info.keymap_reply);
+ ct->help.ct_pptp_info.keymap_reply = NULL;
+ }
+ write_unlock_bh(&ip_ct_gre_lock);
+}
+
+
+/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
+
+/* invert gre part of tuple */
+static int gre_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->dst.u.gre.key = orig->src.u.gre.key;
+ tuple->src.u.gre.key = orig->dst.u.gre.key;
+
+ return 1;
+}
+
+/* gre hdr info to tuple */
+static int gre_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct gre_hdr_pptp _pgrehdr, *pgrehdr;
+ u_int32_t srckey;
+ struct gre_hdr _grehdr, *grehdr;
+
+ /* first only delinearize old RFC1701 GRE header */
+ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
+ if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+ /* try to behave like "ip_conntrack_proto_generic" */
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+ return 1;
+ }
+
+ /* PPTP header is variable length, only need up to the call_id field */
+ pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
+ if (!pgrehdr)
+ return 1;
+
+ if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
+ DEBUGP("GRE_VERSION_PPTP but unknown proto\n");
+ return 0;
+ }
+
+ tuple->dst.u.gre.key = pgrehdr->call_id;
+ srckey = gre_keymap_lookup(tuple);
+ tuple->src.u.gre.key = srckey;
+
+ return 1;
+}
+
+/* print gre part of tuple */
+static int gre_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "srckey=0x%x dstkey=0x%x ",
+ ntohs(tuple->src.u.gre.key),
+ ntohs(tuple->dst.u.gre.key));
+}
+
+/* print private data for conntrack */
+static int gre_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *ct)
+{
+ return seq_printf(s, "timeout=%u, stream_timeout=%u ",
+ (ct->proto.gre.timeout / HZ),
+ (ct->proto.gre.stream_timeout / HZ));
+}
+
+/* Returns verdict for packet, and may modify conntrack */
+static int gre_packet(struct ip_conntrack *ct,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info conntrackinfo)
+{
+ /* If we've seen traffic both ways, this is a GRE connection.
+ * Extend timeout. */
+ if (ct->status & IPS_SEEN_REPLY) {
+ ip_ct_refresh_acct(ct, conntrackinfo, skb,
+ ct->proto.gre.stream_timeout);
+ /* Also, more likely to be important, and not a probe. */
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
+ } else
+ ip_ct_refresh_acct(ct, conntrackinfo, skb,
+ ct->proto.gre.timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int gre_new(struct ip_conntrack *ct,
+ const struct sk_buff *skb)
+{
+ DEBUGP(": ");
+ DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+
+ /* initialize to sane value. Ideally a conntrack helper
+ * (e.g. in case of pptp) is increasing them */
+ ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT;
+ ct->proto.gre.timeout = GRE_TIMEOUT;
+
+ return 1;
+}
+
+/* Called when a conntrack entry has already been removed from the hashes
+ * and is about to be deleted from memory */
+static void gre_destroy(struct ip_conntrack *ct)
+{
+ struct ip_conntrack *master = ct->master;
+ DEBUGP(" entering\n");
+
+ if (!master)
+ DEBUGP("no master !?!\n");
+ else
+ ip_ct_gre_keymap_destroy(master);
+}
+
+/* protocol helper struct */
+static struct ip_conntrack_protocol gre = {
+ .proto = IPPROTO_GRE,
+ .name = "gre",
+ .pkt_to_tuple = gre_pkt_to_tuple,
+ .invert_tuple = gre_invert_tuple,
+ .print_tuple = gre_print_tuple,
+ .print_conntrack = gre_print_conntrack,
+ .packet = gre_packet,
+ .new = gre_new,
+ .destroy = gre_destroy,
+ .me = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
+};
+
+/* ip_conntrack_proto_gre initialization */
+int __init ip_ct_proto_gre_init(void)
+{
+ return ip_conntrack_protocol_register(&gre);
+}
+
+void __exit ip_ct_proto_gre_fini(void)
+{
+ struct list_head *pos, *n;
+
+ /* delete all keymap entries */
+ write_lock_bh(&ip_ct_gre_lock);
+ list_for_each_safe(pos, n, &gre_keymap_list) {
+ DEBUGP("deleting keymap %p at module unload time\n", pos);
+ list_del(pos);
+ kfree(pos);
+ }
+ write_unlock_bh(&ip_ct_gre_lock);
+
+ ip_conntrack_protocol_unregister(&gre);
+}
+
+EXPORT_SYMBOL(ip_ct_gre_keymap_add);
+EXPORT_SYMBOL(ip_ct_gre_keymap_destroy);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db3252..98f0015dd255 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
ct->timeout.function((unsigned long)ct);
} else {
atomic_inc(&ct->proto.icmp.count);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
}
return NF_ACCEPT;
}
+static u_int8_t valid_new[] = {
+ [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1
+};
+
/* Called when a new connection for this protocol found. */
static int icmp_new(struct ip_conntrack *conntrack,
const struct sk_buff *skb)
{
- static u_int8_t valid_new[]
- = { [ICMP_ECHO] = 1,
- [ICMP_TIMESTAMP] = 1,
- [ICMP_INFO_REQUEST] = 1,
- [ICMP_ADDRESS] = 1 };
-
if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
|| !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
/* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
return NF_ACCEPT;
}
- innerproto = ip_ct_find_proto(inside->ip.protocol);
+ innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
/* Are they talking about one of our connections? */
if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+ ip_conntrack_proto_put(innerproto);
return NF_ACCEPT;
}
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
been preserved inside the ICMP. */
if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
DEBUGP("icmp_error_track: Can't invert tuple\n");
+ ip_conntrack_proto_put(innerproto);
return NF_ACCEPT;
}
+ ip_conntrack_proto_put(innerproto);
*ctinfo = IP_CT_RELATED;
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
if (icmph == NULL) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: short packet ");
return -NF_ACCEPT;
}
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
if (!(u16)csum_fold(skb->csum))
break;
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad HW ICMP checksum ");
return -NF_ACCEPT;
case CHECKSUM_NONE:
if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad ICMP checksum ");
return -NF_ACCEPT;
}
@@ -249,7 +254,7 @@ checksum_skipped:
*/
if (icmph->type > NR_ICMP_TYPES) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: invalid ICMP type ");
return -NF_ACCEPT;
}
@@ -265,6 +270,46 @@ checksum_skipped:
return icmp_error_message(skb, ctinfo, hooknum);
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *t)
+{
+ NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+ &t->src.u.icmp.id);
+ NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+ &t->dst.u.icmp.type);
+ NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+ &t->dst.u.icmp.code);
+
+ if (t->dst.u.icmp.type >= sizeof(valid_new)
+ || !valid_new[t->dst.u.icmp.type])
+ return -EINVAL;
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+ struct ip_conntrack_tuple *tuple)
+{
+ if (!tb[CTA_PROTO_ICMP_TYPE-1]
+ || !tb[CTA_PROTO_ICMP_CODE-1])
+ return -1;
+
+ tuple->dst.u.icmp.type =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+ tuple->dst.u.icmp.code =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+ tuple->src.u.icmp.id =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+
+ return 0;
+}
+#endif
+
struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
{
.proto = IPPROTO_ICMP,
@@ -276,4 +321,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
.packet = icmp_packet,
.new = icmp_new,
.error = icmp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = icmp_tuple_to_nfattr,
+ .nfattr_to_tuple = icmp_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d75390bf12..59a4a0111dd3 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack,
}
conntrack->proto.sctp.state = newconntrack;
+ if (oldsctpstate != newconntrack)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
write_unlock_bh(&sctp_lock);
}
@@ -414,6 +416,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
DEBUGP("Setting assured bit\n");
set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
}
return NF_ACCEPT;
@@ -503,7 +506,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
.packet = sctp_packet,
.new = sctp_new,
.destroy = NULL,
- .me = THIS_MODULE
+ .me = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed766d4..d6701cafbcc2 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -336,6 +336,50 @@ static int tcp_print_conntrack(struct seq_file *s,
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+ const struct ip_conntrack *ct)
+{
+ struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+
+ read_lock_bh(&tcp_lock);
+ NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+ &ct->proto.tcp.state);
+ read_unlock_bh(&tcp_lock);
+
+ NFA_NEST_END(skb, nest_parms);
+
+ return 0;
+
+nfattr_failure:
+ read_unlock_bh(&tcp_lock);
+ return -1;
+}
+
+static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
+{
+ struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
+ struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
+
+ if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_PROTOINFO_TCP_STATE-1])
+ return -EINVAL;
+
+ write_lock_bh(&tcp_lock);
+ ct->proto.tcp.state =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
+ write_unlock_bh(&tcp_lock);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#endif
+
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
{
if (tcph->rst) return TCP_RST_SET;
@@ -699,7 +743,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
res = 1;
} else {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -798,7 +842,7 @@ static int tcp_error(struct sk_buff *skb,
sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: short packet ");
return -NF_ACCEPT;
}
@@ -806,7 +850,7 @@ static int tcp_error(struct sk_buff *skb,
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -823,7 +867,7 @@ static int tcp_error(struct sk_buff *skb,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
@@ -832,7 +876,7 @@ static int tcp_error(struct sk_buff *skb,
tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -880,8 +924,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
*/
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- "ip_ct_tcp: killing out of sync session ");
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ NULL, "ip_ct_tcp: "
+ "killing out of sync session ");
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
@@ -895,7 +940,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid packet ignored ");
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
@@ -905,7 +950,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
old_state);
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_SYN_SENT:
@@ -926,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- "ip_ct_tcp: invalid SYN");
+ NULL, "ip_ct_tcp: invalid SYN");
return -NF_ACCEPT;
}
case TCP_CONNTRACK_CLOSE:
@@ -973,6 +1018,10 @@ static int tcp_packet(struct ip_conntrack *conntrack,
? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
write_unlock_bh(&tcp_lock);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ if (new_state != old_state)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
/* If only reply is a RST, we can consider ourselves not to
have an established connection: this is a fairly common
@@ -991,7 +1040,8 @@ static int tcp_packet(struct ip_conntrack *conntrack,
/* Set ASSURED if we see see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
}
ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
@@ -1096,4 +1146,11 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
.packet = tcp_packet,
.new = tcp_new,
.error = tcp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .to_nfattr = tcp_to_nfattr,
+ .from_nfattr = nfattr_to_tcp,
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba098d4..f2dcac7c7660 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
ip_ct_refresh_acct(conntrack, ctinfo, skb,
ip_ct_udp_timeout_stream);
/* Also, more likely to be important, and not a probe */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
} else
ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: short packet ");
return -NF_ACCEPT;
}
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, udplen, 0))) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
.packet = udp_packet,
.new = udp_new,
.error = udp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 61798c46e91d..dd476b191f4b 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
*/
/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (DIRECTION(hash))
return 0;
- proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
IP_NF_ASSERT(proto);
if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
return -ENOSPC;
#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (seq_printf(s, "mark=%lu ", conntrack->mark))
+ if (seq_printf(s, "mark=%u ", conntrack->mark))
return -ENOSPC;
#endif
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
print_tuple(s, &expect->tuple,
- ip_ct_find_proto(expect->tuple.dst.protonum));
+ __ip_conntrack_proto_find(expect->tuple.dst.protonum));
return seq_putc(s, '\n');
}
@@ -889,6 +888,7 @@ static int init_or_cleanup(int init)
return ret;
cleanup:
+ synchronize_net();
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(ip_ct_sysctl_header);
cleanup_localinops:
@@ -971,6 +971,14 @@ void need_ip_conntrack(void)
{
}
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
EXPORT_SYMBOL(ip_conntrack_protocol_register);
EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -981,13 +989,17 @@ EXPORT_SYMBOL(need_ip_conntrack);
EXPORT_SYMBOL(ip_conntrack_helper_register);
EXPORT_SYMBOL(ip_conntrack_helper_unregister);
EXPORT_SYMBOL(ip_ct_iterate_cleanup);
-EXPORT_SYMBOL(ip_ct_refresh_acct);
-EXPORT_SYMBOL(ip_ct_protos);
-EXPORT_SYMBOL(ip_ct_find_proto);
+EXPORT_SYMBOL(__ip_ct_refresh_acct);
+
EXPORT_SYMBOL(ip_conntrack_expect_alloc);
EXPORT_SYMBOL(ip_conntrack_expect_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_find);
EXPORT_SYMBOL(ip_conntrack_expect_related);
EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
+EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
+
EXPORT_SYMBOL(ip_conntrack_tuple_taken);
EXPORT_SYMBOL(ip_ct_gather_frags);
EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
EXPORT_SYMBOL(ip_conntrack_hash);
EXPORT_SYMBOL(ip_conntrack_untracked);
EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_put);
#ifdef CONFIG_IP_NF_NAT_NEEDED
EXPORT_SYMBOL(ip_conntrack_tcp_update);
#endif
+
+EXPORT_SYMBOL_GPL(ip_conntrack_flush);
+EXPORT_SYMBOL_GPL(__ip_conntrack_find);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
+EXPORT_SYMBOL_GPL(ip_conntrack_free);
+EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
+
+EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index f8ff170f390a..a78736b8525d 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper");
MODULE_LICENSE("GPL");
#define MAX_PORTS 8
-static int ports[MAX_PORTS];
+static short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, int, &ports_c, 0400);
+module_param_array(ports, short, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of tftp servers");
#if 0
@@ -75,6 +75,7 @@ static int tftp_help(struct sk_buff **pskb,
exp->mask.dst.u.udp.port = 0xffff;
exp->mask.dst.protonum = 0xff;
exp->expectfn = NULL;
+ exp->flags = 0;
DEBUGP("expect: ");
DUMP_TUPLE(&exp->tuple);
@@ -99,7 +100,7 @@ static int tftp_help(struct sk_buff **pskb,
}
static struct ip_conntrack_helper tftp[MAX_PORTS];
-static char tftp_names[MAX_PORTS][10];
+static char tftp_names[MAX_PORTS][sizeof("tftp-65535")];
static void fini(void)
{
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 739b6dde1c82..c5e3abd24672 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -47,8 +47,41 @@ DEFINE_RWLOCK(ip_nat_lock);
static unsigned int ip_nat_htable_size;
static struct list_head *bysource;
+
+#define MAX_IP_NAT_PROTO 256
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+static inline struct ip_nat_protocol *
+__ip_nat_proto_find(u_int8_t protonum)
+{
+ return ip_nat_protos[protonum];
+}
+
+struct ip_nat_protocol *
+ip_nat_proto_find_get(u_int8_t protonum)
+{
+ struct ip_nat_protocol *p;
+
+ /* we need to disable preemption to make sure 'p' doesn't get
+ * removed until we've grabbed the reference */
+ preempt_disable();
+ p = __ip_nat_proto_find(protonum);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_nat_unknown_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
+
+void
+ip_nat_proto_put(struct ip_nat_protocol *p)
+{
+ module_put(p->me);
+}
+EXPORT_SYMBOL_GPL(ip_nat_proto_put);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
@@ -80,6 +113,7 @@ ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
oldcheck^0xFFFF));
}
+EXPORT_SYMBOL(ip_nat_cheat_check);
/* Is this tuple already taken? (not by us) */
int
@@ -96,6 +130,7 @@ ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
invert_tuplepr(&reply, tuple);
return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
+EXPORT_SYMBOL(ip_nat_used_tuple);
/* If we source map this tuple so reply looks like reply_tuple, will
* that meet the constraints of range. */
@@ -103,7 +138,8 @@ static int
in_range(const struct ip_conntrack_tuple *tuple,
const struct ip_nat_range *range)
{
- struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+ struct ip_nat_protocol *proto =
+ __ip_nat_proto_find(tuple->dst.protonum);
/* If we are supposed to map IPs, then we must be in the
range specified, otherwise let this drag us onto a new src IP. */
@@ -216,8 +252,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
struct ip_conntrack *conntrack,
enum ip_nat_manip_type maniptype)
{
- struct ip_nat_protocol *proto
- = ip_nat_find_proto(orig_tuple->dst.protonum);
+ struct ip_nat_protocol *proto;
/* 1) If this srcip/proto/src-proto-part is currently mapped,
and that same mapping gives a unique tuple within the given
@@ -242,14 +277,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
/* 3) The per-protocol part of the manip is made to map into
the range to make a unique tuple. */
+ proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
+
/* Only bother mapping if it's not already in range and unique */
if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
|| proto->in_range(tuple, maniptype, &range->min, &range->max))
- && !ip_nat_used_tuple(tuple, conntrack))
+ && !ip_nat_used_tuple(tuple, conntrack)) {
+ ip_nat_proto_put(proto);
return;
+ }
/* Last change: get protocol to try to obtain unique tuple. */
proto->unique_tuple(tuple, range, maniptype, conntrack);
+
+ ip_nat_proto_put(proto);
}
unsigned int
@@ -310,6 +351,7 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
return NF_ACCEPT;
}
+EXPORT_SYMBOL(ip_nat_setup_info);
/* Returns true if succeeded. */
static int
@@ -320,17 +362,20 @@ manip_pkt(u_int16_t proto,
enum ip_nat_manip_type maniptype)
{
struct iphdr *iph;
+ struct ip_nat_protocol *p;
- (*pskb)->nfcache |= NFC_ALTERED;
- if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+ if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
return 0;
iph = (void *)(*pskb)->data + iphdroff;
/* Manipulate protcol part. */
- if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
- target, maniptype))
+ p = ip_nat_proto_find_get(proto);
+ if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
+ ip_nat_proto_put(p);
return 0;
+ }
+ ip_nat_proto_put(p);
iph = (void *)(*pskb)->data + iphdroff;
@@ -347,10 +392,10 @@ manip_pkt(u_int16_t proto,
}
/* Do packet manipulations according to ip_nat_setup_info. */
-unsigned int nat_packet(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- struct sk_buff **pskb)
+unsigned int ip_nat_packet(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ struct sk_buff **pskb)
{
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned long statusbit;
@@ -377,12 +422,13 @@ unsigned int nat_packet(struct ip_conntrack *ct,
}
return NF_ACCEPT;
}
+EXPORT_SYMBOL_GPL(ip_nat_packet);
/* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int icmp_reply_translation(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_nat_manip_type manip,
- enum ip_conntrack_dir dir)
+int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_nat_manip_type manip,
+ enum ip_conntrack_dir dir)
{
struct {
struct icmphdr icmp;
@@ -391,7 +437,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
struct ip_conntrack_tuple inner, target;
int hdrlen = (*pskb)->nh.iph->ihl * 4;
- if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+ if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
return 0;
inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +472,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
sizeof(struct icmphdr) + inside->ip.ihl*4,
- &inner, ip_ct_find_proto(inside->ip.protocol)))
+ &inner,
+ __ip_conntrack_proto_find(inside->ip.protocol)))
return 0;
/* Change inner back to look like incoming packet. We do the
@@ -468,6 +515,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
return 1;
}
+EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);
/* Protocol registration. */
int ip_nat_protocol_register(struct ip_nat_protocol *proto)
@@ -484,6 +532,7 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
write_unlock_bh(&ip_nat_lock);
return ret;
}
+EXPORT_SYMBOL(ip_nat_protocol_register);
/* Noone stores the protocol anywhere; simply delete it. */
void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
@@ -495,8 +544,54 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
/* Someone could be still looking at the proto in a bh. */
synchronize_net();
}
+EXPORT_SYMBOL(ip_nat_protocol_unregister);
+
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+int
+ip_nat_port_range_to_nfattr(struct sk_buff *skb,
+ const struct ip_nat_range *range)
+{
+ NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
+ &range->min.tcp.port);
+ NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
+ &range->max.tcp.port);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int
+ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
+{
+ int ret = 0;
+
+ /* we have to return whether we actually parsed something or not */
+
+ if (tb[CTA_PROTONAT_PORT_MIN-1]) {
+ ret = 1;
+ range->min.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
+ }
+
+ if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
+ if (ret)
+ range->max.tcp.port = range->min.tcp.port;
+ } else {
+ ret = 1;
+ range->max.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
+EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
+#endif
-int __init ip_nat_init(void)
+static int __init ip_nat_init(void)
{
size_t i;
@@ -538,10 +633,14 @@ static int clean_nat(struct ip_conntrack *i, void *data)
return 0;
}
-/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
-void ip_nat_cleanup(void)
+static void __exit ip_nat_cleanup(void)
{
ip_ct_iterate_cleanup(&clean_nat, NULL);
ip_conntrack_destroyed = NULL;
vfree(bysource);
}
+
+MODULE_LICENSE("GPL");
+
+module_init(ip_nat_init);
+module_exit(ip_nat_cleanup);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 158f34f32c04..5d506e0564d5 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
struct tcphdr *tcph;
int datalen;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return 0;
if (rep_len > match_len
@@ -199,6 +199,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
}
return 1;
}
+EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
/* Generic function for mangling variable-length address changes inside
* NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
@@ -228,7 +229,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
match_offset + match_len)
return 0;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return 0;
if (rep_len > match_len
@@ -256,6 +257,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
return 1;
}
+EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
/* Adjust one found SACK option including checksum correction */
static void
@@ -315,7 +317,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
- if (!skb_ip_make_writable(pskb, optend))
+ if (!skb_make_writable(pskb, optend))
return 0;
dir = CTINFO2DIR(ctinfo);
@@ -363,7 +365,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
this_way = &ct->nat.info.seq[dir];
other_way = &ct->nat.info.seq[!dir];
- if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
return 0;
tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -399,6 +401,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
return 1;
}
+EXPORT_SYMBOL(ip_nat_seq_adjust);
/* Setup NAT on this expected conntrack so it follows master. */
/* If we fail to get a free NAT slot, we'll get dropped on confirm */
@@ -425,3 +428,4 @@ void ip_nat_follow_master(struct ip_conntrack *ct,
/* hook doesn't matter, but it has to do destination manip */
ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
}
+EXPORT_SYMBOL(ip_nat_follow_master);
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
new file mode 100644
index 000000000000..3cdd0684d30d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -0,0 +1,401 @@
+/*
+ * ip_nat_pptp.c - Version 3.0
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * (needs netfilter tuple reservation)
+ *
+ * Changes:
+ * 2002-02-10 - Version 1.3
+ * - Use ip_nat_mangle_tcp_packet() because of cloned skb's
+ * in local connections (Philip Craig <philipc@snapgear.com>)
+ * - add checks for magicCookie and pptp version
+ * - make argument list of pptp_{out,in}bound_packet() shorter
+ * - move to C99 style initializers
+ * - print version number at module loadtime
+ * 2003-09-22 - Version 1.5
+ * - use SNATed tcp sourceport as callid, since we get called before
+ * TCP header is mangled (Philip Craig <philipc@snapgear.com>)
+ * 2004-10-22 - Version 2.0
+ * - kernel 2.6.x version
+ * 2005-06-10 - Version 3.0
+ * - kernel >= 2.6.11 version,
+ * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_pptp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
+
+#define IP_NAT_PPTP_VERSION "3.0"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+
+
+#if 0
+extern const char *pptp_msg_name[];
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
+ __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static void pptp_nat_expected(struct ip_conntrack *ct,
+ struct ip_conntrack_expect *exp)
+{
+ struct ip_conntrack *master = ct->master;
+ struct ip_conntrack_expect *other_exp;
+ struct ip_conntrack_tuple t;
+ struct ip_ct_pptp_master *ct_pptp_info;
+ struct ip_nat_pptp *nat_pptp_info;
+
+ ct_pptp_info = &master->help.ct_pptp_info;
+ nat_pptp_info = &master->nat.help.nat_pptp_info;
+
+ /* And here goes the grand finale of corrosion... */
+
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ DEBUGP("we are PNS->PAC\n");
+ /* therefore, build tuple for PAC->PNS */
+ t.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
+ t.src.u.gre.key = htons(master->help.ct_pptp_info.pac_call_id);
+ t.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
+ t.dst.u.gre.key = htons(master->help.ct_pptp_info.pns_call_id);
+ t.dst.protonum = IPPROTO_GRE;
+ } else {
+ DEBUGP("we are PAC->PNS\n");
+ /* build tuple for PNS->PAC */
+ t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
+ t.src.u.gre.key =
+ htons(master->nat.help.nat_pptp_info.pns_call_id);
+ t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
+ t.dst.u.gre.key =
+ htons(master->nat.help.nat_pptp_info.pac_call_id);
+ t.dst.protonum = IPPROTO_GRE;
+ }
+
+ DEBUGP("trying to unexpect other dir: ");
+ DUMP_TUPLE(&t);
+ other_exp = ip_conntrack_expect_find(&t);
+ if (other_exp) {
+ ip_conntrack_unexpect_related(other_exp);
+ ip_conntrack_expect_put(other_exp);
+ DEBUGP("success\n");
+ } else {
+ DEBUGP("not found!\n");
+ }
+
+ ip_nat_follow_master(ct, exp);
+}
+
+/* outbound packets == from PNS to PAC */
+static int
+pptp_outbound_pkt(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+
+{
+ struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
+ struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
+
+ u_int16_t msg, *cid = NULL, new_callid;
+
+ new_callid = htons(ct_pptp_info->pns_call_id);
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REQUEST:
+ cid = &pptpReq->ocreq.callID;
+ /* FIXME: ideally we would want to reserve a call ID
+ * here. current netfilter NAT core is not able to do
+ * this :( For now we use TCP source port. This breaks
+ * multiple calls within one control session */
+
+ /* save original call ID in nat_info */
+ nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+ /* don't use tcph->source since we are at a DSTmanip
+ * hook (e.g. PREROUTING) and pkt is not mangled yet */
+ new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ /* save new call ID in ct info */
+ ct_pptp_info->pns_call_id = ntohs(new_callid);
+ break;
+ case PPTP_IN_CALL_REPLY:
+ cid = &pptpReq->icreq.callID;
+ break;
+ case PPTP_CALL_CLEAR_REQUEST:
+ cid = &pptpReq->clrreq.callID;
+ break;
+ default:
+ DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
+ (msg <= PPTP_MSG_MAX)?
+ pptp_msg_name[msg]:pptp_msg_name[0]);
+ /* fall through */
+
+ case PPTP_SET_LINK_INFO:
+ /* only need to NAT in case PAC is behind NAT box */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+ * down to here */
+
+ IP_NF_ASSERT(cid);
+
+ DEBUGP("altering call id from 0x%04x to 0x%04x\n",
+ ntohs(*cid), ntohs(new_callid));
+
+ /* mangle packet */
+ if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+ (void *)cid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)),
+ sizeof(new_callid),
+ (char *)&new_callid,
+ sizeof(new_callid)) == 0)
+ return NF_DROP;
+
+ return NF_ACCEPT;
+}
+
+static int
+pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
+ struct ip_conntrack_expect *expect_reply)
+{
+ struct ip_ct_pptp_master *ct_pptp_info =
+ &expect_orig->master->help.ct_pptp_info;
+ struct ip_nat_pptp *nat_pptp_info =
+ &expect_orig->master->nat.help.nat_pptp_info;
+
+ struct ip_conntrack *ct = expect_orig->master;
+
+ struct ip_conntrack_tuple inv_t;
+ struct ip_conntrack_tuple *orig_t, *reply_t;
+
+ /* save original PAC call ID in nat_info */
+ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+ /* alter expectation */
+ orig_t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ reply_t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ /* alter expectation for PNS->PAC direction */
+ invert_tuplepr(&inv_t, &expect_orig->tuple);
+ expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id);
+ expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
+ expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+ inv_t.src.ip = reply_t->src.ip;
+ inv_t.dst.ip = reply_t->dst.ip;
+ inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
+ inv_t.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+
+ if (!ip_conntrack_expect_related(expect_orig)) {
+ DEBUGP("successfully registered expect\n");
+ } else {
+ DEBUGP("can't expect_related(expect_orig)\n");
+ return 1;
+ }
+
+ /* alter expectation for PAC->PNS direction */
+ invert_tuplepr(&inv_t, &expect_reply->tuple);
+ expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id);
+ expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
+ expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+ inv_t.src.ip = orig_t->src.ip;
+ inv_t.dst.ip = orig_t->dst.ip;
+ inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
+ inv_t.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+
+ if (!ip_conntrack_expect_related(expect_reply)) {
+ DEBUGP("successfully registered expect\n");
+ } else {
+ DEBUGP("can't expect_related(expect_reply)\n");
+ ip_conntrack_unexpect_related(expect_orig);
+ return 1;
+ }
+
+ if (ip_ct_gre_keymap_add(ct, &expect_reply->tuple, 0) < 0) {
+ DEBUGP("can't register original keymap\n");
+ ip_conntrack_unexpect_related(expect_orig);
+ ip_conntrack_unexpect_related(expect_reply);
+ return 1;
+ }
+
+ if (ip_ct_gre_keymap_add(ct, &inv_t, 1) < 0) {
+ DEBUGP("can't register reply keymap\n");
+ ip_conntrack_unexpect_related(expect_orig);
+ ip_conntrack_unexpect_related(expect_reply);
+ ip_ct_gre_keymap_destroy(ct);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* inbound packets == from PAC to PNS */
+static int
+pptp_inbound_pkt(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+{
+ struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
+ u_int16_t msg, new_cid = 0, new_pcid, *pcid = NULL, *cid = NULL;
+
+ int ret = NF_ACCEPT, rv;
+
+ new_pcid = htons(nat_pptp_info->pns_call_id);
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REPLY:
+ pcid = &pptpReq->ocack.peersCallID;
+ cid = &pptpReq->ocack.callID;
+ break;
+ case PPTP_IN_CALL_CONNECT:
+ pcid = &pptpReq->iccon.peersCallID;
+ break;
+ case PPTP_IN_CALL_REQUEST:
+ /* only need to nat in case PAC is behind NAT box */
+ break;
+ case PPTP_WAN_ERROR_NOTIFY:
+ pcid = &pptpReq->wanerr.peersCallID;
+ break;
+ case PPTP_CALL_DISCONNECT_NOTIFY:
+ pcid = &pptpReq->disc.callID;
+ break;
+ case PPTP_SET_LINK_INFO:
+ pcid = &pptpReq->setlink.peersCallID;
+ break;
+
+ default:
+ DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)?
+ pptp_msg_name[msg]:pptp_msg_name[0]);
+ /* fall through */
+
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
+ * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
+
+ /* mangle packet */
+ IP_NF_ASSERT(pcid);
+ DEBUGP("altering peer call id from 0x%04x to 0x%04x\n",
+ ntohs(*pcid), ntohs(new_pcid));
+
+ rv = ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+ (void *)pcid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)),
+ sizeof(new_pcid), (char *)&new_pcid,
+ sizeof(new_pcid));
+ if (rv != NF_ACCEPT)
+ return rv;
+
+ if (new_cid) {
+ IP_NF_ASSERT(cid);
+ DEBUGP("altering call id from 0x%04x to 0x%04x\n",
+ ntohs(*cid), ntohs(new_cid));
+ rv = ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+ (void *)cid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)),
+ sizeof(new_cid),
+ (char *)&new_cid,
+ sizeof(new_cid));
+ if (rv != NF_ACCEPT)
+ return rv;
+ }
+
+ /* check for earlier return value of 'switch' above */
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ /* great, at least we don't need to resize packets */
+ return NF_ACCEPT;
+}
+
+
+extern int __init ip_nat_proto_gre_init(void);
+extern void __exit ip_nat_proto_gre_fini(void);
+
+static int __init init(void)
+{
+ int ret;
+
+ DEBUGP("%s: registering NAT helper\n", __FILE__);
+
+ ret = ip_nat_proto_gre_init();
+ if (ret < 0)
+ return ret;
+
+ BUG_ON(ip_nat_pptp_hook_outbound);
+ ip_nat_pptp_hook_outbound = &pptp_outbound_pkt;
+
+ BUG_ON(ip_nat_pptp_hook_inbound);
+ ip_nat_pptp_hook_inbound = &pptp_inbound_pkt;
+
+ BUG_ON(ip_nat_pptp_hook_exp_gre);
+ ip_nat_pptp_hook_exp_gre = &pptp_exp_gre;
+
+ BUG_ON(ip_nat_pptp_hook_expectfn);
+ ip_nat_pptp_hook_expectfn = &pptp_nat_expected;
+
+ printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION);
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ DEBUGP("cleanup_module\n" );
+
+ ip_nat_pptp_hook_expectfn = NULL;
+ ip_nat_pptp_hook_exp_gre = NULL;
+ ip_nat_pptp_hook_inbound = NULL;
+ ip_nat_pptp_hook_outbound = NULL;
+
+ ip_nat_proto_gre_fini();
+ /* Make sure noone calls it, meanwhile */
+ synchronize_net();
+
+ printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
new file mode 100644
index 000000000000..7c1285401672
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -0,0 +1,214 @@
+/*
+ * ip_nat_proto_gre.c - Version 2.0
+ *
+ * NAT protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
+
+#if 0
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
+ __FUNCTION__, ## args)
+#else
+#define DEBUGP(x, args...)
+#endif
+
+/* is key in given range between min and max */
+static int
+gre_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int32_t key;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ key = tuple->src.u.gre.key;
+ else
+ key = tuple->dst.u.gre.key;
+
+ return ntohl(key) >= ntohl(min->gre.key)
+ && ntohl(key) <= ntohl(max->gre.key);
+}
+
+/* generate unique tuple ... */
+static int
+gre_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t key;
+ u_int16_t *keyptr;
+ unsigned int min, i, range_size;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ keyptr = &tuple->src.u.gre.key;
+ else
+ keyptr = &tuple->dst.u.gre.key;
+
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ DEBUGP("%p: NATing GRE PPTP\n", conntrack);
+ min = 1;
+ range_size = 0xffff;
+ } else {
+ min = ntohl(range->min.gre.key);
+ range_size = ntohl(range->max.gre.key) - min + 1;
+ }
+
+ DEBUGP("min = %u, range_size = %u\n", min, range_size);
+
+ for (i = 0; i < range_size; i++, key++) {
+ *keyptr = htonl(min + key % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return 1;
+ }
+
+ DEBUGP("%p: no NAT mapping\n", conntrack);
+
+ return 0;
+}
+
+/* manipulate a GRE packet according to maniptype */
+static int
+gre_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ struct gre_hdr *greh;
+ struct gre_hdr_pptp *pgreh;
+ struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ unsigned int hdroff = iphdroff + iph->ihl*4;
+
+ /* pgreh includes two optional 32bit fields which are not required
+ * to be there. That's where the magic '8' comes from */
+ if (!skb_make_writable(pskb, hdroff + sizeof(*pgreh)-8))
+ return 0;
+
+ greh = (void *)(*pskb)->data + hdroff;
+ pgreh = (struct gre_hdr_pptp *) greh;
+
+ /* we only have destination manip of a packet, since 'source key'
+ * is not present in the packet itself */
+ if (maniptype == IP_NAT_MANIP_DST) {
+ /* key manipulation is always dest */
+ switch (greh->version) {
+ case 0:
+ if (!greh->key) {
+ DEBUGP("can't nat GRE w/o key\n");
+ break;
+ }
+ if (greh->csum) {
+ /* FIXME: Never tested this code... */
+ *(gre_csum(greh)) =
+ ip_nat_cheat_check(~*(gre_key(greh)),
+ tuple->dst.u.gre.key,
+ *(gre_csum(greh)));
+ }
+ *(gre_key(greh)) = tuple->dst.u.gre.key;
+ break;
+ case GRE_VERSION_PPTP:
+ DEBUGP("call_id -> 0x%04x\n",
+ ntohl(tuple->dst.u.gre.key));
+ pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key));
+ break;
+ default:
+ DEBUGP("can't nat unknown GRE version\n");
+ return 0;
+ break;
+ }
+ }
+ return 1;
+}
+
+/* print out a nat tuple */
+static unsigned int
+gre_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.gre.key)
+ len += sprintf(buffer + len, "srckey=0x%x ",
+ ntohl(match->src.u.gre.key));
+
+ if (mask->dst.u.gre.key)
+ len += sprintf(buffer + len, "dstkey=0x%x ",
+ ntohl(match->src.u.gre.key));
+
+ return len;
+}
+
+/* print a range of keys */
+static unsigned int
+gre_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.gre.key != 0
+ || range->max.gre.key != 0xFFFF) {
+ if (range->min.gre.key == range->max.gre.key)
+ return sprintf(buffer, "key 0x%x ",
+ ntohl(range->min.gre.key));
+ else
+ return sprintf(buffer, "keys 0x%u-0x%u ",
+ ntohl(range->min.gre.key),
+ ntohl(range->max.gre.key));
+ } else
+ return 0;
+}
+
+/* nat helper struct */
+static struct ip_nat_protocol gre = {
+ .name = "GRE",
+ .protonum = IPPROTO_GRE,
+ .manip_pkt = gre_manip_pkt,
+ .in_range = gre_in_range,
+ .unique_tuple = gre_unique_tuple,
+ .print = gre_print,
+ .print_range = gre_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
+};
+
+int __init ip_nat_proto_gre_init(void)
+{
+ return ip_nat_protocol_register(&gre);
+}
+
+void __exit ip_nat_proto_gre_fini(void)
+{
+ ip_nat_protocol_unregister(&gre);
+}
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 6596c9ee1655..938719043999 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
struct icmphdr *hdr;
unsigned int hdroff = iphdroff + iph->ihl*4;
- if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0;
hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_icmp
-= { "ICMP", IPPROTO_ICMP,
- icmp_manip_pkt,
- icmp_in_range,
- icmp_unique_tuple,
- icmp_print,
- icmp_print_range
+struct ip_nat_protocol ip_nat_protocol_icmp = {
+ .name = "ICMP",
+ .protonum = IPPROTO_ICMP,
+ .me = THIS_MODULE,
+ .manip_pkt = icmp_manip_pkt,
+ .in_range = icmp_in_range,
+ .unique_tuple = icmp_unique_tuple,
+ .print = icmp_print,
+ .print_range = icmp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a98e36d2b3c6..1d381bf68574 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/if.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_rule.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
hdrsize = sizeof(struct tcphdr);
- if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+ if (!skb_make_writable(pskb, hdroff + hdrsize))
return 0;
iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_tcp
-= { "TCP", IPPROTO_TCP,
- tcp_manip_pkt,
- tcp_in_range,
- tcp_unique_tuple,
- tcp_print,
- tcp_print_range
+struct ip_nat_protocol ip_nat_protocol_tcp = {
+ .name = "TCP",
+ .protonum = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ .manip_pkt = tcp_manip_pkt,
+ .in_range = tcp_in_range,
+ .unique_tuple = tcp_unique_tuple,
+ .print = tcp_print,
+ .print_range = tcp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index 9f66e5625664..c4906e1aa24a 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
u32 oldip, newip;
u16 *portptr, newport;
- if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0;
iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_udp
-= { "UDP", IPPROTO_UDP,
- udp_manip_pkt,
- udp_in_range,
- udp_unique_tuple,
- udp_print,
- udp_print_range
+struct ip_nat_protocol ip_nat_protocol_udp = {
+ .name = "UDP",
+ .protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .manip_pkt = udp_manip_pkt,
+ .in_range = udp_in_range,
+ .unique_tuple = udp_unique_tuple,
+ .print = udp_print,
+ .print_range = udp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd58d16..99bbef56f84e 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
}
struct ip_nat_protocol ip_nat_unknown_protocol = {
- "unknown", 0,
- unknown_manip_pkt,
- unknown_in_range,
- unknown_unique_tuple,
- unknown_print,
- unknown_print_range
+ .name = "unknown",
+ .me = THIS_MODULE,
+ .manip_pkt = unknown_manip_pkt,
+ .in_range = unknown_in_range,
+ .unique_tuple = unknown_unique_tuple,
+ .print = unknown_print,
+ .print_range = unknown_print_range
};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 60d70fa41a15..cb66b8bddeb3 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -255,6 +255,27 @@ alloc_null_binding(struct ip_conntrack *conntrack,
return ip_nat_setup_info(conntrack, &range, hooknum);
}
+unsigned int
+alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info,
+ unsigned int hooknum)
+{
+ u_int32_t ip
+ = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+ ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
+ : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+ u_int16_t all
+ = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+ ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all
+ : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all);
+ struct ip_nat_range range
+ = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
+
+ DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
+ conntrack, NIPQUAD(ip));
+ return ip_nat_setup_info(conntrack, &range, hooknum);
+}
+
int ip_nat_rule_find(struct sk_buff **pskb,
unsigned int hooknum,
const struct net_device *in,
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e635ae..93b2c5111bb2 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
return NF_DROP;
}
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return NF_DROP;
spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index bc59d0d6e89e..30cd4e18c129 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
& htons(IP_MF|IP_OFFSET)));
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* If we had a hardware checksum before, it's now invalid */
if ((*pskb)->ip_summed == CHECKSUM_HW)
if (skb_checksum_help(*pskb, (out == NULL)))
@@ -102,12 +100,16 @@ ip_nat_fn(unsigned int hooknum,
return NF_ACCEPT;
}
+ /* Don't try to NAT if this packet is not conntracked */
+ if (ct == &ip_conntrack_untracked)
+ return NF_ACCEPT;
+
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED+IP_CT_IS_REPLY:
if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
- if (!icmp_reply_translation(pskb, ct, maniptype,
- CTINFO2DIR(ctinfo)))
+ if (!ip_nat_icmp_reply_translation(pskb, ct, maniptype,
+ CTINFO2DIR(ctinfo)))
return NF_DROP;
else
return NF_ACCEPT;
@@ -121,8 +123,12 @@ ip_nat_fn(unsigned int hooknum,
if (!ip_nat_initialized(ct, maniptype)) {
unsigned int ret;
- /* LOCAL_IN hook doesn't have a chain! */
- if (hooknum == NF_IP_LOCAL_IN)
+ if (unlikely(is_confirmed(ct)))
+ /* NAT module was loaded late */
+ ret = alloc_null_binding_confirmed(ct, info,
+ hooknum);
+ else if (hooknum == NF_IP_LOCAL_IN)
+ /* LOCAL_IN hook doesn't have a chain! */
ret = alloc_null_binding(ct, info, hooknum);
else
ret = ip_nat_rule_find(pskb, hooknum,
@@ -146,7 +152,7 @@ ip_nat_fn(unsigned int hooknum,
}
IP_NF_ASSERT(info);
- return nat_packet(ct, ctinfo, hooknum, pskb);
+ return ip_nat_packet(ct, ctinfo, hooknum, pskb);
}
static unsigned int
@@ -319,15 +325,10 @@ static int init_or_cleanup(int init)
printk("ip_nat_init: can't setup rules.\n");
goto cleanup_nothing;
}
- ret = ip_nat_init();
- if (ret < 0) {
- printk("ip_nat_init: can't setup rules.\n");
- goto cleanup_rule_init;
- }
ret = nf_register_hook(&ip_nat_in_ops);
if (ret < 0) {
printk("ip_nat_init: can't register in hook.\n");
- goto cleanup_nat;
+ goto cleanup_rule_init;
}
ret = nf_register_hook(&ip_nat_out_ops);
if (ret < 0) {
@@ -368,8 +369,6 @@ static int init_or_cleanup(int init)
nf_unregister_hook(&ip_nat_out_ops);
cleanup_inops:
nf_unregister_hook(&ip_nat_in_ops);
- cleanup_nat:
- ip_nat_cleanup();
cleanup_rule_init:
ip_nat_rule_cleanup();
cleanup_nothing:
@@ -389,12 +388,4 @@ static void __exit fini(void)
module_init(init);
module_exit(fini);
-EXPORT_SYMBOL(ip_nat_setup_info);
-EXPORT_SYMBOL(ip_nat_protocol_register);
-EXPORT_SYMBOL(ip_nat_protocol_unregister);
-EXPORT_SYMBOL(ip_nat_cheat_check);
-EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
-EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
-EXPORT_SYMBOL(ip_nat_used_tuple);
-EXPORT_SYMBOL(ip_nat_follow_master);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba431a4..36339eb39e17 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-struct ipq_rt_info {
- __u8 tos;
- __u32 daddr;
- __u32 saddr;
-};
-
struct ipq_queue_entry {
struct list_head list;
struct nf_info *info;
struct sk_buff *skb;
- struct ipq_rt_info rt_info;
};
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -214,6 +207,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
break;
case IPQ_COPY_PACKET:
+ if (entry->skb->ip_summed == CHECKSUM_HW &&
+ (*errp = skb_checksum_help(entry->skb,
+ entry->info->outdev == NULL))) {
+ read_unlock_bh(&queue_lock);
+ return NULL;
+ }
if (copy_range == 0 || copy_range > entry->skb->len)
data_len = entry->skb->len;
else
@@ -241,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
pmsg->packet_id = (unsigned long )entry;
pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
- pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
+ pmsg->timestamp_sec = entry->skb->tstamp.off_sec;
+ pmsg->timestamp_usec = entry->skb->tstamp.off_usec;
pmsg->mark = entry->skb->nfmark;
pmsg->hook = entry->info->hook;
pmsg->hw_protocol = entry->skb->protocol;
@@ -281,7 +280,8 @@ nlmsg_failure:
}
static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+ unsigned int queuenum, void *data)
{
int status = -EINVAL;
struct sk_buff *nskb;
@@ -299,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
entry->info = info;
entry->skb = skb;
- if (entry->info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = skb->nh.iph;
-
- entry->rt_info.tos = iph->tos;
- entry->rt_info.daddr = iph->daddr;
- entry->rt_info.saddr = iph->saddr;
- }
-
nskb = ipq_build_packet_message(entry, &status);
if (nskb == NULL)
goto err_out_free;
@@ -382,23 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
}
skb_put(e->skb, diff);
}
- if (!skb_ip_make_writable(&e->skb, v->data_len))
+ if (!skb_make_writable(&e->skb, v->data_len))
return -ENOMEM;
memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->nfcache |= NFC_ALTERED;
-
- /*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
- if (e->info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = e->skb->nh.iph;
-
- if (!(iph->tos == e->rt_info.tos
- && iph->daddr == e->rt_info.daddr
- && iph->saddr == e->rt_info.saddr))
- return ip_route_me_harder(&e->skb);
- }
+ e->skb->ip_summed = CHECKSUM_NONE;
+
return 0;
}
@@ -676,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
}
#endif /* CONFIG_PROC_FS */
+static struct nf_queue_handler nfqh = {
+ .name = "ip_queue",
+ .outfn = &ipq_enqueue_packet,
+};
+
static int
init_or_cleanup(int init)
{
@@ -686,7 +671,8 @@ init_or_cleanup(int init)
goto cleanup;
netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+ ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
+ THIS_MODULE);
if (ipqnl == NULL) {
printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
@@ -703,7 +689,7 @@ init_or_cleanup(int init)
register_netdevice_notifier(&ipq_dev_notifier);
ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
- status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+ status = nf_register_queue_handler(PF_INET, &nfqh);
if (status < 0) {
printk(KERN_ERR "ip_queue: failed to register queue handler\n");
goto cleanup_sysctl;
@@ -711,7 +697,7 @@ init_or_cleanup(int init)
return status;
cleanup:
- nf_unregister_queue_handler(PF_INET);
+ nf_unregister_queue_handlers(&nfqh);
synchronize_net();
ipq_flush(NF_DROP);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c88dfcd38c56..75c27e92f6ab 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -27,6 +27,7 @@
#include <asm/semaphore.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
+#include <linux/cpumask.h>
#include <linux/netfilter_ipv4/ip_tables.h>
@@ -312,7 +313,6 @@ ipt_do_table(struct sk_buff **pskb,
do {
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
- (*pskb)->nfcache |= e->nfcache;
if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
struct ipt_entry_target *t;
@@ -341,8 +341,8 @@ ipt_do_table(struct sk_buff **pskb,
back->comefrom);
continue;
}
- if (table_base + v
- != (void *)e + e->next_offset) {
+ if (table_base + v != (void *)e + e->next_offset
+ && !(e->ip.flags & IPT_F_GOTO)) {
/* Save old back ptr in next entry */
struct ipt_entry *next
= (void *)e + e->next_offset;
@@ -922,8 +922,10 @@ translate_table(const char *name,
}
/* And one copy for every other CPU */
- for (i = 1; i < num_possible_cpus(); i++) {
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ for_each_cpu(i) {
+ if (i == 0)
+ continue;
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
newinfo->entries,
SMP_ALIGN(newinfo->size));
}
@@ -944,7 +946,7 @@ replace_table(struct ipt_table *table,
struct ipt_entry *table_base;
unsigned int i;
- for (i = 0; i < num_possible_cpus(); i++) {
+ for_each_cpu(i) {
table_base =
(void *)newinfo->entries
+ TABLE_OFFSET(newinfo, i);
@@ -991,7 +993,7 @@ get_counters(const struct ipt_table_info *t,
unsigned int cpu;
unsigned int i;
- for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
i = 0;
IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
t->size,
@@ -1129,7 +1131,8 @@ do_replace(void __user *user, unsigned int len)
return -ENOMEM;
newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ + SMP_ALIGN(tmp.size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
@@ -1459,7 +1462,8 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
= { 0, 0, 0, { 0 }, { 0 }, { } };
newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(repl->size) * num_possible_cpus());
+ + SMP_ALIGN(repl->size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e23184..dab78d8bd494 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
{
const struct ipt_classify_target_info *clinfo = targinfo;
- if((*pskb)->priority != clinfo->priority) {
+ if((*pskb)->priority != clinfo->priority)
(*pskb)->priority = clinfo->priority;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6706d3a1bc4f..9bcb398fbc1f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -13,6 +13,7 @@
#include <linux/config.h>
#include <linux/proc_fs.h>
#include <linux/jhash.h>
+#include <linux/bitops.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/tcp.h>
@@ -30,7 +31,7 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
-#define CLUSTERIP_VERSION "0.7"
+#define CLUSTERIP_VERSION "0.8"
#define DEBUG_CLUSTERIP
@@ -49,13 +50,14 @@ MODULE_DESCRIPTION("iptables target for CLUSTERIP");
struct clusterip_config {
struct list_head list; /* list of all configs */
atomic_t refcount; /* reference count */
+ atomic_t entries; /* number of entries/rules
+ * referencing us */
u_int32_t clusterip; /* the IP address */
u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
struct net_device *dev; /* device */
u_int16_t num_total_nodes; /* total number of nodes */
- u_int16_t num_local_nodes; /* number of local nodes */
- u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */
+ unsigned long local_nodes; /* node number array */
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde; /* proc dir entry */
@@ -66,8 +68,7 @@ struct clusterip_config {
static LIST_HEAD(clusterip_configs);
-/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
- * data within all structurses (num_local_nodes, local_nodes[]) */
+/* clusterip_lock protects the clusterip_configs list */
static DEFINE_RWLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
@@ -76,23 +77,48 @@ static struct proc_dir_entry *clusterip_procdir;
#endif
static inline void
-clusterip_config_get(struct clusterip_config *c) {
+clusterip_config_get(struct clusterip_config *c)
+{
atomic_inc(&c->refcount);
}
static inline void
-clusterip_config_put(struct clusterip_config *c) {
- if (atomic_dec_and_test(&c->refcount)) {
+clusterip_config_put(struct clusterip_config *c)
+{
+ if (atomic_dec_and_test(&c->refcount))
+ kfree(c);
+}
+
+/* increase the count of entries(rules) using/referencing this config */
+static inline void
+clusterip_config_entry_get(struct clusterip_config *c)
+{
+ atomic_inc(&c->entries);
+}
+
+/* decrease the count of entries using/referencing this config. If last
+ * entry(rule) is removed, remove the config from lists, but don't free it
+ * yet, since proc-files could still be holding references */
+static inline void
+clusterip_config_entry_put(struct clusterip_config *c)
+{
+ if (atomic_dec_and_test(&c->entries)) {
write_lock_bh(&clusterip_lock);
list_del(&c->list);
write_unlock_bh(&clusterip_lock);
+
dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
dev_put(c->dev);
- kfree(c);
+
+ /* In case anyone still accesses the file, the open/close
+ * functions are also incrementing the refcount on their own,
+ * so it's safe to remove the entry even if it's in use. */
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry(c->pde->name, c->pde->parent);
+#endif
}
}
-
static struct clusterip_config *
__clusterip_config_find(u_int32_t clusterip)
{
@@ -111,7 +137,7 @@ __clusterip_config_find(u_int32_t clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(u_int32_t clusterip)
+clusterip_config_find_get(u_int32_t clusterip, int entry)
{
struct clusterip_config *c;
@@ -122,11 +148,24 @@ clusterip_config_find_get(u_int32_t clusterip)
return NULL;
}
atomic_inc(&c->refcount);
+ if (entry)
+ atomic_inc(&c->entries);
read_unlock_bh(&clusterip_lock);
return c;
}
+static void
+clusterip_config_init_nodelist(struct clusterip_config *c,
+ const struct ipt_clusterip_tgt_info *i)
+{
+ int n;
+
+ for (n = 0; n < i->num_local_nodes; n++) {
+ set_bit(i->local_nodes[n] - 1, &c->local_nodes);
+ }
+}
+
static struct clusterip_config *
clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
struct net_device *dev)
@@ -143,11 +182,11 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
c->clusterip = ip;
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
c->num_total_nodes = i->num_total_nodes;
- c->num_local_nodes = i->num_local_nodes;
- memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes));
+ clusterip_config_init_nodelist(c, i);
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
atomic_set(&c->refcount, 1);
+ atomic_set(&c->entries, 1);
#ifdef CONFIG_PROC_FS
/* create proc dir entry */
@@ -171,53 +210,28 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
static int
clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
{
- int i;
-
- write_lock_bh(&clusterip_lock);
- if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
- || nodenum > CLUSTERIP_MAX_NODES) {
- write_unlock_bh(&clusterip_lock);
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
return 1;
- }
-
- /* check if we alrady have this number in our array */
- for (i = 0; i < c->num_local_nodes; i++) {
- if (c->local_nodes[i] == nodenum) {
- write_unlock_bh(&clusterip_lock);
- return 1;
- }
- }
- c->local_nodes[c->num_local_nodes++] = nodenum;
+ /* check if we already have this number in our bitfield */
+ if (test_and_set_bit(nodenum - 1, &c->local_nodes))
+ return 1;
- write_unlock_bh(&clusterip_lock);
return 0;
}
static int
clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
{
- int i;
-
- write_lock_bh(&clusterip_lock);
-
- if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
- write_unlock_bh(&clusterip_lock);
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
return 1;
- }
- for (i = 0; i < c->num_local_nodes; i++) {
- if (c->local_nodes[i] == nodenum) {
- int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
- memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
- c->num_local_nodes--;
- write_unlock_bh(&clusterip_lock);
- return 0;
- }
- }
+ if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
+ return 0;
- write_unlock_bh(&clusterip_lock);
return 1;
}
@@ -285,25 +299,7 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
static inline int
clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
{
- int i;
-
- read_lock_bh(&clusterip_lock);
-
- if (config->num_local_nodes == 0) {
- read_unlock_bh(&clusterip_lock);
- return 0;
- }
-
- for (i = 0; i < config->num_local_nodes; i++) {
- if (config->local_nodes[i] == hash) {
- read_unlock_bh(&clusterip_lock);
- return 1;
- }
- }
-
- read_unlock_bh(&clusterip_lock);
-
- return 0;
+ return test_bit(hash - 1, &config->local_nodes);
}
/***********************************************************************
@@ -367,7 +363,7 @@ target(struct sk_buff **pskb,
#ifdef DEBUG_CLUSTERP
DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
#endif
- DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+ DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
if (!clusterip_responsible(cipinfo->config, hash)) {
DEBUGP("not responsible\n");
return NF_DROP;
@@ -415,8 +411,26 @@ checkentry(const char *tablename,
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(e->ip.dst.s_addr);
- if (!config) {
+ config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+ if (config) {
+ if (cipinfo->config != NULL) {
+ /* Case A: This is an entry that gets reloaded, since
+ * it still has a cipinfo->config pointer. Simply
+ * increase the entry refcount and return */
+ if (cipinfo->config != config) {
+ printk(KERN_ERR "CLUSTERIP: Reloaded entry "
+ "has invalid config pointer!\n");
+ return 0;
+ }
+ clusterip_config_entry_get(cipinfo->config);
+ } else {
+ /* Case B: This is a new rule referring to an existing
+ * clusterip config. */
+ cipinfo->config = config;
+ clusterip_config_entry_get(cipinfo->config);
+ }
+ } else {
+ /* Case C: This is a completely new clusterip config */
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
return 0;
@@ -443,10 +457,9 @@ checkentry(const char *tablename,
}
dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
}
+ cipinfo->config = config;
}
- cipinfo->config = config;
-
return 1;
}
@@ -455,13 +468,10 @@ static void destroy(void *matchinfo, unsigned int matchinfosize)
{
struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
- /* we first remove the proc entry and then drop the reference
- * count. In case anyone still accesses the file, the open/close
- * functions are also incrementing the refcount on their own */
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(cipinfo->config->pde->name,
- cipinfo->config->pde->parent);
-#endif
+ /* if no more entries are referencing the config, remove it
+ * from the list and destroy the proc entry */
+ clusterip_config_entry_put(cipinfo->config);
+
clusterip_config_put(cipinfo->config);
}
@@ -533,7 +543,7 @@ arp_mangle(unsigned int hook,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip);
+ c = clusterip_config_find_get(payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -574,56 +584,69 @@ static struct nf_hook_ops cip_arp_ops = {
#ifdef CONFIG_PROC_FS
+struct clusterip_seq_position {
+ unsigned int pos; /* position */
+ unsigned int weight; /* number of bits set == size */
+ unsigned int bit; /* current bit */
+ unsigned long val; /* current value */
+};
+
static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
{
struct proc_dir_entry *pde = s->private;
struct clusterip_config *c = pde->data;
- unsigned int *nodeidx;
-
- read_lock_bh(&clusterip_lock);
- if (*pos >= c->num_local_nodes)
+ unsigned int weight;
+ u_int32_t local_nodes;
+ struct clusterip_seq_position *idx;
+
+ /* FIXME: possible race */
+ local_nodes = c->local_nodes;
+ weight = hweight32(local_nodes);
+ if (*pos >= weight)
return NULL;
- nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL);
- if (!nodeidx)
+ idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
+ if (!idx)
return ERR_PTR(-ENOMEM);
- *nodeidx = *pos;
- return nodeidx;
+ idx->pos = *pos;
+ idx->weight = weight;
+ idx->bit = ffs(local_nodes);
+ idx->val = local_nodes;
+ clear_bit(idx->bit - 1, &idx->val);
+
+ return idx;
}
static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct proc_dir_entry *pde = s->private;
- struct clusterip_config *c = pde->data;
- unsigned int *nodeidx = (unsigned int *)v;
+ struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v;
- *pos = ++(*nodeidx);
- if (*pos >= c->num_local_nodes) {
+ *pos = ++idx->pos;
+ if (*pos >= idx->weight) {
kfree(v);
return NULL;
}
- return nodeidx;
+ idx->bit = ffs(idx->val);
+ clear_bit(idx->bit - 1, &idx->val);
+ return idx;
}
static void clusterip_seq_stop(struct seq_file *s, void *v)
{
kfree(v);
-
- read_unlock_bh(&clusterip_lock);
}
static int clusterip_seq_show(struct seq_file *s, void *v)
{
- struct proc_dir_entry *pde = s->private;
- struct clusterip_config *c = pde->data;
- unsigned int *nodeidx = (unsigned int *)v;
+ struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v;
- if (*nodeidx != 0)
+ if (idx->pos != 0)
seq_putc(s, ',');
- seq_printf(s, "%u", c->local_nodes[*nodeidx]);
- if (*nodeidx == c->num_local_nodes-1)
+ seq_printf(s, "%u", idx->bit);
+
+ if (idx->pos == idx->weight - 1)
seq_putc(s, '\n');
return 0;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e18eb7..134638021339 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
void *userinfo)
{
const struct ipt_connmark_target_info *markinfo = targinfo;
- unsigned long diff;
- unsigned long nfmark;
- unsigned long newmark;
+ u_int32_t diff;
+ u_int32_t nfmark;
+ u_int32_t newmark;
enum ip_conntrack_info ctinfo;
struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
case IPT_CONNMARK_RESTORE:
nfmark = (*pskb)->nfmark;
diff = (ct->mark ^ nfmark) & markinfo->mask;
- if (diff != 0) {
+ if (diff != 0)
(*pskb)->nfmark = nfmark ^ diff;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
break;
}
}
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
}
}
+ if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509099f9..6e319570a28c 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return NF_DROP;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
sizeof(diffs),
(*pskb)->nh.iph->check
^ 0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ada9911118e9..a1319693f648 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
!= (einfo->ip_ect & IPT_ECN_IP_MASK)) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return 0;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
sizeof(diffs),
(*pskb)->nh.iph->check
^0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return 1;
}
@@ -61,16 +60,20 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
if (!tcph)
return 0;
- if (!(einfo->operation & IPT_ECN_OP_SET_ECE
- || tcph->ece == einfo->proto.tcp.ece)
- && (!(einfo->operation & IPT_ECN_OP_SET_CWR
- || tcph->cwr == einfo->proto.tcp.cwr)))
+ if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
+ tcph->ece == einfo->proto.tcp.ece) &&
+ ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+ tcph->cwr == einfo->proto.tcp.cwr)))
return 1;
- if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
return 0;
tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
+ if ((*pskb)->ip_summed == CHECKSUM_HW &&
+ skb_checksum_help(*pskb, inward))
+ return 0;
+
diffs[0] = ((u_int16_t *)tcph)[6];
if (einfo->operation & IPT_ECN_OP_SET_ECE)
tcph->ece = einfo->proto.tcp.ece;
@@ -79,14 +82,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
diffs[1] = ((u_int16_t *)tcph)[6];
diffs[0] = diffs[0] ^ 0xFFFF;
- if ((*pskb)->ip_summed != CHECKSUM_HW)
+ if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
tcph->check = csum_fold(csum_partial((char *)diffs,
sizeof(diffs),
tcph->check^0xFFFF));
- else
- if (skb_checksum_help(*pskb, inward))
- return 0;
- (*pskb)->nfcache |= NFC_ALTERED;
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733d26da..92ed050fac69 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables syslog logging module");
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
#if 0
#define DEBUGP printk
#else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
static DEFINE_SPINLOCK(log_lock);
/* One level of recursion won't kill us */
-static void dump_packet(const struct ipt_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
const struct sk_buff *skb,
unsigned int iphoff)
{
struct iphdr _iph, *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
if (ntohs(ih->frag_off) & IP_OFFSET)
printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
- if ((info->logflags & IPT_LOG_IPOPT)
+ if ((logflags & IPT_LOG_IPOPT)
&& ih->ihl * 4 > sizeof(struct iphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
printk("SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (info->logflags & IPT_LOG_TCPSEQ)
+ if (logflags & IPT_LOG_TCPSEQ)
printk("SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
/* Max length: 11 "URGP=65535 " */
printk("URGP=%u ", ntohs(th->urg_ptr));
- if ((info->logflags & IPT_LOG_TCPOPT)
+ if ((logflags & IPT_LOG_TCPOPT)
&& th->doff * 4 > sizeof(struct tcphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
}
/* Max length: 15 "UID=4294967295 " */
- if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
+struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 0,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
static void
-ipt_log_packet(unsigned int hooknum,
+ipt_log_packet(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct ipt_log_info *loginfo,
- const char *level_string,
+ const struct nf_loginfo *loginfo,
const char *prefix)
{
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
spin_lock_bh(&log_lock);
- printk(level_string);
- printk("%sIN=%s OUT=%s ",
- prefix == NULL ? loginfo->prefix : prefix,
+ printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
in ? in->name : "",
out ? out->name : "");
#ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
void *userinfo)
{
const struct ipt_log_info *loginfo = targinfo;
- char level_string[4] = "< >";
+ struct nf_loginfo li;
- level_string[1] = '0' + (loginfo->level % 8);
- ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
- return IPT_CONTINUE;
-}
+ nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
-static void
-ipt_logfn(unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *prefix)
-{
- struct ipt_log_info loginfo = {
- .level = 0,
- .logflags = IPT_LOG_MASK,
- .prefix = ""
- };
-
- ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+ return IPT_CONTINUE;
}
static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ipt_log_logger ={
+ .name = "ipt_LOG",
+ .logfn = &ipt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
if (ipt_register_target(&ipt_log_reg))
return -EINVAL;
- if (nflog)
- nf_log_register(PF_INET, &ipt_logfn);
+ if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
+ printk(KERN_WARNING "ipt_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET\n");
+ /* we cannot make module load fail here, since otherwise
+ * iptables userspace would abort */
+ }
return 0;
}
static void __exit fini(void)
{
- if (nflog)
- nf_log_unregister(PF_INET, &ipt_logfn);
+ nf_log_unregister_logger(&ipt_log_logger);
ipt_unregister_target(&ipt_log_reg);
}
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b63b8d..52b4f2c296bf 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
{
const struct ipt_mark_target_info *markinfo = targinfo;
- if((*pskb)->nfmark != markinfo->mark) {
+ if((*pskb)->nfmark != markinfo->mark)
(*pskb)->nfmark = markinfo->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IPT_CONTINUE;
}
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
break;
}
- if((*pskb)->nfmark != mark) {
+ if((*pskb)->nfmark != mark)
(*pskb)->nfmark = mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IPT_CONTINUE;
}
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
unsigned int targinfosize,
unsigned int hook_mask)
{
+ struct ipt_mark_target_info *markinfo = targinfo;
+
if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
return 0;
}
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return 0;
+ }
+
return 1;
}
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
return 0;
}
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 91e74502c3d3..275a174c6fe6 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -86,15 +86,16 @@ masquerade_target(struct sk_buff **pskb,
IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
- /* FIXME: For the moment, don't do local packets, breaks
- testsuite for 2.3.49 --RR */
- if ((*pskb)->sk)
- return NF_ACCEPT;
-
ct = ip_conntrack_get(*pskb, &ctinfo);
IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
|| ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+ /* Source address is 0.0.0.0 - locally generated packet that is
+ * probably not supposed to be masqueraded.
+ */
+ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == 0)
+ return NF_ACCEPT;
+
mr = targinfo;
rt = (struct rtable *)(*pskb)->dst;
newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b29d034..e6e7b6095363 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
return 0;
}
- if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
+ (1 << NF_IP_LOCAL_OUT))) {
DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
return 0;
}
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
struct ip_nat_range newrange;
IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_POST_ROUTING);
+ || hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
ct = ip_conntrack_get(*pskb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
- if (hooknum == NF_IP_PRE_ROUTING)
+ if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
new_ip = (*pskb)->nh.iph->daddr & ~netmask;
else
new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 000000000000..3cedc9be8807
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_NFQ_info *tinfo = targinfo;
+
+ return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
+ printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_NFQ_reg = {
+ .name = "NFQUEUE",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index d2e13447678e..5245bfd33d52 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -88,14 +88,18 @@ redirect_target(struct sk_buff **pskb,
newdst = htonl(0x7F000001);
else {
struct in_device *indev;
+ struct in_ifaddr *ifa;
- /* Device might not have an associated in_device. */
- indev = (struct in_device *)(*pskb)->dev->ip_ptr;
- if (indev == NULL || indev->ifa_list == NULL)
- return NF_DROP;
+ newdst = 0;
+
+ rcu_read_lock();
+ indev = __in_dev_get_rcu((*pskb)->dev);
+ if (indev && (ifa = indev->ifa_list))
+ newdst = ifa->ifa_local;
+ rcu_read_unlock();
- /* Grab first address on interface. */
- newdst = indev->ifa_list->ifa_local;
+ if (!newdst)
+ return NF_DROP;
}
/* Transfer from original range. */
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 915696446020..f057025a719e 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -92,10 +92,7 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
fl.fl_ip_sport = tcph->dest;
fl.fl_ip_dport = tcph->source;
- if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
- dst_release(&rt->u.dst);
- rt = NULL;
- }
+ xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
return rt;
}
@@ -156,7 +153,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
/* This packet will not be the same as the other: clear nf fields */
nf_reset(nskb);
- nskb->nfcache = 0;
nskb->nfmark = 0;
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 1049050b2bfb..8db70d6908c3 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,11 @@ ipt_tcpmss_target(struct sk_buff **pskb,
unsigned int i;
u_int8_t *opt;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ if ((*pskb)->ip_summed == CHECKSUM_HW &&
+ skb_checksum_help(*pskb, out == NULL))
return NF_DROP;
iph = (*pskb)->nh.iph;
@@ -186,10 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
newmss);
retmodified:
- /* We never hw checksum SYN packets. */
- BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
-
- (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d240f8b..deadb36d4428 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return NF_DROP;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
sizeof(diffs),
(*pskb)->nh.iph->check
^0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 000000000000..b9ae6a9382f3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
+/* TTL modification target for IP tables
+ * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TTL.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const void *targinfo, void *userinfo)
+{
+ struct iphdr *iph;
+ const struct ipt_TTL_info *info = targinfo;
+ u_int16_t diffs[2];
+ int new_ttl;
+
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ iph = (*pskb)->nh.iph;
+
+ switch (info->mode) {
+ case IPT_TTL_SET:
+ new_ttl = info->ttl;
+ break;
+ case IPT_TTL_INC:
+ new_ttl = iph->ttl + info->ttl;
+ if (new_ttl > 255)
+ new_ttl = 255;
+ break;
+ case IPT_TTL_DEC:
+ new_ttl = iph->ttl - info->ttl;
+ if (new_ttl < 0)
+ new_ttl = 0;
+ break;
+ default:
+ new_ttl = iph->ttl;
+ break;
+ }
+
+ if (new_ttl != iph->ttl) {
+ diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
+ iph->ttl = new_ttl;
+ diffs[1] = htons(((unsigned)iph->ttl) << 8);
+ iph->check = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ iph->check^0xFFFF));
+ }
+
+ return IPT_CONTINUE;
+}
+
+static int ipt_ttl_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ipt_TTL_info *info = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
+ printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_TTL_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle")) {
+ printk(KERN_WARNING "ipt_TTL: can only be called from "
+ "\"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (info->mode > IPT_TTL_MAXMODE) {
+ printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
+ info->mode);
+ return 0;
+ }
+
+ if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_target ipt_TTL = {
+ .name = "TTL",
+ .target = ipt_ttl_target,
+ .checkentry = ipt_ttl_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_TTL);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_TTL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 52a0076302a7..2883ccd8a91d 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -62,6 +62,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("iptables userspace logging module");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
#define ULOG_NL_EVENT 111 /* Harald's favorite number */
#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
if (ub->qlen > 1)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
- NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
- DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
- ub->qlen, nlgroupnum);
- netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+ NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
+ DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
+ ub->qlen, nlgroupnum + 1);
+ netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
pm = NLMSG_DATA(nlh);
/* We might not have a timestamp, get one */
- if (skb->stamp.tv_sec == 0)
- do_gettimeofday((struct timeval *)&skb->stamp);
+ if (skb->tstamp.off_sec == 0)
+ __net_timestamp((struct sk_buff *)skb);
/* copy hook, prefix, timestamp, payload, etc. */
pm->data_len = copy_len;
- pm->timestamp_sec = skb->stamp.tv_sec;
- pm->timestamp_usec = skb->stamp.tv_usec;
+ pm->timestamp_sec = skb->tstamp.off_sec;
+ pm->timestamp_usec = skb->tstamp.off_usec;
pm->mark = skb->nfmark;
pm->hook = hooknum;
if (prefix != NULL)
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
return IPT_CONTINUE;
}
-static void ipt_logfn(unsigned int hooknum,
+static void ipt_logfn(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
+ const struct nf_loginfo *li,
const char *prefix)
{
- struct ipt_ulog_info loginfo = {
- .nl_group = ULOG_DEFAULT_NLGROUP,
- .copy_range = 0,
- .qthreshold = ULOG_DEFAULT_QTHRESHOLD,
- .prefix = ""
- };
+ struct ipt_ulog_info loginfo;
+
+ if (!li || li->type != NF_LOG_TYPE_ULOG) {
+ loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
+ loginfo.copy_range = 0;
+ loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
+ loginfo.prefix[0] = '\0';
+ } else {
+ loginfo.nl_group = li->u.ulog.group;
+ loginfo.copy_range = li->u.ulog.copy_len;
+ loginfo.qthreshold = li->u.ulog.qthreshold;
+ strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+ }
ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
}
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ipt_ulog_logger = {
+ .name = "ipt_ULOG",
+ .logfn = &ipt_logfn,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
int i;
@@ -372,7 +388,8 @@ static int __init init(void)
ulog_buffers[i].timer.data = i;
}
- nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+ nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
+ THIS_MODULE);
if (!nflognl)
return -ENOMEM;
@@ -381,7 +398,7 @@ static int __init init(void)
return -EINVAL;
}
if (nflog)
- nf_log_register(PF_INET, &ipt_logfn);
+ nf_log_register(PF_INET, &ipt_ulog_logger);
return 0;
}
@@ -394,7 +411,7 @@ static void __exit fini(void)
DEBUGP("ipt_ULOG: cleanup_module\n");
if (nflog)
- nf_log_unregister(PF_INET, &ipt_logfn);
+ nf_log_unregister_logger(&ipt_ulog_logger);
ipt_unregister_target(&ipt_ulog_reg);
sock_release(nflognl->sk_socket);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index f5909a4c3fc7..e19c2a52d00c 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -48,7 +48,7 @@ static int checkentry(const char *tablename, const struct ipt_ip *ip,
unsigned int hook_mask)
{
if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
- printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n.",
+ printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n",
matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
return 0;
}
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 000000000000..df4a42c6da22
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ *
+ * 2004-07-20 Harald Welte <laforge@netfilter.org>
+ * - reimplemented to use per-connection accounting counters
+ * - add functionality to match number of packets
+ * - add functionality to match average packet size
+ * - add support to match directions seperately
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connbytes.h>
+
+#include <asm/div64.h>
+#include <asm/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+ u_int32_t d = divisor;
+
+ if (divisor > 0xffffffffULL) {
+ unsigned int shift = fls(divisor >> 32);
+
+ d = divisor >> shift;
+ dividend >>= shift;
+ }
+
+ do_div(dividend, d);
+ return dividend;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_connbytes_info *sinfo = matchinfo;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack *ct;
+ u_int64_t what = 0; /* initialize to make gcc happy */
+
+ if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+ return 0; /* no match */
+
+ switch (sinfo->what) {
+ case IPT_CONNBYTES_PKTS:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = ct->counters[IP_CT_DIR_REPLY].packets;
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ what += ct->counters[IP_CT_DIR_REPLY].packets;
+ break;
+ }
+ break;
+ case IPT_CONNBYTES_BYTES:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = ct->counters[IP_CT_DIR_REPLY].bytes;
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ what += ct->counters[IP_CT_DIR_REPLY].bytes;
+ break;
+ }
+ break;
+ case IPT_CONNBYTES_AVGPKT:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
+ ct->counters[IP_CT_DIR_ORIGINAL].packets);
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
+ ct->counters[IP_CT_DIR_REPLY].packets);
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ {
+ u_int64_t bytes;
+ u_int64_t pkts;
+ bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
+ ct->counters[IP_CT_DIR_REPLY].bytes;
+ pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
+ ct->counters[IP_CT_DIR_REPLY].packets;
+
+ /* FIXME_THEORETICAL: what to do if sum
+ * overflows ? */
+
+ what = div64_64(bytes, pkts);
+ }
+ break;
+ }
+ break;
+ }
+
+ if (sinfo->count.to)
+ return (what <= sinfo->count.to && what >= sinfo->count.from);
+ else
+ return (what >= sinfo->count.from);
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_connbytes_info *sinfo = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
+ return 0;
+
+ if (sinfo->what != IPT_CONNBYTES_PKTS &&
+ sinfo->what != IPT_CONNBYTES_BYTES &&
+ sinfo->what != IPT_CONNBYTES_AVGPKT)
+ return 0;
+
+ if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
+ sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
+ sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match state_match = {
+ .name = "connbytes",
+ .match = &match,
+ .checkentry = &check,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96cea55..bf8de47ce004 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ struct ipt_connmark_info *cm =
+ (struct ipt_connmark_info *)matchinfo;
if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
return 0;
+ if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+ printk(KERN_WARNING "connmark: only support 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 000000000000..ad3278bba6c1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_dccp.h>
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+ || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+static inline int
+dccp_find_option(u_int8_t option,
+ const struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ int *hotdrop)
+{
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ unsigned char *op;
+ unsigned int optoff = __dccp_hdr_len(dh);
+ unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+ unsigned int i;
+
+ if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ if (!optlen)
+ return 0;
+
+ spin_lock_bh(&dccp_buflock);
+ op = skb_header_pointer(skb,
+ skb->nh.iph->ihl*4 + optoff,
+ optlen, dccp_optbuf);
+ if (op == NULL) {
+ /* If we don't have the whole header, drop packet. */
+ spin_unlock_bh(&dccp_buflock);
+ *hotdrop = 1;
+ return 0;
+ }
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == option) {
+ spin_unlock_bh(&dccp_buflock);
+ return 1;
+ }
+
+ if (op[i] < 2)
+ i++;
+ else
+ i += op[i+1]?:1;
+ }
+
+ spin_unlock_bh(&dccp_buflock);
+ return 0;
+}
+
+
+static inline int
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+ return (typemask & (1 << dh->dccph_type));
+}
+
+static inline int
+match_option(u_int8_t option, const struct sk_buff *skb,
+ const struct dccp_hdr *dh, int *hotdrop)
+{
+ return dccp_find_option(option, skb, dh, hotdrop);
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_dccp_info *info =
+ (const struct ipt_dccp_info *)matchinfo;
+ struct dccp_hdr _dh, *dh;
+
+ if (offset)
+ return 0;
+
+ dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
+ if (dh == NULL) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0])
+ && (ntohs(dh->dccph_sport) <= info->spts[1])),
+ IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
+ && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0])
+ && (ntohs(dh->dccph_dport) <= info->dpts[1])),
+ IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
+ && DCCHECK(match_types(dh, info->typemask),
+ IPT_DCCP_TYPE, info->flags, info->invflags)
+ && DCCHECK(match_option(info->option, skb, dh, hotdrop),
+ IPT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_dccp_info *info;
+
+ info = (const struct ipt_dccp_info *)matchinfo;
+
+ return ip->proto == IPPROTO_DCCP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
+ && !(info->flags & ~IPT_DCCP_VALID_FLAGS)
+ && !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
+ && !(info->invflags & ~info->flags);
+}
+
+static struct ipt_match dccp_match =
+{
+ .name = "dccp",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ /* doff is 8 bits, so the maximum option size is (4*256). Don't put
+ * this in BSS since DaveM is worried about locked TLB's for kernel
+ * BSS. */
+ dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+ if (!dccp_optbuf)
+ return -ENOMEM;
+ ret = ipt_register_match(&dccp_match);
+ if (ret)
+ kfree(dccp_optbuf);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&dccp_match);
+ kfree(dccp_optbuf);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Match for DCCP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 564b49bfebcf..2dd1cccbdab9 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable {
static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
static HLIST_HEAD(hashlimit_htables);
-static kmem_cache_t *hashlimit_cachep;
+static kmem_cache_t *hashlimit_cachep __read_mostly;
static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
{
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728127b9..00bef6cdd3f8 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
+
if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
return 0;
+ if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "mark: only supports 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e06381..0cee2862ed85 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/file.h>
+#include <linux/rcupdate.h>
#include <net/sock.h>
#include <linux/netfilter_ipv4/ipt_owner.h>
@@ -21,106 +22,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
MODULE_DESCRIPTION("iptables owner match");
static int
-match_comm(const struct sk_buff *skb, const char *comm)
-{
- struct task_struct *g, *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- if(strncmp(p->comm, comm, sizeof(p->comm)))
- continue;
-
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) ==
- skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
- struct task_struct *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- if (!p)
- goto out;
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) ==
- skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
-out:
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
- struct task_struct *g, *p;
- struct file *file = skb->sk->sk_socket->file;
- int i, found=0;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- struct files_struct *files;
- if (p->signal->session != sid)
- continue;
-
- task_lock(p);
- files = p->files;
- if (files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == file) {
- found = 1;
- break;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- if (found)
- goto out;
- } while_each_thread(g, p);
-out:
- read_unlock(&tasklist_lock);
-
- return found;
-}
-
-static int
match(const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -145,24 +46,6 @@ match(const struct sk_buff *skb,
return 0;
}
- if(info->match & IPT_OWNER_PID) {
- if (!match_pid(skb, info->pid) ^
- !!(info->invert & IPT_OWNER_PID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_SID) {
- if (!match_sid(skb, info->sid) ^
- !!(info->invert & IPT_OWNER_SID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_COMM) {
- if (!match_comm(skb, info->comm) ^
- !!(info->invert & IPT_OWNER_COMM))
- return 0;
- }
-
return 1;
}
@@ -173,6 +56,8 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ const struct ipt_owner_info *info = matchinfo;
+
if (hook_mask
& ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +69,13 @@ checkentry(const char *tablename,
IPT_ALIGN(sizeof(struct ipt_owner_info)));
return 0;
}
-#ifdef CONFIG_SMP
- /* files->file_lock can not be used in a BH */
- if (((struct ipt_owner_info *)matchinfo)->match
- & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
- printk("ipt_owner: pid, sid and command matching is broken "
- "on SMP.\n");
+
+ if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+ printk("ipt_owner: pid, sid and command matching "
+ "not supported anymore\n");
return 0;
}
-#endif
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 000000000000..b5def204d798
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
+/* String matching match for iptables
+ *
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("IP tables string match module");
+MODULE_LICENSE("GPL");
+
+static int match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ts_state state;
+ struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
+
+ memset(&state, 0, sizeof(struct ts_state));
+
+ return (skb_find_text((struct sk_buff *)skb, conf->from_offset,
+ conf->to_offset, conf->config, &state)
+ != UINT_MAX) && !conf->invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
+
+static int checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_string_info *conf = matchinfo;
+ struct ts_config *ts_conf;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
+ return 0;
+
+ /* Damn, can't handle this case properly with iptables... */
+ if (conf->from_offset > conf->to_offset)
+ return 0;
+
+ ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+ GFP_KERNEL, TS_AUTOLOAD);
+ if (IS_ERR(ts_conf))
+ return 0;
+
+ conf->config = ts_conf;
+
+ return 1;
+}
+
+static void destroy(void *matchinfo, unsigned int matchsize)
+{
+ textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+}
+
+static struct ipt_match string_match = {
+ .name = "string",
+ .match = match,
+ .checkentry = checkentry,
+ .destroy = destroy,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&string_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&string_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f415..a65e508fbd40 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
*/
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
- /* From net/socket.c */
- extern void socket_seq_show(struct seq_file *seq);
-
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
- tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+ tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
@@ -93,9 +90,7 @@ fold_field(void *mib[], int offt)
unsigned long res = 0;
int i;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ for_each_cpu(i) {
res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a869f2..291831e792af 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
#include <linux/timer.h>
#include <net/ip.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d1835b1bc8c4..4b0d7e4d6269 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
#include <linux/netdevice.h>
#include <linux/in_route.h>
#include <linux/route.h>
-#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -71,6 +70,7 @@
#include <net/udp.h>
#include <net/raw.h>
#include <net/snmp.h>
+#include <net/tcp_states.h>
#include <net/inet_common.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
struct sock *sk;
struct hlist_head *head;
+ int delivered = 0;
read_lock(&raw_v4_lock);
head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
skb->dev->ifindex);
while (sk) {
+ delivered = 1;
if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
}
out:
read_unlock(&raw_v4_lock);
+ return delivered;
}
void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
@@ -358,7 +361,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
if (type && code) {
get_user(fl->fl_icmp_type, type);
- __get_user(fl->fl_icmp_code, code);
+ get_user(fl->fl_icmp_code, code);
probed = 1;
}
break;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d675ff80b04d..381dd6a6aebb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,9 @@ static unsigned rt_hash_mask;
static int rt_hash_log;
static unsigned int rt_hash_rnd;
-struct rt_cache_stat *rt_cache_stat;
+static struct rt_cache_stat *rt_cache_stat;
+#define RT_CACHE_STAT_INC(field) \
+ (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
static int rt_intern_hash(unsigned hash, struct rtable *rth,
struct rtable **res);
@@ -1758,6 +1760,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (res->fi->fib_nhs > 1)
@@ -1818,7 +1821,6 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
if (err)
return err;
- atomic_set(&rth->u.dst.__refcnt, 1);
/* put it into the cache */
hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
@@ -1832,8 +1834,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- struct rtable* rth = NULL;
- unsigned char hop, hopcount, lasthop;
+ struct rtable* rth = NULL, *rtres;
+ unsigned char hop, hopcount;
int err = -EINVAL;
unsigned int hash;
@@ -1842,8 +1844,6 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
else
hopcount = 1;
- lasthop = hopcount - 1;
-
/* distinguish between multipath and singlepath */
if (hopcount < 2)
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
@@ -1853,6 +1853,10 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
for (hop = 0; hop < hopcount; hop++) {
res->nh_sel = hop;
+ /* put reference to previous result */
+ if (hop)
+ ip_rt_put(rtres);
+
/* create a routing cache entry */
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
&rth);
@@ -1861,7 +1865,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
/* put it into the cache */
hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
- err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ err = rt_intern_hash(hash, rth, &rtres);
if (err)
return err;
@@ -1871,13 +1875,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
FIB_RES_NETMASK(*res),
res->prefixlen,
&FIB_RES_NH(*res));
-
- /* only for the last hop the reference count is handled
- * outside
- */
- if (hop == lasthop)
- atomic_set(&(skb->dst->__refcnt), 1);
}
+ skb->dst = &rtres->u.dst;
return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
@@ -2129,7 +2128,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
struct in_device *in_dev;
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
int our = ip_check_mc(in_dev, daddr, saddr,
skb->nh.iph->protocol);
if (our
@@ -2206,6 +2205,7 @@ static inline int __mkroute_output(struct rtable **result,
goto cleanup;
}
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (res->fi) {
@@ -2288,8 +2288,6 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
if (err == 0) {
u32 tos = RT_FL_TOS(oldflp);
- atomic_set(&rth->u.dst.__refcnt, 1);
-
hash = rt_hash_code(oldflp->fl4_dst,
oldflp->fl4_src ^ (oldflp->oif << 5), tos);
err = rt_intern_hash(hash, rth, rp);
@@ -2324,6 +2322,10 @@ static inline int ip_mkroute_output(struct rtable** rp,
dev2nexthop = FIB_RES_DEV(*res);
dev_hold(dev2nexthop);
+ /* put reference to previous result */
+ if (hop)
+ ip_rt_put(*rp);
+
err = __mkroute_output(&rth, res, fl, oldflp,
dev2nexthop, flags);
@@ -2348,7 +2350,6 @@ static inline int ip_mkroute_output(struct rtable** rp,
if (err != 0)
return err;
}
- atomic_set(&(*rp)->u.dst.__refcnt, 1);
return err;
} else {
return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
@@ -2442,7 +2443,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
err = -ENODEV;
if (dev_out == NULL)
goto out;
- if (__in_dev_get(dev_out) == NULL) {
+
+ /* RACE: Check return value of inet_select_addr instead. */
+ if (__in_dev_get_rtnl(dev_out) == NULL) {
dev_put(dev_out);
goto out; /* Wrong error code */
}
@@ -2600,6 +2603,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
return ip_route_output_slow(rp, flp);
}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
int err;
@@ -2618,6 +2623,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
return 0;
}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
return ip_route_output_flow(rp, flp, NULL, 0);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 72d014442185..a34e60ea48a1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
}
-extern struct request_sock_ops tcp_request_sock_ops;
-
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
if (child)
- tcp_acceptq_queue(sk, req, child);
+ inet_csk_reqsk_queue_add(sk, req, child);
else
reqsk_free(req);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e32894532416..652685623519 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/config.h>
+#include <linux/igmp.h>
#include <net/snmp.h>
+#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -19,36 +21,6 @@
/* From af_inet.c */
extern int sysctl_ip_nonlocal_bind;
-/* From icmp.c */
-extern int sysctl_icmp_echo_ignore_all;
-extern int sysctl_icmp_echo_ignore_broadcasts;
-extern int sysctl_icmp_ignore_bogus_error_responses;
-extern int sysctl_icmp_errors_use_inbound_ifaddr;
-
-/* From ip_fragment.c */
-extern int sysctl_ipfrag_low_thresh;
-extern int sysctl_ipfrag_high_thresh;
-extern int sysctl_ipfrag_time;
-extern int sysctl_ipfrag_secret_interval;
-
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
-/* From icmp.c */
-extern int sysctl_icmp_ratelimit;
-extern int sysctl_icmp_ratemask;
-
-/* From igmp.c */
-extern int sysctl_igmp_max_memberships;
-extern int sysctl_igmp_max_msf;
-
-/* From inetpeer.c */
-extern int inet_peer_threshold;
-extern int inet_peer_minttl;
-extern int inet_peer_maxttl;
-extern int inet_peer_gc_mintime;
-extern int inet_peer_gc_maxtime;
-
#ifdef CONFIG_SYSCTL
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
struct ipv4_config ipv4_config;
-extern ctl_table ipv4_route_table[];
-
#ifdef CONFIG_SYSCTL
static
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
return ret;
}
-int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
+ int nlen, void __user *oldval,
+ size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
{
char val[TCP_CA_NAME_MAX];
ctl_table tbl = {
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = {
{
.ctl_name = NET_TCP_MAX_TW_BUCKETS,
.procname = "tcp_max_tw_buckets",
- .data = &sysctl_tcp_max_tw_buckets,
+ .data = &tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = {
{
.ctl_name = NET_TCP_TW_RECYCLE,
.procname = "tcp_tw_recycle",
- .data = &sysctl_tcp_tw_recycle,
+ .data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ddb6ce4ecff2..f3f0013a9580 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,13 +269,12 @@
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
-
-kmem_cache_t *tcp_bucket_cachep;
-kmem_cache_t *tcp_timewait_cachep;
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
atomic_t tcp_orphan_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void)
EXPORT_SYMBOL(tcp_enter_memory_pressure);
/*
- * LISTEN is a special case for poll..
- */
-static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
- poll_table *wait)
-{
- return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
-}
-
-/*
* Wait for a TCP event.
*
* Note that we don't need to lock the socket, as the upper poll layers
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
poll_wait(file, sk->sk_sleep, wait);
if (sk->sk_state == TCP_LISTEN)
- return tcp_listen_poll(sk, wait);
+ return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
by poll logic and correct handling of state changes
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
-
-int tcp_listen_start(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
-
- if (rc != 0)
- return rc;
-
- sk->sk_max_ack_backlog = 0;
- sk->sk_ack_backlog = 0;
- tcp_delack_init(tp);
-
- /* There is race window here: we announce ourselves listening,
- * but this transition is still not validated by get_port().
- * It is OK, because this socket enters to hash table only
- * after validation is complete.
- */
- sk->sk_state = TCP_LISTEN;
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- inet->sport = htons(inet->num);
-
- sk_dst_reset(sk);
- sk->sk_prot->hash(sk);
-
- return 0;
- }
-
- sk->sk_state = TCP_CLOSE;
- reqsk_queue_destroy(&tp->accept_queue);
- return -EADDRINUSE;
-}
-
-/*
- * This routine closes sockets which have been at least partially
- * opened, but not yet accepted.
- */
-
-static void tcp_listen_stop (struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt;
- struct request_sock *acc_req;
- struct request_sock *req;
- int i;
-
- tcp_delete_keepalive_timer(sk);
-
- /* make all the listen_opt local to us */
- lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
- acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
-
- if (lopt->qlen) {
- for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
- while ((req = lopt->syn_table[i]) != NULL) {
- lopt->syn_table[i] = req->dl_next;
- lopt->qlen--;
- reqsk_free(req);
-
- /* Following specs, it would be better either to send FIN
- * (and enter FIN-WAIT-1, it is normal close)
- * or to send active reset (abort).
- * Certainly, it is pretty dangerous while synflood, but it is
- * bad justification for our negligence 8)
- * To be honest, we are not able to make either
- * of the variants now. --ANK
- */
- }
- }
- }
- BUG_TRAP(!lopt->qlen);
-
- kfree(lopt);
-
- while ((req = acc_req) != NULL) {
- struct sock *child = req->sk;
-
- acc_req = req->dl_next;
-
- local_bh_disable();
- bh_lock_sock(child);
- BUG_TRAP(!sock_owned_by_user(child));
- sock_hold(child);
-
- tcp_disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- atomic_inc(&tcp_orphan_count);
-
- tcp_destroy_sock(child);
-
- bh_unlock_sock(child);
- local_bh_enable();
- sock_put(child);
-
- sk_acceptq_removed(sk);
- __reqsk_free(req);
- }
- BUG_TRAP(!sk->sk_ack_backlog);
-}
-
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -584,7 +471,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
sk_charge_skb(sk, skb);
if (!sk->sk_send_head)
sk->sk_send_head = skb;
- else if (tp->nonagle&TCP_NAGLE_PUSH)
+ if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
}
@@ -665,8 +552,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (sk->sk_forward_alloc < copy &&
- !sk_stream_mem_schedule(sk, copy, 0))
+ if (!sk_stream_wmem_schedule(sk, copy))
goto wait_for_memory;
if (can_coalesce) {
@@ -883,19 +769,23 @@ new_segment:
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
+ off = 0;
}
- }
+ } else
+ off = 0;
+
+ if (copy > PAGE_SIZE - off)
+ copy = PAGE_SIZE - off;
+
+ if (!sk_stream_wmem_schedule(sk, copy))
+ goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
- off = 0;
}
- if (copy > PAGE_SIZE - off)
- copy = PAGE_SIZE - off;
-
/* Time to copy data. We are close to
* the end! */
err = skb_copy_to_page(sk, from, skb, page,
@@ -975,7 +865,7 @@ do_fault:
if (!skb->len) {
if (sk->sk_send_head == skb)
sk->sk_send_head = NULL;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
}
@@ -1057,20 +947,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
#endif
- if (tcp_ack_scheduled(tp)) {
+ if (inet_csk_ack_scheduled(sk)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
/* Delayed ACKs frequently hit locked sockets during bulk
* receive. */
- if (tp->ack.blocked ||
+ if (icsk->icsk_ack.blocked ||
/* Once-per-two-segments ACK was not sent by tcp_input.c */
- tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
* connection is not bidirectional, user drained
* receive buffer and there was a small segment
* in queue.
*/
- (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
- !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+ (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+ !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = 1;
}
@@ -1572,40 +1463,6 @@ void tcp_shutdown(struct sock *sk, int how)
}
}
-/*
- * At this point, there should be no process reference to this
- * socket, and thus no user references at all. Therefore we
- * can assume the socket waitqueue is inactive and nobody will
- * try to jump onto it.
- */
-void tcp_destroy_sock(struct sock *sk)
-{
- BUG_TRAP(sk->sk_state == TCP_CLOSE);
- BUG_TRAP(sock_flag(sk, SOCK_DEAD));
-
- /* It cannot be in hash table! */
- BUG_TRAP(sk_unhashed(sk));
-
- /* If it has not 0 inet_sk(sk)->num, it must be bound */
- BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
-
- sk->sk_prot->destroy(sk);
-
- sk_stream_kill_queues(sk);
-
- xfrm_sk_free_policy(sk);
-
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&sk->sk_refcnt) != 1) {
- printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
- sk, atomic_read(&sk->sk_refcnt));
- }
-#endif
-
- atomic_dec(&tcp_orphan_count);
- sock_put(sk);
-}
-
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
@@ -1618,7 +1475,7 @@ void tcp_close(struct sock *sk, long timeout)
tcp_set_state(sk, TCP_CLOSE);
/* Special case. */
- tcp_listen_stop(sk);
+ inet_csk_listen_stop(sk);
goto adjudge_to_death;
}
@@ -1721,12 +1578,12 @@ adjudge_to_death:
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
} else {
- int tmo = tcp_fin_time(tp);
+ const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+ inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
} else {
- atomic_inc(&tcp_orphan_count);
+ atomic_inc(sk->sk_prot->orphan_count);
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto out;
}
@@ -1734,7 +1591,7 @@ adjudge_to_death:
}
if (sk->sk_state != TCP_CLOSE) {
sk_stream_mem_reclaim(sk);
- if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+ if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
if (net_ratelimit())
@@ -1745,10 +1602,10 @@ adjudge_to_death:
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
}
}
- atomic_inc(&tcp_orphan_count);
+ atomic_inc(sk->sk_prot->orphan_count);
if (sk->sk_state == TCP_CLOSE)
- tcp_destroy_sock(sk);
+ inet_csk_destroy_sock(sk);
/* Otherwise, socket is reprieved until protocol close. */
out:
@@ -1769,6 +1626,7 @@ static inline int tcp_need_reset(int state)
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err = 0;
int old_state = sk->sk_state;
@@ -1778,7 +1636,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
- tcp_listen_stop(sk);
+ inet_csk_listen_stop(sk);
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1805,125 +1663,34 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->srtt = 0;
if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1;
- tp->backoff = 0;
+ icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
- tcp_delack_init(tp);
+ inet_csk_delack_init(sk);
sk->sk_send_head = NULL;
tp->rx_opt.saw_tstamp = 0;
tcp_sack_reset(&tp->rx_opt);
__sk_dst_reset(sk);
- BUG_TRAP(!inet->num || tp->bind_hash);
+ BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
}
/*
- * Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
- */
-static int wait_for_connect(struct sock *sk, long timeo)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- DEFINE_WAIT(wait);
- int err;
-
- /*
- * True wake-one mechanism for incoming connections: only
- * one process gets woken up, not the 'whole herd'.
- * Since we do not 'race & poll' for established sockets
- * anymore, the common case will execute the loop only once.
- *
- * Subtle issue: "add_wait_queue_exclusive()" will be added
- * after any current non-exclusive waiters, and we know that
- * it will always _stay_ after any new non-exclusive waiters
- * because all non-exclusive waiters are added at the
- * beginning of the wait-queue. As such, it's ok to "drop"
- * our exclusiveness temporarily when we get woken up without
- * having to remove and re-insert us on the wait queue.
- */
- for (;;) {
- prepare_to_wait_exclusive(sk->sk_sleep, &wait,
- TASK_INTERRUPTIBLE);
- release_sock(sk);
- if (reqsk_queue_empty(&tp->accept_queue))
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
- err = 0;
- if (!reqsk_queue_empty(&tp->accept_queue))
- break;
- err = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- break;
- err = sock_intr_errno(timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!timeo)
- break;
- }
- finish_wait(sk->sk_sleep, &wait);
- return err;
-}
-
-/*
- * This will accept the next outstanding connection.
- */
-
-struct sock *tcp_accept(struct sock *sk, int flags, int *err)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sock *newsk;
- int error;
-
- lock_sock(sk);
-
- /* We need to make sure that this socket is listening,
- * and that it has something pending.
- */
- error = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- goto out_err;
-
- /* Find already established connection */
- if (reqsk_queue_empty(&tp->accept_queue)) {
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
- /* If this is a non blocking socket don't sleep */
- error = -EAGAIN;
- if (!timeo)
- goto out_err;
-
- error = wait_for_connect(sk, timeo);
- if (error)
- goto out_err;
- }
-
- newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
- BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-out:
- release_sock(sk);
- return newsk;
-out_err:
- newsk = NULL;
- *err = error;
- goto out;
-}
-
-/*
* Socket option code for TCP.
*/
int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int val;
int err = 0;
@@ -1945,7 +1712,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(tp, name);
+ err = tcp_set_congestion_control(sk, name);
release_sock(sk);
return err;
}
@@ -2022,7 +1789,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
elapsed = tp->keepalive_time - elapsed;
else
elapsed = 0;
- tcp_reset_keepalive_timer(sk, elapsed);
+ inet_csk_reset_keepalive_timer(sk, elapsed);
}
}
break;
@@ -2042,7 +1809,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
if (val < 1 || val > MAX_TCP_SYNCNT)
err = -EINVAL;
else
- tp->syn_retries = val;
+ icsk->icsk_syn_retries = val;
break;
case TCP_LINGER2:
@@ -2055,15 +1822,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
break;
case TCP_DEFER_ACCEPT:
- tp->defer_accept = 0;
+ icsk->icsk_accept_queue.rskq_defer_accept = 0;
if (val > 0) {
/* Translate value in seconds to number of
* retransmits */
- while (tp->defer_accept < 32 &&
+ while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
val > ((TCP_TIMEOUT_INIT / HZ) <<
- tp->defer_accept))
- tp->defer_accept++;
- tp->defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept))
+ icsk->icsk_accept_queue.rskq_defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept++;
}
break;
@@ -2081,16 +1848,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
case TCP_QUICKACK:
if (!val) {
- tp->ack.pingpong = 1;
+ icsk->icsk_ack.pingpong = 1;
} else {
- tp->ack.pingpong = 0;
+ icsk->icsk_ack.pingpong = 0;
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- tcp_ack_scheduled(tp)) {
- tp->ack.pending |= TCP_ACK_PUSHED;
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
cleanup_rbuf(sk, 1);
if (!(val & 1))
- tp->ack.pingpong = 1;
+ icsk->icsk_ack.pingpong = 1;
}
}
break;
@@ -2107,15 +1874,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
memset(info, 0, sizeof(*info));
info->tcpi_state = sk->sk_state;
- info->tcpi_ca_state = tp->ca_state;
- info->tcpi_retransmits = tp->retransmits;
- info->tcpi_probes = tp->probes_out;
- info->tcpi_backoff = tp->backoff;
+ info->tcpi_ca_state = icsk->icsk_ca_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
if (tp->rx_opt.tstamp_ok)
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2130,10 +1898,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->ecn_flags&TCP_ECN_OK)
info->tcpi_options |= TCPI_OPT_ECN;
- info->tcpi_rto = jiffies_to_usecs(tp->rto);
- info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
+ info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+ info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
info->tcpi_snd_mss = tp->mss_cache;
- info->tcpi_rcv_mss = tp->ack.rcv_mss;
+ info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
info->tcpi_sacked = tp->sacked_out;
@@ -2142,7 +1910,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_fackets = tp->fackets_out;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
- info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+ info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
info->tcpi_pmtu = tp->pmtu_cookie;
@@ -2165,6 +1933,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int val, len;
@@ -2202,7 +1971,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
break;
case TCP_SYNCNT:
- val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
val = tp->linger2;
@@ -2210,8 +1979,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
- (tp->defer_accept - 1));
+ val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+ ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -2232,7 +2001,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
return 0;
}
case TCP_QUICKACK:
- val = !tp->ack.pingpong;
+ val = !icsk->icsk_ack.pingpong;
break;
case TCP_CONGESTION:
@@ -2241,7 +2010,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
if (put_user(len, optlen))
return -EFAULT;
- if (copy_to_user(optval, tp->ca_ops->name, len))
+ if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
default:
@@ -2278,79 +2047,72 @@ void __init tcp_init(void)
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
sizeof(skb->cb));
- tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
- sizeof(struct tcp_bind_bucket),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_bucket_cachep)
+ tcp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!tcp_hashinfo.bind_bucket_cachep)
panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
- tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
- sizeof(struct tcp_tw_bucket),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_timewait_cachep)
- panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
-
/* Size and allocate the main established and bind bucket
* hash tables.
*
* The methodology is similar to that of the buffer cache.
*/
- tcp_ehash = (struct tcp_ehash_bucket *)
+ tcp_hashinfo.ehash =
alloc_large_system_hash("TCP established",
- sizeof(struct tcp_ehash_bucket),
+ sizeof(struct inet_ehash_bucket),
thash_entries,
(num_physpages >= 128 * 1024) ?
(25 - PAGE_SHIFT) :
(27 - PAGE_SHIFT),
HASH_HIGHMEM,
- &tcp_ehash_size,
+ &tcp_hashinfo.ehash_size,
NULL,
0);
- tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
- for (i = 0; i < (tcp_ehash_size << 1); i++) {
- rwlock_init(&tcp_ehash[i].lock);
- INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+ tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
+ for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
+ rwlock_init(&tcp_hashinfo.ehash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
}
- tcp_bhash = (struct tcp_bind_hashbucket *)
+ tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
- sizeof(struct tcp_bind_hashbucket),
- tcp_ehash_size,
+ sizeof(struct inet_bind_hashbucket),
+ tcp_hashinfo.ehash_size,
(num_physpages >= 128 * 1024) ?
(25 - PAGE_SHIFT) :
(27 - PAGE_SHIFT),
HASH_HIGHMEM,
- &tcp_bhash_size,
+ &tcp_hashinfo.bhash_size,
NULL,
64 * 1024);
- tcp_bhash_size = 1 << tcp_bhash_size;
- for (i = 0; i < tcp_bhash_size; i++) {
- spin_lock_init(&tcp_bhash[i].lock);
- INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+ tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+ for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
/* Try to be a bit smarter and adjust defaults depending
* on available memory.
*/
for (order = 0; ((1 << order) << PAGE_SHIFT) <
- (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+ (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
order++)
;
if (order >= 4) {
sysctl_local_port_range[0] = 32768;
sysctl_local_port_range[1] = 61000;
- sysctl_tcp_max_tw_buckets = 180000;
+ tcp_death_row.sysctl_max_tw_buckets = 180000;
sysctl_tcp_max_orphans = 4096 << (order - 4);
sysctl_max_syn_backlog = 1024;
} else if (order < 3) {
sysctl_local_port_range[0] = 1024 * (3 - order);
- sysctl_tcp_max_tw_buckets >>= (3 - order);
+ tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
sysctl_tcp_max_orphans >>= (3 - order);
sysctl_max_syn_backlog = 128;
}
- tcp_port_rover = sysctl_local_port_range[0] - 1;
+ tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
sysctl_tcp_mem[0] = 768 << order;
sysctl_tcp_mem[1] = 1024 << order;
@@ -2365,14 +2127,12 @@ void __init tcp_init(void)
printk(KERN_INFO "TCP: Hash tables configured "
"(established %d bind %d)\n",
- tcp_ehash_size << 1, tcp_bhash_size);
+ tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
tcp_register_congestion_control(&tcp_reno);
}
-EXPORT_SYMBOL(tcp_accept);
EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_destroy_sock);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
@@ -2384,4 +2144,3 @@ EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ec38d45d6649..ae35e0609047 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -27,7 +27,7 @@
*/
static int fast_convergence = 1;
-static int max_increment = 32;
+static int max_increment = 16;
static int low_window = 14;
static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
static int low_utilization_threshold = 153;
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
}
-static void bictcp_init(struct tcp_sock *tp)
+static void bictcp_init(struct sock *sk)
{
- bictcp_reset(tcp_ca(tp));
+ bictcp_reset(inet_csk_ca(sk));
if (initial_ssthresh)
- tp->snd_ssthresh = initial_ssthresh;
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
/*
@@ -136,7 +136,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
/* slow start */
ca->cnt = (cwnd * (BICTCP_B-1))
- / cwnd-ca->last_max_cwnd;
+ / (cwnd - ca->last_max_cwnd);
else
/* linear increase */
ca->cnt = cwnd / max_increment;
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
/* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
+static inline void bictcp_low_utilization(struct sock *sk, int flag)
{
- struct bictcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
u32 dist, delay;
/* No time stamp */
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
}
-static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
u32 seq_rtt, u32 in_flight, int data_acked)
{
- struct bictcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
- bictcp_low_utilization(tp, data_acked);
+ bictcp_low_utilization(sk, data_acked);
if (in_flight < tp->snd_cwnd)
return;
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
* behave like Reno until low_window is reached,
* then increase congestion window slowly
*/
-static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
{
- struct bictcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
ca->epoch_start = 0; /* end of epoch */
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
+static u32 bictcp_undo_cwnd(struct sock *sk)
{
- struct bictcp *ca = tcp_ca(tp);
-
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct bictcp *ca = inet_csk_ca(sk);
return max(tp->snd_cwnd, ca->last_max_cwnd);
}
-static u32 bictcp_min_cwnd(struct tcp_sock *tp)
+static u32 bictcp_min_cwnd(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->snd_ssthresh;
}
-static void bictcp_state(struct tcp_sock *tp, u8 new_state)
+static void bictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss)
- bictcp_reset(tcp_ca(tp));
+ bictcp_reset(inet_csk_ca(sk));
}
/* Track delayed acknowledgement ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
+static void bictcp_acked(struct sock *sk, u32 cnt)
{
- if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
- struct bictcp *ca = tcp_ca(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
}
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = {
static int __init bictcp_register(void)
{
- BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&bictcp);
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4970d10a7785..bbf2d6624e89 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
/* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct tcp_sock *tp)
+void tcp_init_congestion_control(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
- if (tp->ca_ops != &tcp_init_congestion_ops)
+ if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
return;
rcu_read_lock();
list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
if (try_module_get(ca->owner)) {
- tp->ca_ops = ca;
+ icsk->icsk_ca_ops = ca;
break;
}
}
rcu_read_unlock();
- if (tp->ca_ops->init)
- tp->ca_ops->init(tp);
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
}
/* Manage refcounts on socket close. */
-void tcp_cleanup_congestion_control(struct tcp_sock *tp)
+void tcp_cleanup_congestion_control(struct sock *sk)
{
- if (tp->ca_ops->release)
- tp->ca_ops->release(tp);
- module_put(tp->ca_ops->owner);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ module_put(icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name)
}
/* Change congestion control for socket */
-int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+int tcp_set_congestion_control(struct sock *sk, const char *name)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
int err = 0;
rcu_read_lock();
ca = tcp_ca_find(name);
- if (ca == tp->ca_ops)
+ if (ca == icsk->icsk_ca_ops)
goto out;
if (!ca)
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
err = -EBUSY;
else {
- tcp_cleanup_congestion_control(tp);
- tp->ca_ops = ca;
- if (tp->ca_ops->init)
- tp->ca_ops->init(tp);
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
}
out:
rcu_read_unlock();
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
int flag)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (in_flight < tp->snd_cwnd)
return;
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
/* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct tcp_sock *tp)
+u32 tcp_reno_ssthresh(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
/* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
+u32 tcp_reno_min_cwnd(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->snd_ssthresh/2;
}
EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f66945cb158f..c148c1081880 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
/*
- * tcp_diag.c Module for monitoring TCP sockets.
+ * tcp_diag.c Module for monitoring TCP transport protocols sockets.
*
* Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
*
@@ -12,779 +12,43 @@
*/
#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/random.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/time.h>
-
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/ipv6.h>
-#include <net/inet_common.h>
-
-#include <linux/inet.h>
-#include <linux/stddef.h>
-
-#include <linux/tcp_diag.h>
-struct tcpdiag_entry
-{
- u32 *saddr;
- u32 *daddr;
- u16 sport;
- u16 dport;
- u16 family;
- u16 userlocks;
-};
+#include <linux/module.h>
+#include <linux/inet_diag.h>
-static struct sock *tcpnl;
+#include <linux/tcp.h>
-#define TCPDIAG_PUT(skb, attrtype, attrlen) \
- RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+#include <net/tcp.h>
-static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcpdiagmsg *r;
- struct nlmsghdr *nlh;
- struct tcp_info *info = NULL;
- struct tcpdiag_meminfo *minfo = NULL;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
- r = NLMSG_DATA(nlh);
- if (sk->sk_state != TCP_TIME_WAIT) {
- if (ext & (1<<(TCPDIAG_MEMINFO-1)))
- minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
- if (ext & (1<<(TCPDIAG_INFO-1)))
- info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
-
- if (ext & (1<<(TCPDIAG_CONG-1))) {
- size_t len = strlen(tp->ca_ops->name);
- strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
- tp->ca_ops->name);
- }
- }
- r->tcpdiag_family = sk->sk_family;
- r->tcpdiag_state = sk->sk_state;
- r->tcpdiag_timer = 0;
- r->tcpdiag_retrans = 0;
-
- r->id.tcpdiag_if = sk->sk_bound_dev_if;
- r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
- r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
-
- if (r->tcpdiag_state == TCP_TIME_WAIT) {
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
- long tmo = tw->tw_ttd - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.tcpdiag_sport = tw->tw_sport;
- r->id.tcpdiag_dport = tw->tw_dport;
- r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
- r->id.tcpdiag_dst[0] = tw->tw_daddr;
- r->tcpdiag_state = tw->tw_substate;
- r->tcpdiag_timer = 3;
- r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
- r->tcpdiag_rqueue = 0;
- r->tcpdiag_wqueue = 0;
- r->tcpdiag_uid = 0;
- r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &tw->tw_v6_rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &tw->tw_v6_daddr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
- }
-
- r->id.tcpdiag_sport = inet->sport;
- r->id.tcpdiag_dport = inet->dport;
- r->id.tcpdiag_src[0] = inet->rcv_saddr;
- r->id.tcpdiag_dst[0] = inet->daddr;
-
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &np->rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &np->daddr);
- }
-#endif
-
-#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
-
- if (tp->pending == TCP_TIME_RETRANS) {
- r->tcpdiag_timer = 1;
- r->tcpdiag_retrans = tp->retransmits;
- r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
- } else if (tp->pending == TCP_TIME_PROBE0) {
- r->tcpdiag_timer = 4;
- r->tcpdiag_retrans = tp->probes_out;
- r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
- } else if (timer_pending(&sk->sk_timer)) {
- r->tcpdiag_timer = 2;
- r->tcpdiag_retrans = tp->probes_out;
- r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
- } else {
- r->tcpdiag_timer = 0;
- r->tcpdiag_expires = 0;
- }
-#undef EXPIRES_IN_MS
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_info *info = _info;
- r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
- r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
- r->tcpdiag_uid = sock_i_uid(sk);
- r->tcpdiag_inode = sock_i_ino(sk);
-
- if (minfo) {
- minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
- minfo->tcpdiag_wmem = sk->sk_wmem_queued;
- minfo->tcpdiag_fmem = sk->sk_forward_alloc;
- minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
- }
-
- if (info)
+ r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+ r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ if (info != NULL)
tcp_get_info(sk, info);
-
- if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
- tp->ca_ops->get_info(tp, ext, skb);
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-rtattr_failure:
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
- int dif);
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif);
-#else
-static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif)
-{
- return NULL;
-}
-#endif
-
-static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
-{
- int err;
- struct sock *sk;
- struct tcpdiagreq *req = NLMSG_DATA(nlh);
- struct sk_buff *rep;
-
- if (req->tcpdiag_family == AF_INET) {
- sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
- req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
- req->id.tcpdiag_if);
- }
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- else if (req->tcpdiag_family == AF_INET6) {
- sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
- (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
- req->id.tcpdiag_if);
- }
-#endif
- else {
- return -EINVAL;
- }
-
- if (sk == NULL)
- return -ENOENT;
-
- err = -ESTALE;
- if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
- req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
- goto out;
-
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
- sizeof(struct tcpdiag_meminfo)+
- sizeof(struct tcp_info)+64), GFP_KERNEL);
- if (!rep)
- goto out;
-
- if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
- NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, 0) <= 0)
- BUG();
-
- err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
- if (err > 0)
- err = 0;
-
-out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- tcp_tw_put((struct tcp_tw_bucket*)sk);
- else
- sock_put(sk);
- }
- return err;
-}
-
-static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
-{
- int words = bits >> 5;
-
- bits &= 0x1f;
-
- if (words) {
- if (memcmp(a1, a2, words << 2))
- return 0;
- }
- if (bits) {
- __u32 w1, w2;
- __u32 mask;
-
- w1 = a1[words];
- w2 = a2[words];
-
- mask = htonl((0xffffffff) << (32 - bits));
-
- if ((w1 ^ w2) & mask)
- return 0;
- }
-
- return 1;
-}
-
-
-static int tcpdiag_bc_run(const void *bc, int len,
- const struct tcpdiag_entry *entry)
-{
- while (len > 0) {
- int yes = 1;
- const struct tcpdiag_bc_op *op = bc;
-
- switch (op->code) {
- case TCPDIAG_BC_NOP:
- break;
- case TCPDIAG_BC_JMP:
- yes = 0;
- break;
- case TCPDIAG_BC_S_GE:
- yes = entry->sport >= op[1].no;
- break;
- case TCPDIAG_BC_S_LE:
- yes = entry->dport <= op[1].no;
- break;
- case TCPDIAG_BC_D_GE:
- yes = entry->dport >= op[1].no;
- break;
- case TCPDIAG_BC_D_LE:
- yes = entry->dport <= op[1].no;
- break;
- case TCPDIAG_BC_AUTO:
- yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
- break;
- case TCPDIAG_BC_S_COND:
- case TCPDIAG_BC_D_COND:
- {
- struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
- u32 *addr;
-
- if (cond->port != -1 &&
- cond->port != (op->code == TCPDIAG_BC_S_COND ?
- entry->sport : entry->dport)) {
- yes = 0;
- break;
- }
-
- if (cond->prefix_len == 0)
- break;
-
- if (op->code == TCPDIAG_BC_S_COND)
- addr = entry->saddr;
- else
- addr = entry->daddr;
-
- if (bitstring_match(addr, cond->addr, cond->prefix_len))
- break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr+3, cond->addr, cond->prefix_len))
- break;
- }
- yes = 0;
- break;
- }
- }
-
- if (yes) {
- len -= op->yes;
- bc += op->yes;
- } else {
- len -= op->no;
- bc += op->no;
- }
- }
- return (len == 0);
-}
-
-static int valid_cc(const void *bc, int len, int cc)
-{
- while (len >= 0) {
- const struct tcpdiag_bc_op *op = bc;
-
- if (cc > len)
- return 0;
- if (cc == len)
- return 1;
- if (op->yes < 4)
- return 0;
- len -= op->yes;
- bc += op->yes;
- }
- return 0;
-}
-
-static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
-{
- const unsigned char *bc = bytecode;
- int len = bytecode_len;
-
- while (len > 0) {
- struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
-
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
- switch (op->code) {
- case TCPDIAG_BC_AUTO:
- case TCPDIAG_BC_S_COND:
- case TCPDIAG_BC_D_COND:
- case TCPDIAG_BC_S_GE:
- case TCPDIAG_BC_S_LE:
- case TCPDIAG_BC_D_GE:
- case TCPDIAG_BC_D_LE:
- if (op->yes < 4 || op->yes > len+4)
- return -EINVAL;
- case TCPDIAG_BC_JMP:
- if (op->no < 4 || op->no > len+4)
- return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len-op->no))
- return -EINVAL;
- break;
- case TCPDIAG_BC_NOP:
- if (op->yes < 4 || op->yes > len+4)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- bc += op->yes;
- len -= op->yes;
- }
- return len == 0 ? 0 : -EINVAL;
-}
-
-static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
-{
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- struct tcpdiag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
- struct inet_sock *inet = inet_sk(sk);
-
- entry.family = sk->sk_family;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->rcv_saddr;
- entry.daddr = &inet->daddr;
- }
- entry.sport = inet->num;
- entry.dport = ntohs(inet->dport);
- entry.userlocks = sk->sk_userlocks;
-
- if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
- return 0;
- }
-
- return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
-static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req,
- u32 pid, u32 seq)
-{
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb->tail;
- struct tcpdiagmsg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
-
- r->tcpdiag_family = sk->sk_family;
- r->tcpdiag_state = TCP_SYN_RECV;
- r->tcpdiag_timer = 1;
- r->tcpdiag_retrans = req->retrans;
-
- r->id.tcpdiag_if = sk->sk_bound_dev_if;
- r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
- r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
-
- tmo = req->expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.tcpdiag_sport = inet->sport;
- r->id.tcpdiag_dport = ireq->rmt_port;
- r->id.tcpdiag_src[0] = ireq->loc_addr;
- r->id.tcpdiag_dst[0] = ireq->rmt_addr;
- r->tcpdiag_expires = jiffies_to_msecs(tmo),
- r->tcpdiag_rqueue = 0;
- r->tcpdiag_wqueue = 0;
- r->tcpdiag_uid = sock_i_uid(sk);
- r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &tcp6_rsk(req)->loc_addr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &tcp6_rsk(req)->rmt_addr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
-
- return skb->len;
-
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
-{
- struct tcpdiag_entry entry;
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt;
- struct rtattr *bc = NULL;
- struct inet_sock *inet = inet_sk(sk);
- int j, s_j;
- int reqnum, s_reqnum;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- read_lock_bh(&tp->accept_queue.syn_wait_lock);
-
- lopt = tp->accept_queue.listen_opt;
- if (!lopt || !lopt->qlen)
- goto out;
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- bc = (struct rtattr *)(r + 1);
- entry.sport = inet->num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.tcpdiag_dport != ireq->rmt_port &&
- r->id.tcpdiag_dport)
- continue;
-
- if (bc) {
- entry.saddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- (entry.family == AF_INET6) ?
- tcp6_rsk(req)->loc_addr.s6_addr32 :
-#endif
- &ireq->loc_addr;
- entry.daddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- (entry.family == AF_INET6) ?
- tcp6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
- &ireq->rmt_addr;
- entry.dport = ntohs(ireq->rmt_port);
-
- if (!tcpdiag_bc_run(RTA_DATA(bc),
- RTA_PAYLOAD(bc), &entry))
- continue;
- }
-
- err = tcpdiag_fill_req(skb, sk, req,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
-
- return err;
-}
-
-static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- int i, num;
- int s_i, s_num;
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
- goto skip_listen_ht;
- tcp_listen_lock();
- for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
- struct sock *sk;
- struct hlist_node *node;
-
- num = 0;
- sk_for_each(sk, node, &tcp_listening_hash[i]) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_listen;
-
- if (!(r->tcpdiag_states&TCPF_LISTEN) ||
- r->id.tcpdiag_dport ||
- cb->args[3] > 0)
- goto syn_recv;
-
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- tcp_listen_unlock();
- goto done;
- }
-
-syn_recv:
- if (!(r->tcpdiag_states&TCPF_SYN_RECV))
- goto next_listen;
-
- if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
- tcp_listen_unlock();
- goto done;
- }
-
-next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
- ++num;
- }
-
- s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
- }
- tcp_listen_unlock();
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
-
- if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
- return skb->len;
-
- for (i = s_i; i < tcp_ehash_size; i++) {
- struct tcp_ehash_bucket *head = &tcp_ehash[i];
- struct sock *sk;
- struct hlist_node *node;
-
- if (i > s_i)
- s_num = 0;
-
- read_lock_bh(&head->lock);
-
- num = 0;
- sk_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_normal;
- if (!(r->tcpdiag_states & (1 << sk->sk_state)))
- goto next_normal;
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_normal;
- if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
- goto next_normal;
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_normal:
- ++num;
- }
-
- if (r->tcpdiag_states&TCPF_TIME_WAIT) {
- sk_for_each(sk, node,
- &tcp_ehash[i + tcp_ehash_size].chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_dying;
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_dying;
- if (r->id.tcpdiag_dport != inet->dport &&
- r->id.tcpdiag_dport)
- goto next_dying;
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
- read_unlock_bh(&head->lock);
- }
-
-done:
- cb->args[1] = i;
- cb->args[2] = num;
- return skb->len;
-}
-
-static int tcpdiag_dump_done(struct netlink_callback *cb)
-{
- return 0;
-}
-
-
-static __inline__ int
-tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
-
- if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
- goto err_inval;
-
- if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
- goto err_inval;
-
- if (nlh->nlmsg_flags&NLM_F_DUMP) {
- if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
- struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
- if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
- rta->rta_len < 8 ||
- rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
- goto err_inval;
- if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
- goto err_inval;
- }
- return netlink_dump_start(tcpnl, skb, nlh,
- tcpdiag_dump,
- tcpdiag_dump_done);
- } else {
- return tcpdiag_get_exact(skb, nlh);
- }
-
-err_inval:
- return -EINVAL;
-}
-
-
-static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr * nlh;
-
- if (skb->len >= NLMSG_SPACE(0)) {
- nlh = (struct nlmsghdr *)skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
- err = tcpdiag_rcv_msg(skb, nlh);
- if (err || nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, err);
- }
-}
-
-static void tcpdiag_rcv(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
-
- while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
- tcpdiag_rcv_skb(skb);
- kfree_skb(skb);
- }
-}
+static struct inet_diag_handler tcp_diag_handler = {
+ .idiag_hashinfo = &tcp_hashinfo,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_type = TCPDIAG_GETSOCK,
+ .idiag_info_size = sizeof(struct tcp_info),
+};
-static int __init tcpdiag_init(void)
+static int __init tcp_diag_init(void)
{
- tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
- if (tcpnl == NULL)
- return -ENOMEM;
- return 0;
+ return inet_diag_register(&tcp_diag_handler);
}
-static void __exit tcpdiag_exit(void)
+static void __exit tcp_diag_exit(void)
{
- sock_release(tcpnl->sk_socket);
+ inet_diag_unregister(&tcp_diag_handler);
}
-module_init(tcpdiag_init);
-module_exit(tcpdiag_exit);
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 36c51f8136bf..6acc04bde080 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,9 +98,10 @@ struct hstcp {
u32 ai;
};
-static void hstcp_init(struct tcp_sock *tp)
+static void hstcp_init(struct sock *sk)
{
- struct hstcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
ca->ai = 0;
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
u32 in_flight, int good)
{
- struct hstcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd)
return;
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
}
}
-static u32 hstcp_ssthresh(struct tcp_sock *tp)
+static u32 hstcp_ssthresh(struct sock *sk)
{
- struct hstcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct hstcp *ca = inet_csk_ca(sk);
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = {
static int __init hstcp_register(void)
{
- BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_highspeed);
}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 40168275acf9..e47b37984e95 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca)
ca->snd_cwnd_cnt2 = 0;
}
-static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+static u32 htcp_cwnd_undo(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
ca->ccount = ca->undo_ccount;
ca->maxRTT = ca->undo_maxRTT;
ca->old_maxB = ca->undo_old_maxB;
return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
}
-static inline void measure_rtt(struct tcp_sock *tp)
+static inline void measure_rtt(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
u32 srtt = tp->srtt>>3;
/* keep track of minimum RTT seen so far, minRTT is zero at first */
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp)
ca->minRTT = srtt;
/* max RTT */
- if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+ if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
if (ca->maxRTT < ca->minRTT)
ca->maxRTT = ca->minRTT;
if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp)
}
}
-static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
{
- struct htcp *ca = tcp_ca(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
u32 now = tcp_time_stamp;
/* achieved throughput calculations */
- if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Disorder) {
ca->packetcount = 0;
ca->lasttime = now;
return;
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca)
* that point do we really have a real sense of maxRTT (the queues en route
* were getting just too full now).
*/
-static void htcp_param_update(struct tcp_sock *tp)
+static void htcp_param_update(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ struct htcp *ca = inet_csk_ca(sk);
u32 minRTT = ca->minRTT;
u32 maxRTT = ca->maxRTT;
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp)
ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
}
-static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 htcp_recalc_ssthresh(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
- htcp_param_update(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct htcp *ca = inet_csk_ca(sk);
+ htcp_param_update(sk);
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int data_acked)
{
- struct htcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd)
return;
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
} else {
- measure_rtt(tp);
+ measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */
if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
}
/* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct tcp_sock *tp)
+static u32 htcp_min_cwnd(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->snd_ssthresh;
}
-static void htcp_init(struct tcp_sock *tp)
+static void htcp_init(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ struct htcp *ca = inet_csk_ca(sk);
memset(ca, 0, sizeof(struct htcp));
ca->alpha = ALPHA_BASE;
ca->beta = BETA_MIN;
}
-static void htcp_state(struct tcp_sock *tp, u8 new_state)
+static void htcp_state(struct sock *sk, u8 new_state)
{
switch (new_state) {
case TCP_CA_CWR:
case TCP_CA_Recovery:
case TCP_CA_Loss:
- htcp_reset(tcp_ca(tp));
+ htcp_reset(inet_csk_ca(sk));
break;
}
}
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = {
static int __init htcp_register(void)
{
- BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
if (!use_bandwidth_switch)
htcp.pkts_acked = NULL;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 13a66342c304..77add63623df 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
/* This is called to refresh values for hybla parameters */
-static inline void hybla_recalc_param (struct tcp_sock *tp)
+static inline void hybla_recalc_param (struct sock *sk)
{
- struct hybla *ca = tcp_ca(tp);
+ struct hybla *ca = inet_csk_ca(sk);
- ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
ca->rho2 = ca->rho2_7ls >>7;
}
-static void hybla_init(struct tcp_sock *tp)
+static void hybla_init(struct sock *sk)
{
- struct hybla *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
ca->rho = 0;
ca->rho2 = 0;
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp)
tp->snd_cwnd_clamp = 65535;
/* 1st Rho measurement based on initial srtt */
- hybla_recalc_param(tp);
+ hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */
ca->minrtt = tp->srtt;
tp->snd_cwnd = ca->rho;
}
-static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+static void hybla_state(struct sock *sk, u8 ca_state)
{
- struct hybla *ca = tcp_ca(tp);
-
+ struct hybla *ca = inet_csk_ca(sk);
ca->hybla_en = (ca_state == TCP_CA_Open);
}
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
- struct hybla *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
u32 increment, odd, rho_fractions;
int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */
if (tp->srtt < ca->minrtt){
- hybla_recalc_param(tp);
+ hybla_recalc_param(sk);
ca->minrtt = tp->srtt;
}
if (!ca->hybla_en)
- return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
if (in_flight < tp->snd_cwnd)
return;
if (ca->rho == 0)
- hybla_recalc_param(tp);
+ hybla_recalc_param(sk);
rho_fractions = ca->rho_3ls - (ca->rho << 3);
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = {
static int __init hybla_register(void)
{
- BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_hybla);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53a8a5399f1e..3e98b57578dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1;
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
-static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
- struct sk_buff *skb)
+static inline void tcp_measure_rcv_mss(struct sock *sk,
+ const struct sk_buff *skb)
{
- unsigned int len, lss;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const unsigned int lss = icsk->icsk_ack.last_seg_size;
+ unsigned int len;
- lss = tp->ack.last_seg_size;
- tp->ack.last_seg_size = 0;
+ icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
len = skb->len;
- if (len >= tp->ack.rcv_mss) {
- tp->ack.rcv_mss = len;
+ if (len >= icsk->icsk_ack.rcv_mss) {
+ icsk->icsk_ack.rcv_mss = len;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
* tcp header plus fixed timestamp option length.
* Resulting "len" is MSS free of SACK jitter.
*/
- len -= tp->tcp_header_len;
- tp->ack.last_seg_size = len;
+ len -= tcp_sk(sk)->tcp_header_len;
+ icsk->icsk_ack.last_seg_size = len;
if (len == lss) {
- tp->ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = len;
return;
}
}
- tp->ack.pending |= TCP_ACK_PUSHED;
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
}
}
-static void tcp_incr_quickack(struct tcp_sock *tp)
+static void tcp_incr_quickack(struct sock *sk)
{
- unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks==0)
quickacks=2;
- if (quickacks > tp->ack.quick)
- tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ if (quickacks > icsk->icsk_ack.quick)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct tcp_sock *tp)
+void tcp_enter_quickack_mode(struct sock *sk)
{
- tcp_incr_quickack(tp);
- tp->ack.pingpong = 0;
- tp->ack.ato = TCP_ATO_MIN;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
}
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
*/
-static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+static inline int tcp_in_quickack_mode(const struct sock *sk)
{
- return (tp->ack.quick && !tp->ack.pingpong);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
/* Buffer size and advertised window tuning.
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
*/
/* Slow part of check#2. */
-static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
+ const struct sk_buff *skb)
{
/* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
- return 2*tp->ack.rcv_mss;
+ return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
truesize >>= 1;
window >>= 1;
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
if (incr) {
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
- tp->ack.quick |= 1;
+ inet_csk(sk)->icsk_ack.quick |= 1;
}
}
}
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk)
/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
int ofo_win = 0;
- tp->ack.quick = 0;
+ icsk->icsk_ack.quick = 0;
skb_queue_walk(&tp->out_of_order_queue, skb) {
ofo_win += skb->len;
@@ -346,12 +351,10 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
app_win += ofo_win;
if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
app_win >>= 1;
- if (app_win > tp->ack.rcv_mss)
- app_win -= tp->ack.rcv_mss;
+ if (app_win > icsk->icsk_ack.rcv_mss)
+ app_win -= icsk->icsk_ack.rcv_mss;
app_win = max(app_win, 2U*tp->advmss);
- if (!ofo_win)
- tp->window_clamp = min(tp->window_clamp, app_win);
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}
}
@@ -415,11 +418,12 @@ new_measure:
tp->rcv_rtt_est.time = tcp_time_stamp;
}
-static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.rcv_tsecr &&
(TCP_SKB_CB(skb)->end_seq -
- TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}
@@ -492,41 +496,42 @@ new_measure:
*/
static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
u32 now;
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
- tcp_measure_rcv_mss(tp, skb);
+ tcp_measure_rcv_mss(sk, skb);
tcp_rcv_rtt_measure(tp);
now = tcp_time_stamp;
- if (!tp->ack.ato) {
+ if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(tp);
- tp->ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
- int m = now - tp->ack.lrcvtime;
+ int m = now - icsk->icsk_ack.lrcvtime;
if (m <= TCP_ATO_MIN/2) {
/* The fastest case is the first. */
- tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
- } else if (m < tp->ack.ato) {
- tp->ack.ato = (tp->ack.ato>>1) + m;
- if (tp->ack.ato > tp->rto)
- tp->ack.ato = tp->rto;
- } else if (m > tp->rto) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+ } else if (m < icsk->icsk_ack.ato) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+ if (icsk->icsk_ack.ato > icsk->icsk_rto)
+ icsk->icsk_ack.ato = icsk->icsk_rto;
+ } else if (m > icsk->icsk_rto) {
/* Too long gap. Apparently sender falled to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(tp);
+ tcp_incr_quickack(sk);
sk_stream_mem_reclaim(sk);
}
}
- tp->ack.lrcvtime = now;
+ icsk->icsk_ack.lrcvtime = now;
TCP_ECN_check_ce(tp, skb);
@@ -543,8 +548,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's
@@ -604,15 +611,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
tp->rtt_seq = tp->snd_nxt;
}
- if (tp->ca_ops->rtt_sample)
- tp->ca_ops->rtt_sample(tp, *usrtt);
+ if (icsk->icsk_ca_ops->rtt_sample)
+ icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct tcp_sock *tp)
+static inline void tcp_set_rto(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
*
* More seriously:
@@ -623,7 +631,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some curcumstances.
*/
- tp->rto = (tp->srtt >> 3) + tp->rttvar;
+ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
@@ -635,10 +643,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
-static inline void tcp_bound_rto(struct tcp_sock *tp)
+static inline void tcp_bound_rto(struct sock *sk)
{
- if (tp->rto > TCP_RTO_MAX)
- tp->rto = TCP_RTO_MAX;
+ if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
+ inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}
/* Save metrics learned by this TCP session.
@@ -656,9 +664,10 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst);
if (dst && (dst->flags&DST_HOST)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int m;
- if (tp->backoff || !tp->srtt) {
+ if (icsk->icsk_backoff || !tp->srtt) {
/* This session failed to estimate rtt. Why?
* Probably, no packets returned in time.
* Reset our results.
@@ -707,7 +716,7 @@ void tcp_update_metrics(struct sock *sk)
tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
} else if (tp->snd_cwnd > tp->snd_ssthresh &&
- tp->ca_state == TCP_CA_Open) {
+ icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!dst_metric_locked(dst, RTAX_SSTHRESH))
dst->metrics[RTAX_SSTHRESH-1] =
@@ -801,9 +810,9 @@ static void tcp_init_metrics(struct sock *sk)
tp->mdev = dst_metric(dst, RTAX_RTTVAR);
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
}
- tcp_set_rto(tp);
- tcp_bound_rto(tp);
- if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+ tcp_set_rto(sk);
+ tcp_bound_rto(sk);
+ if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
goto reset;
tp->snd_cwnd = tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -817,12 +826,14 @@ reset:
if (!tp->rx_opt.saw_tstamp && tp->srtt) {
tp->srtt = 0;
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- tp->rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
}
}
-static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+static void tcp_update_reordering(struct sock *sk, const int metric,
+ const int ts)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (metric > tp->reordering) {
tp->reordering = min(TCP_MAX_REORDERING, metric);
@@ -837,7 +848,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
#if FASTRETRANS_DEBUG > 1
printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, tp->ca_state,
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
tp->fackets_out,
tp->sacked_out,
@@ -899,6 +910,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -909,14 +921,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int flag = 0;
int i;
- /* So, SACKs for already sent large segments will be lost.
- * Not good, but alternative is to resegment the queue. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache;
- }
-
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
@@ -964,20 +968,42 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
flag |= FLAG_DATA_LOST;
sk_stream_for_retrans_queue(skb, sk) {
- u8 sacked = TCP_SKB_CB(skb)->sacked;
- int in_sack;
+ int in_sack, pcount;
+ u8 sacked;
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
- if(!before(TCP_SKB_CB(skb)->seq, end_seq))
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- fack_count += tcp_skb_pcount(skb);
-
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
+ pcount = tcp_skb_pcount(skb);
+
+ if (pcount > 1 && !in_sack &&
+ after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+ unsigned int pkt_len;
+
+ in_sack = !after(start_seq,
+ TCP_SKB_CB(skb)->seq);
+
+ if (!in_sack)
+ pkt_len = (start_seq -
+ TCP_SKB_CB(skb)->seq);
+ else
+ pkt_len = (end_seq -
+ TCP_SKB_CB(skb)->seq);
+ if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+ break;
+ pcount = tcp_skb_pcount(skb);
+ }
+
+ fack_count += pcount;
+
+ sacked = TCP_SKB_CB(skb)->sacked;
+
/* Account D-SACK for retransmitted packet. */
if ((dup_sack && in_sack) &&
(sacked & TCPCB_RETRANS) &&
@@ -1064,7 +1090,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
* we have to account for reordering! Ugly,
* but should help.
*/
- if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
@@ -1093,8 +1119,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
tp->left_out = tp->sacked_out + tp->lost_out;
- if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
- tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+ if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
@@ -1111,17 +1137,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
*/
void tcp_enter_frto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->frto_counter = 1;
- if (tp->ca_state <= TCP_CA_Disorder ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
tp->snd_una == tp->high_seq ||
- (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
- tcp_ca_event(tp, CA_EVENT_FRTO);
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_FRTO);
}
/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1138,7 +1165,7 @@ void tcp_enter_frto(struct sock *sk)
}
tcp_sync_left_out(tp);
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
tp->frto_highmark = tp->snd_nxt;
}
@@ -1184,7 +1211,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
}
@@ -1208,16 +1235,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
*/
void tcp_enter_loss(struct sock *sk, int how)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
- (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
- tcp_ca_event(tp, CA_EVENT_LOSS);
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
@@ -1248,12 +1276,12 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
}
-static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+static int tcp_check_sack_reneging(struct sock *sk)
{
struct sk_buff *skb;
@@ -1265,12 +1293,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
*/
if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
tcp_enter_loss(sk, 1);
- tp->retransmits++;
+ icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
@@ -1281,15 +1311,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
-static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+ return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
}
static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
{
return tp->packets_out &&
- tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+ tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
}
/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1423,8 +1453,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
-static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
+ struct tcp_sock *tp = tcp_sk(sk);
u32 holes;
holes = max(tp->lost_out, 1U);
@@ -1432,16 +1463,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(tp, tp->packets_out+addend, 0);
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct tcp_sock *tp)
+static void tcp_add_reno_sack(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
tp->sacked_out++;
- tcp_check_reno_reordering(tp, 0);
+ tcp_check_reno_reordering(sk, 0);
tcp_sync_left_out(tp);
}
@@ -1456,7 +1488,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
else
tp->sacked_out -= acked-1;
}
- tcp_check_reno_reordering(tp, acked);
+ tcp_check_reno_reordering(sk, acked);
tcp_sync_left_out(tp);
}
@@ -1509,7 +1541,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
- if (tcp_skb_timedout(tp, skb) &&
+ if (tcp_skb_timedout(sk, skb) &&
!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -1530,14 +1562,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
}
/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct tcp_sock *tp)
+static void tcp_cwnd_down(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
int decr = tp->snd_cwnd_cnt + 1;
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
- if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
+ if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1571,11 +1605,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
#define DBGUNDO(x...) do { } while (0)
#endif
-static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+static void tcp_undo_cwr(struct sock *sk, const int undo)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (tp->prior_ssthresh) {
- if (tp->ca_ops->undo_cwnd)
- tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->undo_cwnd)
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
@@ -1603,9 +1641,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
- DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
- tcp_undo_cwr(tp, 1);
- if (tp->ca_state == TCP_CA_Loss)
+ DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwr(sk, 1);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
else
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1618,7 +1656,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
tcp_moderate_cwnd(tp);
return 1;
}
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
@@ -1627,7 +1665,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
{
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, tp, "D-SACK");
- tcp_undo_cwr(tp, 1);
+ tcp_undo_cwr(sk, 1);
tp->undo_marker = 0;
NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
}
@@ -1648,10 +1686,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
- tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
DBGUNDO(sk, tp, "Hoe");
- tcp_undo_cwr(tp, 0);
+ tcp_undo_cwr(sk, 0);
NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
/* So... Do not make Hoe's retransmit yet.
@@ -1674,22 +1712,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
- tcp_undo_cwr(tp, 1);
+ tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
- tp->retransmits = 0;
+ inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0;
if (!IsReno(tp))
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
return 1;
}
return 0;
}
-static inline void tcp_complete_cwr(struct tcp_sock *tp)
+static inline void tcp_complete_cwr(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_stamp = tcp_time_stamp;
- tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1700,21 +1739,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
- tcp_enter_cwr(tp);
+ tcp_enter_cwr(sk);
- if (tp->ca_state != TCP_CA_CWR) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
if (tp->left_out || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
- if (tp->ca_state != state) {
- tcp_set_ca_state(tp, state);
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
tcp_moderate_cwnd(tp);
} else {
- tcp_cwnd_down(tp);
+ tcp_cwnd_down(sk);
}
}
@@ -1733,6 +1772,7 @@ static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
int prior_packets, int flag)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
@@ -1750,13 +1790,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+ if (tp->sacked_out && tcp_check_sack_reneging(sk))
return;
/* C. Process data loss notification, provided it is valid. */
if ((flag&FLAG_DATA_LOST) &&
before(tp->snd_una, tp->high_seq) &&
- tp->ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1767,14 +1807,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
- if (tp->ca_state == TCP_CA_Open) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
if (!sysctl_tcp_frto)
BUG_TRAP(tp->retrans_out == 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
- switch (tp->ca_state) {
+ switch (icsk->icsk_ca_state) {
case TCP_CA_Loss:
- tp->retransmits = 0;
+ icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk, tp))
return;
break;
@@ -1783,8 +1823,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
- tcp_complete_cwr(tp);
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_complete_cwr(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -1795,7 +1835,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
* catching for all duplicate ACKs. */
IsReno(tp) || tp->snd_una != tp->high_seq) {
tp->undo_marker = 0;
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -1804,17 +1844,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk, tp))
return;
- tcp_complete_cwr(tp);
+ tcp_complete_cwr(sk);
break;
}
}
/* F. Process state. */
- switch (tp->ca_state) {
+ switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (prior_snd_una == tp->snd_una) {
if (IsReno(tp) && is_dupack)
- tcp_add_reno_sack(tp);
+ tcp_add_reno_sack(sk);
} else {
int acked = prior_packets - tp->packets_out;
if (IsReno(tp))
@@ -1824,13 +1864,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
break;
case TCP_CA_Loss:
if (flag&FLAG_DATA_ACKED)
- tp->retransmits = 0;
+ icsk->icsk_retransmits = 0;
if (!tcp_try_undo_loss(sk, tp)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
- if (tp->ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
@@ -1838,10 +1878,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
if (tp->snd_una != prior_snd_una)
tcp_reset_reno_sack(tp);
if (is_dupack)
- tcp_add_reno_sack(tp);
+ tcp_add_reno_sack(sk);
}
- if (tp->ca_state == TCP_CA_Disorder)
+ if (icsk->icsk_ca_state == TCP_CA_Disorder)
tcp_try_undo_dsack(sk, tp);
if (!tcp_time_to_recover(sk, tp)) {
@@ -1861,30 +1901,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
- if (tp->ca_state < TCP_CA_CWR) {
+ if (icsk->icsk_ca_state < TCP_CA_CWR) {
if (!(flag&FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
TCP_ECN_queue_cwr(tp);
}
tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(tp, TCP_CA_Recovery);
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
}
if (is_dupack || tcp_head_timedout(sk, tp))
tcp_update_scoreboard(sk, tp);
- tcp_cwnd_down(tp);
+ tcp_cwnd_down(sk);
tcp_xmit_retransmit_queue(sk);
}
/* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Superceeds RFC1323)
*/
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
{
- __u32 seq_rtt;
-
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
@@ -1900,14 +1938,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
- seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(tp, seq_rtt, usrtt);
- tcp_set_rto(tp);
- tp->backoff = 0;
- tcp_bound_rto(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_set_rto(sk);
+ inet_csk(sk)->icsk_backoff = 0;
+ tcp_bound_rto(sk);
}
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -1921,27 +1960,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
- tcp_rtt_estimator(tp, seq_rtt, usrtt);
- tcp_set_rto(tp);
- tp->backoff = 0;
- tcp_bound_rto(tp);
+ tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_set_rto(sk);
+ inet_csk(sk)->icsk_backoff = 0;
+ tcp_bound_rto(sk);
}
-static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
- int flag, s32 seq_rtt, u32 *usrtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+ const s32 seq_rtt, u32 *usrtt)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(tp, usrtt, flag);
+ tcp_ack_saw_tstamp(sk, usrtt, flag);
else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
+ tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
}
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int good)
{
- tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
+ tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
@@ -1951,9 +1992,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out) {
- tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
}
@@ -2068,9 +2109,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
seq_rtt = -1;
} else if (seq_rtt < 0)
seq_rtt = now - scb->when;
- if (seq_usrtt)
- *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
- + (usnow.tv_usec - skb->stamp.tv_usec);
+ if (seq_usrtt) {
+ struct timeval tv;
+
+ skb_get_timestamp(skb, &tv);
+ *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
+ + (usnow.tv_usec - tv.tv_usec);
+ }
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2085,16 +2130,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
seq_rtt = now - scb->when;
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
}
if (acked&FLAG_ACKED) {
- tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
tcp_ack_packets_out(sk, tp);
- if (tp->ca_ops->pkts_acked)
- tp->ca_ops->pkts_acked(tp, pkts_acked);
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
}
#if FASTRETRANS_DEBUG > 0
@@ -2102,19 +2148,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
if (!tp->packets_out && tp->rx_opt.sack_ok) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, tp->ca_state);
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, tp->ca_state);
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, tp->ca_state);
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -2125,40 +2172,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
static void tcp_ack_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
/* Was it a usable window open? */
if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
tp->snd_una + tp->snd_wnd)) {
- tp->backoff = 0;
- tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+ icsk->icsk_backoff = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
- tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+ TCP_RTO_MAX);
}
}
-static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- tp->ca_state != TCP_CA_Open);
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
}
-static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
- !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+ !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
- u32 ack_seq, u32 nwin)
+static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+ const u32 ack_seq, const u32 nwin)
{
return (after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
@@ -2189,6 +2239,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
+ tp->pred_flags = 0;
tcp_fast_path_check(sk, tp);
if (nwin > tp->max_window) {
@@ -2241,6 +2292,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
@@ -2268,7 +2320,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
- tcp_ca_event(tp, CA_EVENT_FAST_ACK);
+ tcp_ca_event(sk, CA_EVENT_FAST_ACK);
NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
} else {
@@ -2285,7 +2337,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
flag |= FLAG_ECE;
- tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
+ tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
@@ -2301,19 +2353,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
- tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
+ icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
- if (tcp_ack_is_dubious(tp, flag)) {
+ if (tcp_ack_is_dubious(sk, flag)) {
/* Advanve CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
- tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
+ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
} else {
if ((flag & FLAG_DATA_ACKED))
- tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
+ tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2322,7 +2374,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
return 1;
no_queue:
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -2500,8 +2552,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
-static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th = skb->h.th;
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -2516,14 +2569,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
-static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(tp, skb));
+ !tcp_disordered_ack(sk, skb));
}
/* Check segment sequence number for validity.
@@ -2586,7 +2640,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
@@ -2596,7 +2650,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- tp->ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -2694,7 +2748,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(tp);
+ tcp_enter_quickack_mode(sk);
if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -2853,7 +2907,7 @@ static void tcp_ofo_queue(struct sock *sk)
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received \n");
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
}
@@ -2861,7 +2915,7 @@ static void tcp_ofo_queue(struct sock *sk)
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &tp->out_of_order_queue);
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->h.th->fin)
@@ -2942,7 +2996,7 @@ queue_and_out:
* gap in queue is filled.
*/
if (skb_queue_empty(&tp->out_of_order_queue))
- tp->ack.pingpong = 0;
+ inet_csk(sk)->icsk_ack.pingpong = 0;
}
if (tp->rx_opt.num_sacks)
@@ -2963,8 +3017,8 @@ queue_and_out:
tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(tp);
- tcp_schedule_ack(tp);
+ tcp_enter_quickack_mode(sk);
+ inet_csk_schedule_ack(sk);
drop:
__kfree_skb(skb);
return;
@@ -2974,7 +3028,7 @@ drop:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
- tcp_enter_quickack_mode(tp);
+ tcp_enter_quickack_mode(sk);
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
@@ -3003,7 +3057,7 @@ drop:
/* Disable header prediction. */
tp->pred_flags = 0;
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3027,7 +3081,7 @@ drop:
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (seq == TCP_SKB_CB(skb1)->end_seq) {
- __skb_append(skb1, skb);
+ __skb_append(skb1, skb, &tp->out_of_order_queue);
if (!tp->rx_opt.num_sacks ||
tp->selective_acks[0].end_seq != seq)
@@ -3071,7 +3125,7 @@ drop:
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
break;
}
- __skb_unlink(skb1, skb1->list);
+ __skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
__kfree_skb(skb1);
}
@@ -3088,8 +3142,9 @@ add_sack:
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff *head,
- struct sk_buff *tail, u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+ struct sk_buff *head, struct sk_buff *tail,
+ u32 start, u32 end)
{
struct sk_buff *skb;
@@ -3099,7 +3154,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
skb = next;
@@ -3145,7 +3200,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_insert(nskb, skb->prev, skb, skb->list);
+ __skb_insert(nskb, skb->prev, skb, list);
sk_stream_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
@@ -3164,7 +3219,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
skb = next;
@@ -3200,7 +3255,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, head, skb, start, end);
+ tcp_collapse(sk, &tp->out_of_order_queue,
+ head, skb, start, end);
head = skb;
if (skb == (struct sk_buff *)&tp->out_of_order_queue)
break;
@@ -3237,7 +3293,8 @@ static int tcp_prune_queue(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
- tcp_collapse(sk, sk->sk_receive_queue.next,
+ tcp_collapse(sk, &sk->sk_receive_queue,
+ sk->sk_receive_queue.next,
(struct sk_buff*)&sk->sk_receive_queue,
tp->copied_seq, tp->rcv_nxt);
sk_stream_mem_reclaim(sk);
@@ -3286,12 +3343,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->ca_state == TCP_CA_Open &&
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
/* Limited by application or receiver window. */
u32 win_used = max(tp->snd_cwnd_used, 2U);
if (win_used < tp->snd_cwnd) {
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
}
tp->snd_cwnd_used = 0;
@@ -3370,13 +3427,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
struct tcp_sock *tp = tcp_sk(sk);
/* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise). Or...
*/
&& __tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(tp) ||
+ tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
(ofo_possible &&
skb_peek(&tp->out_of_order_queue))) {
@@ -3390,8 +3447,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
static __inline__ void tcp_ack_snd_check(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_ack_scheduled(tp)) {
+ if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
return;
}
@@ -3462,7 +3518,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
}
}
@@ -3645,7 +3701,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
/* We know that such packets are checksummed
* on entry.
@@ -3678,7 +3734,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3699,7 +3755,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@@ -3719,7 +3775,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk, tp);
- if (!tcp_ack_scheduled(tp))
+ if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
@@ -3741,7 +3797,7 @@ slow_path:
* RFC1323: H1. Apply PAWS check first.
*/
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(tp, skb)) {
+ tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
@@ -3788,7 +3844,7 @@ step5:
if(th->ack)
tcp_ack(sk, skb, FLAG_SLOWPATH);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
@@ -3817,6 +3873,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_parse_options(skb, &tp->rx_opt, 0);
if (th->ack) {
+ struct inet_connection_sock *icsk;
/* rfc793:
* "If the state is SYN-SENT then
* first check the ACK bit
@@ -3920,7 +3977,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_metrics(sk);
- tcp_init_congestion_control(tp);
+ tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -3930,7 +3987,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_buffer_space(sk);
if (sock_flag(sk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -3942,7 +3999,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
sk_wake_async(sk, 0, POLL_OUT);
}
- if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+ icsk = inet_csk(sk);
+
+ if (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+ icsk->icsk_ack.pingpong) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@@ -3950,12 +4011,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
- tcp_schedule_ack(tp);
- tp->ack.lrcvtime = tcp_time_stamp;
- tp->ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(tp);
- tcp_enter_quickack_mode(tp);
- tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ inet_csk_schedule_ack(sk);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(sk);
+ tcp_enter_quickack_mode(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
__kfree_skb(skb);
@@ -4111,7 +4173,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(tp, skb)) {
+ tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
@@ -4180,7 +4242,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt)
- tcp_ack_saw_tstamp(tp, 0, 0);
+ tcp_ack_saw_tstamp(sk, NULL, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4192,7 +4254,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_metrics(sk);
- tcp_init_congestion_control(tp);
+ tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
@@ -4227,9 +4289,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 1;
}
- tmo = tcp_fin_time(tp);
+ tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
@@ -4237,7 +4299,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
- tcp_reset_keepalive_timer(sk, tmo);
+ inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 62f62bb05c2a..c85819d8474b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -64,7 +64,9 @@
#include <linux/times.h>
#include <net/icmp.h>
+#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
@@ -75,7 +77,6 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;
@@ -88,458 +89,29 @@ static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb);
-struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
- .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
- .__tcp_lhash_users = ATOMIC_INIT(0),
- .__tcp_lhash_wait
- = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
- .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
+struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
+ .lhash_lock = RW_LOCK_UNLOCKED,
+ .lhash_users = ATOMIC_INIT(0),
+ .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
+ .portalloc_lock = SPIN_LOCK_UNLOCKED,
+ .port_rover = 1024 - 1,
};
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
-
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
- __u32 faddr, __u16 fport)
-{
- int h = (laddr ^ lport) ^ (faddr ^ fport);
- h ^= h >> 16;
- h ^= h >> 8;
- return h & (tcp_ehash_size - 1);
-}
-
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- __u32 laddr = inet->rcv_saddr;
- __u16 lport = inet->num;
- __u32 faddr = inet->daddr;
- __u16 fport = inet->dport;
-
- return tcp_hashfn(laddr, lport, faddr, fport);
-}
-
-/* Allocate and initialize a new TCP local port bind bucket.
- * The bindhash mutex for snum's hash chain must be held here.
- */
-struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
- unsigned short snum)
-{
- struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
- SLAB_ATOMIC);
- if (tb) {
- tb->port = snum;
- tb->fastreuse = 0;
- INIT_HLIST_HEAD(&tb->owners);
- hlist_add_head(&tb->node, &head->chain);
- }
- return tb;
-}
-
-/* Caller must hold hashbucket lock for this tb with local BH disabled */
-void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
-{
- if (hlist_empty(&tb->owners)) {
- __hlist_del(&tb->node);
- kmem_cache_free(tcp_bucket_cachep, tb);
- }
-}
-
-/* Caller must disable local BH processing. */
-static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
-{
- struct tcp_bind_hashbucket *head =
- &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
- struct tcp_bind_bucket *tb;
-
- spin_lock(&head->lock);
- tb = tcp_sk(sk)->bind_hash;
- sk_add_bind_node(child, &tb->owners);
- tcp_sk(child)->bind_hash = tb;
- spin_unlock(&head->lock);
-}
-
-inline void tcp_inherit_port(struct sock *sk, struct sock *child)
-{
- local_bh_disable();
- __tcp_inherit_port(sk, child);
- local_bh_enable();
-}
-
-void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
- unsigned short snum)
-{
- inet_sk(sk)->num = snum;
- sk_add_bind_node(sk, &tb->owners);
- tcp_sk(sk)->bind_hash = tb;
-}
-
-static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
- const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
- struct sock *sk2;
- struct hlist_node *node;
- int reuse = sk->sk_reuse;
-
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- !tcp_v6_ipv6only(sk2) &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr ||
- sk2_rcv_saddr == sk_rcv_saddr)
- break;
- }
- }
- }
- return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
- struct tcp_bind_hashbucket *head;
- struct hlist_node *node;
- struct tcp_bind_bucket *tb;
- int ret;
-
- local_bh_disable();
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover;
-
- spin_lock(&tcp_portalloc_lock);
- if (tcp_port_rover < low)
- rover = low;
- else
- rover = tcp_port_rover;
- do {
- rover++;
- if (rover > high)
- rover = low;
- head = &tcp_bhash[tcp_bhashfn(rover)];
- spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
- if (tb->port == rover)
- goto next;
- break;
- next:
- spin_unlock(&head->lock);
- } while (--remaining > 0);
- tcp_port_rover = rover;
- spin_unlock(&tcp_portalloc_lock);
-
- /* Exhausted local port range during search? */
- ret = 1;
- if (remaining <= 0)
- goto fail;
-
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold it's mutex.
- */
- snum = rover;
- } else {
- head = &tcp_bhash[tcp_bhashfn(snum)];
- spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
- if (tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse > 1)
- goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
- goto success;
- } else {
- ret = 1;
- if (tcp_bind_conflict(sk, tb))
- goto fail_unlock;
- }
- }
-tb_not_found:
- ret = 1;
- if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
-success:
- if (!tcp_sk(sk)->bind_hash)
- tcp_bind_hash(sk, tb, snum);
- BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
- ret = 0;
-
-fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
- return ret;
-}
-
-/* Get rid of any references to a local port held by the
- * given sock.
- */
-static void __tcp_put_port(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
- struct tcp_bind_bucket *tb;
-
- spin_lock(&head->lock);
- tb = tcp_sk(sk)->bind_hash;
- __sk_del_bind_node(sk);
- tcp_sk(sk)->bind_hash = NULL;
- inet->num = 0;
- tcp_bucket_destroy(tb);
- spin_unlock(&head->lock);
-}
-
-void tcp_put_port(struct sock *sk)
-{
- local_bh_disable();
- __tcp_put_port(sk);
- local_bh_enable();
-}
-
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-
-void tcp_listen_wlock(void)
-{
- write_lock(&tcp_lhash_lock);
-
- if (atomic_read(&tcp_lhash_users)) {
- DEFINE_WAIT(wait);
-
- for (;;) {
- prepare_to_wait_exclusive(&tcp_lhash_wait,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&tcp_lhash_users))
- break;
- write_unlock_bh(&tcp_lhash_lock);
- schedule();
- write_lock_bh(&tcp_lhash_lock);
- }
-
- finish_wait(&tcp_lhash_wait, &wait);
- }
-}
-
-static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
-{
- struct hlist_head *list;
- rwlock_t *lock;
-
- BUG_TRAP(sk_unhashed(sk));
- if (listen_possible && sk->sk_state == TCP_LISTEN) {
- list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- lock = &tcp_lhash_lock;
- tcp_listen_wlock();
- } else {
- list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
- write_lock(lock);
- }
- __sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
- if (listen_possible && sk->sk_state == TCP_LISTEN)
- wake_up(&tcp_lhash_wait);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}
static void tcp_v4_hash(struct sock *sk)
{
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
- __tcp_v4_hash(sk, 1);
- local_bh_enable();
- }
+ inet_hash(&tcp_hashinfo, sk);
}
void tcp_unhash(struct sock *sk)
{
- rwlock_t *lock;
-
- if (sk_unhashed(sk))
- goto ende;
-
- if (sk->sk_state == TCP_LISTEN) {
- local_bh_disable();
- tcp_listen_wlock();
- lock = &tcp_lhash_lock;
- } else {
- struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
- lock = &head->lock;
- write_lock_bh(&head->lock);
- }
-
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(lock);
-
- ende:
- if (sk->sk_state == TCP_LISTEN)
- wake_up(&tcp_lhash_wait);
-}
-
-/* Don't inline this cruft. Here are some nice properties to
- * exploit here. The BSD API does not allow a listening TCP
- * to specify the remote port nor the remote address for the
- * connection. So always assume those are both wildcarded
- * during the search since they can never be otherwise.
- */
-static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
- unsigned short hnum, int dif)
-{
- struct sock *result = NULL, *sk;
- struct hlist_node *node;
- int score, hiscore;
-
- hiscore=-1;
- sk_for_each(sk, node, head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (inet->num == hnum && !ipv6_only_sock(sk)) {
- __u32 rcv_saddr = inet->rcv_saddr;
-
- score = (sk->sk_family == PF_INET ? 1 : 0);
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- continue;
- score+=2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score+=2;
- }
- if (score == 5)
- return sk;
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
- }
- }
- return result;
-}
-
-/* Optimize the common listener case. */
-static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
- unsigned short hnum, int dif)
-{
- struct sock *sk = NULL;
- struct hlist_head *head;
-
- read_lock(&tcp_lhash_lock);
- head = &tcp_listening_hash[tcp_lhashfn(hnum)];
- if (!hlist_empty(head)) {
- struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
- if (inet->num == hnum && !sk->sk_node.next &&
- (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
- (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
- !sk->sk_bound_dev_if)
- goto sherry_cache;
- sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
- }
- if (sk) {
-sherry_cache:
- sock_hold(sk);
- }
- read_unlock(&tcp_lhash_lock);
- return sk;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * Local BH must be disabled here.
- */
-
-static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
- u32 daddr, u16 hnum,
- int dif)
-{
- struct tcp_ehash_bucket *head;
- TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
- struct hlist_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
- head = &tcp_ehash[hash];
- read_lock(&head->lock);
- sk_for_each(sk, node, &head->chain) {
- if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit; /* You sunk my battleship! */
- }
-
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
- if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit;
- }
- sk = NULL;
-out:
- read_unlock(&head->lock);
- return sk;
-hit:
- sock_hold(sk);
- goto out;
-}
-
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
- u32 daddr, u16 hnum, int dif)
-{
- struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
- daddr, hnum, dif);
-
- return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
- u16 dport, int dif)
-{
- struct sock *sk;
-
- local_bh_disable();
- sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
-
- return sk;
+ inet_unhash(&tcp_hashinfo, sk);
}
-EXPORT_SYMBOL_GPL(tcp_v4_lookup);
-
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
return secure_tcp_sequence_number(skb->nh.iph->daddr,
@@ -550,27 +122,29 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
- struct tcp_tw_bucket **twp)
+ struct inet_timewait_sock **twp)
{
struct inet_sock *inet = inet_sk(sk);
u32 daddr = inet->rcv_saddr;
u32 saddr = inet->daddr;
int dif = sk->sk_bound_dev_if;
- TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
- struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
struct sock *sk2;
- struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+ prefetch(head->chain.first);
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
- tw = (struct tcp_tw_bucket *)sk2;
+ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+ tw = inet_twsk(sk2);
- if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
struct tcp_sock *tp = tcp_sk(sk);
/* With PAWS, it is safe from the viewpoint
@@ -587,15 +161,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
fall back to VJ's scheme and use initial
timestamp retrieved from peer table.
*/
- if (tw->tw_ts_recent_stamp &&
+ if (tcptw->tw_ts_recent_stamp &&
(!twp || (sysctl_tcp_tw_reuse &&
xtime.tv_sec -
- tw->tw_ts_recent_stamp > 1))) {
- if ((tp->write_seq =
- tw->tw_snd_nxt + 65535 + 2) == 0)
+ tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
- tp->rx_opt.ts_recent = tw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
sock_hold(sk2);
goto unique;
} else
@@ -606,7 +180,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
/* And established part... */
sk_for_each(sk2, node, &head->chain) {
- if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
goto not_unique;
}
@@ -615,7 +189,7 @@ unique:
* in hash table socket with a funny identity. */
inet->num = lport;
inet->sport = htons(lport);
- sk->sk_hashent = hash;
+ sk->sk_hash = hash;
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
@@ -626,10 +200,10 @@ unique:
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
- tcp_tw_deschedule(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
return 0;
@@ -652,9 +226,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
*/
static inline int tcp_v4_hash_connect(struct sock *sk)
{
- unsigned short snum = inet_sk(sk)->num;
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
int ret;
if (!snum) {
@@ -666,19 +240,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
static u32 hint;
u32 offset = hint + connect_port_offset(sk);
struct hlist_node *node;
- struct tcp_tw_bucket *tw = NULL;
+ struct inet_timewait_sock *tw = NULL;
local_bh_disable();
for (i = 1; i <= range; i++) {
port = low + (i + offset) % range;
- head = &tcp_bhash[tcp_bhashfn(port)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
- tb_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->port == port) {
BUG_TRAP(!hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
@@ -691,7 +265,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
}
}
- tb = tcp_bucket_create(head, port);
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
@@ -710,27 +284,27 @@ ok:
hint += i;
/* Head lock still held and bh's disabled */
- tcp_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
- __tcp_v4_hash(sk, 0);
+ __inet_hash(&tcp_hashinfo, sk, 0);
}
spin_unlock(&head->lock);
if (tw) {
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);;
+ inet_twsk_put(tw);
}
ret = 0;
goto out;
}
- head = &tcp_bhash[tcp_bhashfn(snum)];
- tb = tcp_sk(sk)->bind_hash;
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __tcp_v4_hash(sk, 0);
+ __inet_hash(&tcp_hashinfo, sk, 0);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -793,7 +367,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (sysctl_tcp_tw_recycle &&
+ if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
@@ -832,8 +406,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto failure;
/* OK, now commit destination to socket. */
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->u.dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -859,53 +432,6 @@ failure:
return err;
}
-static __inline__ int tcp_v4_iif(struct sk_buff *skb)
-{
- return ((struct rtable *)skb->dst)->rt_iif;
-}
-
-static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-{
- return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-}
-
-static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
- struct request_sock ***prevp,
- __u16 rport,
- __u32 raddr, __u32 laddr)
-{
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- struct request_sock *req, **prev;
-
- for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->rmt_port == rport &&
- ireq->rmt_addr == raddr &&
- ireq->loc_addr == laddr &&
- TCP_INET_FAMILY(req->rsk_ops->family)) {
- BUG_TRAP(!req->sk);
- *prevp = prev;
- break;
- }
- }
-
- return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
- reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
- tcp_synq_added(sk);
-}
-
-
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
@@ -988,14 +514,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
return;
}
- sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
- th->source, tcp_v4_iif(skb));
+ sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
+ th->source, inet_iif(skb));
if (!sk) {
ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
return;
}
if (sk->sk_state == TCP_TIME_WAIT) {
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
return;
}
@@ -1049,8 +575,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v4_search_req(tp, &prev, th->dest,
- iph->daddr, iph->saddr);
+ req = inet_csk_search_req(sk, &prev, th->dest,
+ iph->daddr, iph->saddr);
if (!req)
goto out;
@@ -1070,7 +596,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
case TCP_SYN_SENT:
@@ -1240,12 +766,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
- tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1254,36 +781,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
req->ts_recent);
}
-static struct dst_entry* tcp_v4_route_req(struct sock *sk,
- struct request_sock *req)
-{
- struct rtable *rt;
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct ip_options *opt = inet_rsk(req)->opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = ((opt && opt->srr) ?
- opt->faddr :
- ireq->rmt_addr),
- .saddr = ireq->loc_addr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = inet_sk(sk)->sport,
- .dport = ireq->rmt_port } } };
-
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
- ip_rt_put(rt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- return &rt->u.dst;
-}
-
/*
* Send a SYN-ACK after having received an ACK.
* This still operates on a request_sock only, not on a big
@@ -1297,7 +794,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
struct sk_buff * skb;
/* First, grab a route. */
- if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto out;
skb = tcp_make_synack(sk, dst, req);
@@ -1399,7 +896,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if (tcp_synq_is_full(sk) && !isn) {
+ if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
@@ -1413,7 +910,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
- if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
req = reqsk_alloc(&tcp_request_sock_ops);
@@ -1469,8 +966,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &&
- sysctl_tcp_tw_recycle &&
- (dst = tcp_v4_route_req(sk, req)) != NULL &&
+ tcp_death_row.sysctl_tw_recycle &&
+ (dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1483,7 +980,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1494,12 +991,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* to destinations, already remembered
* to the moment of synflood.
*/
- NETDEBUG(if (net_ratelimit()) \
- printk(KERN_DEBUG "TCP: drop open "
- "request from %u.%u."
- "%u.%u/%u\n", \
- NIPQUAD(saddr),
- ntohs(skb->h.th->source)));
+ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
+ "request from %u.%u.%u.%u/%u\n",
+ NIPQUAD(saddr),
+ ntohs(skb->h.th->source));
dst_release(dst);
goto drop_and_free;
}
@@ -1514,7 +1009,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (want_cookie) {
reqsk_free(req);
} else {
- tcp_v4_synq_add(sk, req);
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
return 0;
@@ -1542,15 +1037,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
- newsk->sk_dst_cache = dst;
- tcp_v4_setup_caps(newsk, dst);
+ sk_setup_caps(newsk, dst);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
@@ -1560,7 +1054,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->saddr = ireq->loc_addr;
newinet->opt = ireq->opt;
ireq->opt = NULL;
- newinet->mc_index = tcp_v4_iif(skb);
+ newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = skb->nh.iph->ttl;
newtp->ext_header_len = 0;
if (newinet->opt)
@@ -1571,8 +1065,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);
- __tcp_v4_hash(newsk, 0);
- __tcp_inherit_port(sk, newsk);
+ __inet_hash(&tcp_hashinfo, newsk, 0);
+ __inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1588,27 +1082,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = skb->h.th;
struct iphdr *iph = skb->nh.iph;
- struct tcp_sock *tp = tcp_sk(sk);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
- struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
- iph->saddr, iph->daddr);
+ struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+ iph->saddr, iph->daddr);
if (req)
return tcp_check_req(sk, skb, req, prev);
- nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
- th->source,
- skb->nh.iph->daddr,
- ntohs(th->dest),
- tcp_v4_iif(skb));
+ nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+ th->source, skb->nh.iph->daddr,
+ ntohs(th->dest), inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- tcp_tw_put((struct tcp_tw_bucket *)nsk);
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
return NULL;
}
@@ -1627,8 +1118,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
skb->nh.iph->daddr, skb->csum))
return 0;
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
skb->ip_summed = CHECKSUM_NONE;
}
if (skb->len <= 76) {
@@ -1744,9 +1234,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
TCP_SKB_CB(skb)->sacked = 0;
- sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, ntohs(th->dest),
- tcp_v4_iif(skb));
+ sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, ntohs(th->dest),
+ inet_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1798,24 +1288,26 @@ discard_and_relse:
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
}
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(TCP_MIB_INERRS);
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
}
- switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, skb->len)) {
+ switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+ skb, th)) {
case TCP_TW_SYN: {
- struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
- ntohs(th->dest),
- tcp_v4_iif(skb));
+ struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
+ skb->nh.iph->daddr,
+ ntohs(th->dest),
+ inet_iif(skb));
if (sk2) {
- tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ inet_twsk_deschedule((struct inet_timewait_sock *)sk,
+ &tcp_death_row);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
sk = sk2;
goto process;
}
@@ -1831,112 +1323,6 @@ do_time_wait:
goto discard_it;
}
-/* With per-bucket locks this operation is not-atomic, so that
- * this version is not worse.
- */
-static void __tcp_v4_rehash(struct sock *sk)
-{
- sk->sk_prot->unhash(sk);
- sk->sk_prot->hash(sk);
-}
-
-static int tcp_v4_reselect_saddr(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- int err;
- struct rtable *rt;
- __u32 old_saddr = inet->saddr;
- __u32 new_saddr;
- __u32 daddr = inet->daddr;
-
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-
- /* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- IPPROTO_TCP,
- inet->sport, inet->dport, sk);
- if (err)
- return err;
-
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
-
- new_saddr = rt->rt_src;
-
- if (new_saddr == old_saddr)
- return 0;
-
- if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(old_saddr),
- NIPQUAD(new_saddr));
- }
-
- inet->saddr = new_saddr;
- inet->rcv_saddr = new_saddr;
-
- /* XXX The only one ugly spot where we need to
- * XXX really change the sockets identity after
- * XXX it has entered the hashes. -DaveM
- *
- * Besides that, it does not check for connection
- * uniqueness. Wait for troubles.
- */
- __tcp_v4_rehash(sk);
- return 0;
-}
-
-int tcp_v4_rebuild_header(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
- u32 daddr;
- int err;
-
- /* Route is OK, nothing to do. */
- if (rt)
- return 0;
-
- /* Reroute. */
- daddr = inet->daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = inet->sport,
- .dport = inet->dport } } };
-
- err = ip_route_output_flow(&rt, &fl, sk, 0);
- }
- if (!err) {
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
- return 0;
- }
-
- /* Routing failed... */
- sk->sk_route_caps = 0;
-
- if (!sysctl_ip_dynaddr ||
- sk->sk_state != TCP_SYN_SENT ||
- (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
- (err = tcp_v4_reselect_saddr(sk)) != 0)
- sk->sk_err_soft = -err;
-
- return err;
-}
-
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1985,18 +1371,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
return 0;
}
-int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
- struct inet_peer *peer = NULL;
-
- peer = inet_getpeer(tw->tw_daddr, 1);
+ struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
if (peer) {
- if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+ const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
- peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
- peer->tcp_ts = tw->tw_ts_recent;
+ peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+ peer->tcp_ts = tcptw->tw_ts_recent;
}
inet_putpeer(peer);
return 1;
@@ -2008,7 +1394,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
struct tcp_func ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
- .rebuild_header = tcp_v4_rebuild_header,
+ .rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.remember_stamp = tcp_v4_remember_stamp,
@@ -2024,13 +1410,14 @@ struct tcp_func ipv4_specific = {
*/
static int tcp_v4_init_sock(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
- tp->rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
@@ -2048,7 +1435,7 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
- tp->ca_ops = &tcp_init_congestion_ops;
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
@@ -2071,7 +1458,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
tcp_clear_xmit_timers(sk);
- tcp_cleanup_congestion_control(tp);
+ tcp_cleanup_congestion_control(sk);
/* Cleanup up the write buffer. */
sk_stream_writequeue_purge(sk);
@@ -2083,8 +1470,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
__skb_queue_purge(&tp->ucopy.prequeue);
/* Clean up a referenced TCP bind bucket. */
- if (tp->bind_hash)
- tcp_put_port(sk);
+ if (inet_csk(sk)->icsk_bind_hash)
+ inet_put_port(&tcp_hashinfo, sk);
/*
* If sendmsg cached page exists, toss it.
@@ -2104,13 +1491,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
return hlist_empty(head) ? NULL :
- list_entry(head->first, struct tcp_tw_bucket, tw_node);
+ list_entry(head->first, struct inet_timewait_sock, tw_node);
}
-static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
return tw->tw_node.next ?
hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2118,14 +1505,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_sock *tp;
+ struct inet_connection_sock *icsk;
struct hlist_node *node;
struct sock *sk = cur;
struct tcp_iter_state* st = seq->private;
if (!sk) {
st->bucket = 0;
- sk = sk_head(&tcp_listening_hash[0]);
+ sk = sk_head(&tcp_hashinfo.listening_hash[0]);
goto get_sk;
}
@@ -2134,7 +1521,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
- tp = tcp_sk(st->syn_wait_sk);
+ icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next;
while (1) {
while (req) {
@@ -2147,17 +1534,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (++st->sbucket >= TCP_SYNQ_HSIZE)
break;
get_req:
- req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
+ req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
}
sk = sk_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} else {
- tp = tcp_sk(sk);
- read_lock_bh(&tp->accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&tp->accept_queue))
+ icsk = inet_csk(sk);
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue))
goto start_req;
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
sk = sk_next(sk);
}
get_sk:
@@ -2166,9 +1553,9 @@ get_sk:
cur = sk;
goto out;
}
- tp = tcp_sk(sk);
- read_lock_bh(&tp->accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&tp->accept_queue)) {
+ icsk = inet_csk(sk);
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
st->uid = sock_i_uid(sk);
st->syn_wait_sk = sk;
@@ -2176,10 +1563,10 @@ start_req:
st->sbucket = 0;
goto get_req;
}
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
- if (++st->bucket < TCP_LHTABLE_SIZE) {
- sk = sk_head(&tcp_listening_hash[st->bucket]);
+ if (++st->bucket < INET_LHTABLE_SIZE) {
+ sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
goto get_sk;
}
cur = NULL;
@@ -2203,16 +1590,16 @@ static void *established_get_first(struct seq_file *seq)
struct tcp_iter_state* st = seq->private;
void *rc = NULL;
- for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+ for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
struct sock *sk;
struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ struct inet_timewait_sock *tw;
/* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- read_lock(&tcp_ehash[st->bucket].lock);
- sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
}
@@ -2220,15 +1607,15 @@ static void *established_get_first(struct seq_file *seq)
goto out;
}
st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw_for_each(tw, node,
- &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+ inet_twsk_for_each(tw, node,
+ &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
if (tw->tw_family != st->family) {
continue;
}
rc = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2238,7 +1625,7 @@ out:
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct sock *sk = cur;
- struct tcp_tw_bucket *tw;
+ struct inet_timewait_sock *tw;
struct hlist_node *node;
struct tcp_iter_state* st = seq->private;
@@ -2255,15 +1642,15 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* We can reschedule between buckets: */
cond_resched_softirq();
- if (++st->bucket < tcp_ehash_size) {
- read_lock(&tcp_ehash[st->bucket].lock);
- sk = sk_head(&tcp_ehash[st->bucket].chain);
+ if (++st->bucket < tcp_hashinfo.ehash_size) {
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
} else {
cur = NULL;
goto out;
@@ -2277,7 +1664,7 @@ get_tw:
}
st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+ tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
goto get_tw;
found:
cur = sk;
@@ -2301,12 +1688,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
void *rc;
struct tcp_iter_state* st = seq->private;
- tcp_listen_lock();
+ inet_listen_lock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_idx(seq, &pos);
if (!rc) {
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
@@ -2339,7 +1726,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_first(seq);
@@ -2362,17 +1749,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
if (v) {
- struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
break;
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
local_bh_enable();
break;
}
@@ -2469,18 +1856,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
int timer_active;
unsigned long timer_expires;
struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_connection_sock *icsk = inet_csk(sp);
struct inet_sock *inet = inet_sk(sp);
unsigned int dest = inet->daddr;
unsigned int src = inet->rcv_saddr;
__u16 destp = ntohs(inet->dport);
__u16 srcp = ntohs(inet->sport);
- if (tp->pending == TCP_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
timer_active = 1;
- timer_expires = tp->timeout;
- } else if (tp->pending == TCP_TIME_PROBE0) {
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = tp->timeout;
+ timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sp->sk_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
@@ -2495,17 +1883,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
timer_active,
jiffies_to_clock_t(timer_expires - jiffies),
- tp->retransmits,
+ icsk->icsk_retransmits,
sock_i_uid(sp),
- tp->probes_out,
+ icsk->icsk_probes_out,
sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+ icsk->icsk_rto,
+ icsk->icsk_ack.ato,
+ (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
-static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
unsigned int dest, src;
__u16 destp, srcp;
@@ -2585,7 +1975,7 @@ struct proto tcp_prot = {
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
- .accept = tcp_accept,
+ .accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
@@ -2600,6 +1990,7 @@ struct proto tcp_prot = {
.get_port = tcp_v4_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
+ .orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
@@ -2607,6 +1998,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
.rsk_prot = &tcp_request_sock_ops,
};
@@ -2628,19 +2020,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
}
EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_bind_hash);
-EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_inherit_port);
-EXPORT_SYMBOL(tcp_listen_wlock);
-EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f42a284164b7..b1a63b2c6b4a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
#define SYNC_INIT 1
#endif
-int sysctl_tcp_tw_recycle;
-int sysctl_tcp_max_tw_buckets = NR_FILE*2;
-
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+struct inet_timewait_death_row tcp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+ .death_lock = SPIN_LOCK_UNLOCKED,
+ .hashinfo = &tcp_hashinfo,
+ .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
+ (unsigned long)&tcp_death_row),
+ .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
+ inet_twdr_twkill_work,
+ &tcp_death_row),
+/* Short-time timewait calendar */
+
+ .twcal_hand = -1,
+ .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+ (unsigned long)&tcp_death_row),
+};
+
+EXPORT_SYMBOL_GPL(tcp_death_row);
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return (seq == e_win && seq == end_seq);
}
-/* New-style handling of TIME_WAIT sockets. */
-
-int tcp_tw_count;
-
-
-/* Must be called with locally disabled BHs. */
-static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
-{
- struct tcp_ehash_bucket *ehead;
- struct tcp_bind_hashbucket *bhead;
- struct tcp_bind_bucket *tb;
-
- /* Unlink from established hashes. */
- ehead = &tcp_ehash[tw->tw_hashent];
- write_lock(&ehead->lock);
- if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
- return;
- }
- __hlist_del(&tw->tw_node);
- sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
-
- /* Disassociate with bind bucket. */
- bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
- spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- tcp_bucket_destroy(tb);
- spin_unlock(&bhead->lock);
-
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
- atomic_read(&tw->tw_refcnt));
- }
-#endif
- tcp_tw_put(tw);
-}
-
/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* to avoid misread sequence numbers, states etc. --ANK
*/
enum tcp_tw_status
-tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ const struct tcphdr *th)
{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
struct tcp_options_received tmp_opt;
int paws_reject = 0;
tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tcp_parse_options(skb, &tmp_opt, 0);
if (tmp_opt.saw_tstamp) {
- tmp_opt.ts_recent = tw->tw_ts_recent;
- tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tmp_opt.ts_recent = tcptw->tw_ts_recent;
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_check(&tmp_opt, th->rst);
}
}
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tw->tw_rcv_nxt,
- tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+ tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
return TCP_TW_ACK;
if (th->rst)
goto kill;
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
goto kill_with_rst;
/* Dup ACK? */
- if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+ if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
return TCP_TW_RST;
}
/* FIN arrived, enter true time-wait state. */
- tw->tw_substate = TCP_TIME_WAIT;
- tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tw->tw_substate = TCP_TIME_WAIT;
+ tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tmp_opt.saw_tstamp) {
- tw->tw_ts_recent_stamp = xtime.tv_sec;
- tw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = xtime.tv_sec;
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
/* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
* do not undertsnad recycling in any case, it not
* a big problem in practice. --ANK */
if (tw->tw_family == AF_INET &&
- sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+ tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
tcp_v4_tw_remember_stamp(tw))
- tcp_tw_schedule(tw, tw->tw_timeout);
+ inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
+ TCP_TIMEWAIT_LEN);
else
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -213,7 +189,7 @@ kill_with_rst:
*/
if (!paws_reject &&
- (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
@@ -224,19 +200,20 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
}
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
- tw->tw_ts_recent = tmp_opt.rcv_tsval;
- tw->tw_ts_recent_stamp = xtime.tv_sec;
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = xtime.tv_sec;
}
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -258,9 +235,10 @@ kill:
*/
if (th->syn && !th->rst && !th->ack && !paws_reject &&
- (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
- (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
- u32 isn = tw->tw_snd_nxt + 65535 + 2;
+ (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (tmp_opt.saw_tstamp &&
+ (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
*/
return TCP_TW_ACK;
}
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
-/* Enter the time wait state. This is called with locally disabled BH.
- * Essentially we whip up a timewait bucket, copy the
- * relevant info into it from the SK, and mess with hash chains
- * and list linkage.
- */
-static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
-{
- struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
- struct tcp_bind_hashbucket *bhead;
-
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
- binding cache, even if it is closed.
- */
- bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
- spin_lock(&bhead->lock);
- tw->tw_tb = tcp_sk(sk)->bind_hash;
- BUG_TRAP(tcp_sk(sk)->bind_hash);
- tw_add_bind_node(tw, &tw->tw_tb->owners);
- spin_unlock(&bhead->lock);
-
- write_lock(&ehead->lock);
-
- /* Step 2: Remove SK from established hash. */
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
-
- /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
- tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
- atomic_inc(&tw->tw_refcnt);
-
- write_unlock(&ehead->lock);
-}
-
/*
* Move a socket to time-wait or dead fin-wait-2 state.
*/
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
- struct tcp_tw_bucket *tw = NULL;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_timewait_sock *tw = NULL;
+ const struct tcp_sock *tp = tcp_sk(sk);
int recycle_ok = 0;
- if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tp->af_specific->remember_stamp(sk);
- if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
- tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
-
- if(tw != NULL) {
- struct inet_sock *inet = inet_sk(sk);
- int rto = (tp->rto<<2) - (tp->rto>>1);
-
- /* Give us an identity. */
- tw->tw_daddr = inet->daddr;
- tw->tw_rcv_saddr = inet->rcv_saddr;
- tw->tw_bound_dev_if = sk->sk_bound_dev_if;
- tw->tw_num = inet->num;
- tw->tw_state = TCP_TIME_WAIT;
- tw->tw_substate = state;
- tw->tw_sport = inet->sport;
- tw->tw_dport = inet->dport;
- tw->tw_family = sk->sk_family;
- tw->tw_reuse = sk->sk_reuse;
- tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
- atomic_set(&tw->tw_refcnt, 1);
+ if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+ tw = inet_twsk_alloc(sk, state);
- tw->tw_hashent = sk->sk_hashent;
- tw->tw_rcv_nxt = tp->rcv_nxt;
- tw->tw_snd_nxt = tp->snd_nxt;
- tw->tw_rcv_wnd = tcp_receive_window(tp);
- tw->tw_ts_recent = tp->rx_opt.ts_recent;
- tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
- tw_dead_node_init(tw);
+ if (tw != NULL) {
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+ tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ tcptw->tw_rcv_nxt = tp->rcv_nxt;
+ tcptw->tw_snd_nxt = tp->snd_nxt;
+ tcptw->tw_rcv_wnd = tcp_receive_window(tp);
+ tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
- tw->tw_v6_ipv6only = np->ipv6only;
- } else {
- memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
- memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
- tw->tw_v6_ipv6only = 0;
+ ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6only = np->ipv6only;
}
#endif
/* Linkage updates. */
- __tcp_tw_hashdance(sk, tw);
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
timeo = TCP_TIMEWAIT_LEN;
}
- tcp_tw_schedule(tw, timeo);
- tcp_tw_put(tw);
+ inet_twsk_schedule(tw, &tcp_death_row, timeo,
+ TCP_TIMEWAIT_LEN);
+ inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcp_done(sk);
}
-/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-static int tcp_tw_death_row_slot;
-
-static void tcp_twkill(unsigned long);
-
-/* TIME_WAIT reaping mechanism. */
-#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
-#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
-
-#define TCP_TWKILL_QUOTA 100
-
-static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static DEFINE_SPINLOCK(tw_death_lock);
-static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
-static void twkill_work(void *);
-static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
-static u32 twkill_thread_slots;
-
-/* Returns non-zero if quota exceeded. */
-static int tcp_do_twkill_work(int slot, unsigned int quota)
-{
- struct tcp_tw_bucket *tw;
- struct hlist_node *node;
- unsigned int killed;
- int ret;
-
- /* NOTE: compare this to previous version where lock
- * was released after detaching chain. It was racy,
- * because tw buckets are scheduled in not serialized context
- * in 2.3 (with netfilter), and with softnet it is common, because
- * soft irqs are not sequenced.
- */
- killed = 0;
- ret = 0;
-rescan:
- tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
- __tw_del_dead_node(tw);
- spin_unlock(&tw_death_lock);
- tcp_timewait_kill(tw);
- tcp_tw_put(tw);
- killed++;
- spin_lock(&tw_death_lock);
- if (killed > quota) {
- ret = 1;
- break;
- }
-
- /* While we dropped tw_death_lock, another cpu may have
- * killed off the next TW bucket in the list, therefore
- * do a fresh re-read of the hlist head node with the
- * lock reacquired. We still use the hlist traversal
- * macro in order to get the prefetches.
- */
- goto rescan;
- }
-
- tcp_tw_count -= killed;
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
- return ret;
-}
-
-static void tcp_twkill(unsigned long dummy)
-{
- int need_timer, ret;
-
- spin_lock(&tw_death_lock);
-
- if (tcp_tw_count == 0)
- goto out;
-
- need_timer = 0;
- ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
- if (ret) {
- twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
- mb();
- schedule_work(&tcp_twkill_work);
- need_timer = 1;
- } else {
- /* We purged the entire slot, anything left? */
- if (tcp_tw_count)
- need_timer = 1;
- }
- tcp_tw_death_row_slot =
- ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
- if (need_timer)
- mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
-out:
- spin_unlock(&tw_death_lock);
-}
-
-extern void twkill_slots_invalid(void);
-
-static void twkill_work(void *dummy)
-{
- int i;
-
- if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
- twkill_slots_invalid();
-
- while (twkill_thread_slots) {
- spin_lock_bh(&tw_death_lock);
- for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
- if (!(twkill_thread_slots & (1 << i)))
- continue;
-
- while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
- if (need_resched()) {
- spin_unlock_bh(&tw_death_lock);
- schedule();
- spin_lock_bh(&tw_death_lock);
- }
- }
-
- twkill_thread_slots &= ~(1 << i);
- }
- spin_unlock_bh(&tw_death_lock);
- }
-}
-
-/* These are always called from BH context. See callers in
- * tcp_input.c to verify this.
- */
-
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
-{
- spin_lock(&tw_death_lock);
- if (tw_del_dead_node(tw)) {
- tcp_tw_put(tw);
- if (--tcp_tw_count == 0)
- del_timer(&tcp_tw_timer);
- }
- spin_unlock(&tw_death_lock);
- tcp_timewait_kill(tw);
-}
-
-/* Short-time timewait calendar */
-
-static int tcp_twcal_hand = -1;
-static int tcp_twcal_jiffie;
-static void tcp_twcal_tick(unsigned long);
-static struct timer_list tcp_twcal_timer =
- TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
-static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
-
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
-{
- struct hlist_head *list;
- int slot;
-
- /* timeout := RTO * 3.5
- *
- * 3.5 = 1+2+0.5 to wait for two retransmits.
- *
- * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
- * our ACK acking that FIN can be lost. If N subsequent retransmitted
- * FINs (or previous seqments) are lost (probability of such event
- * is p^(N+1), where p is probability to lose single packet and
- * time to detect the loss is about RTO*(2^N - 1) with exponential
- * backoff). Normal timewait length is calculated so, that we
- * waited at least for one retransmitted FIN (maximal RTO is 120sec).
- * [ BTW Linux. following BSD, violates this requirement waiting
- * only for 60sec, we should wait at least for 240 secs.
- * Well, 240 consumes too much of resources 8)
- * ]
- * This interval is not reduced to catch old duplicate and
- * responces to our wandering segments living for two MSLs.
- * However, if we use PAWS to detect
- * old duplicates, we can reduce the interval to bounds required
- * by RTO, rather than MSL. So, if peer understands PAWS, we
- * kill tw bucket after 3.5*RTO (it is important that this number
- * is greater than TS tick!) and detect old duplicates with help
- * of PAWS.
- */
- slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
-
- spin_lock(&tw_death_lock);
-
- /* Unlink it, if it was scheduled */
- if (tw_del_dead_node(tw))
- tcp_tw_count--;
- else
- atomic_inc(&tw->tw_refcnt);
-
- if (slot >= TCP_TW_RECYCLE_SLOTS) {
- /* Schedule to slow timer */
- if (timeo >= TCP_TIMEWAIT_LEN) {
- slot = TCP_TWKILL_SLOTS-1;
- } else {
- slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
- if (slot >= TCP_TWKILL_SLOTS)
- slot = TCP_TWKILL_SLOTS-1;
- }
- tw->tw_ttd = jiffies + timeo;
- slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
- list = &tcp_tw_death_row[slot];
- } else {
- tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
-
- if (tcp_twcal_hand < 0) {
- tcp_twcal_hand = 0;
- tcp_twcal_jiffie = jiffies;
- tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
- add_timer(&tcp_twcal_timer);
- } else {
- if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
- mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
- slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
- }
- list = &tcp_twcal_row[slot];
- }
-
- hlist_add_head(&tw->tw_death_node, list);
-
- if (tcp_tw_count++ == 0)
- mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
- spin_unlock(&tw_death_lock);
-}
-
-void tcp_twcal_tick(unsigned long dummy)
-{
- int n, slot;
- unsigned long j;
- unsigned long now = jiffies;
- int killed = 0;
- int adv = 0;
-
- spin_lock(&tw_death_lock);
- if (tcp_twcal_hand < 0)
- goto out;
-
- slot = tcp_twcal_hand;
- j = tcp_twcal_jiffie;
-
- for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
- if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
- struct tcp_tw_bucket *tw;
-
- tw_for_each_inmate_safe(tw, node, safe,
- &tcp_twcal_row[slot]) {
- __tw_del_dead_node(tw);
- tcp_timewait_kill(tw);
- tcp_tw_put(tw);
- killed++;
- }
- } else {
- if (!adv) {
- adv = 1;
- tcp_twcal_jiffie = j;
- tcp_twcal_hand = slot;
- }
-
- if (!hlist_empty(&tcp_twcal_row[slot])) {
- mod_timer(&tcp_twcal_timer, j);
- goto out;
- }
- }
- j += (1<<TCP_TW_RECYCLE_TICK);
- slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
- }
- tcp_twcal_hand = -1;
-
-out:
- if ((tcp_tw_count -= killed) == 0)
- del_timer(&tcp_tw_timer);
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
- spin_unlock(&tw_death_lock);
-}
-
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -686,75 +344,27 @@ out:
*/
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
- /* allocate the newsk from the same slab of the master sock,
- * if not, at sk_free time we'll try to free it from the wrong
- * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
- struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+ struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
- if(newsk != NULL) {
- struct inet_request_sock *ireq = inet_rsk(req);
+ if (newsk != NULL) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(sk);
struct tcp_sock *newtp;
- struct sk_filter *filter;
-
- memcpy(newsk, sk, sizeof(struct tcp_sock));
- newsk->sk_state = TCP_SYN_RECV;
-
- /* SANITY */
- sk_node_init(&newsk->sk_node);
- tcp_sk(newsk)->bind_hash = NULL;
-
- /* Clone the TCP header template */
- inet_sk(newsk)->dport = ireq->rmt_port;
-
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
-
- rwlock_init(&newsk->sk_dst_lock);
- atomic_set(&newsk->sk_rmem_alloc, 0);
- skb_queue_head_init(&newsk->sk_receive_queue);
- atomic_set(&newsk->sk_wmem_alloc, 0);
- skb_queue_head_init(&newsk->sk_write_queue);
- atomic_set(&newsk->sk_omem_alloc, 0);
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
-
- sock_reset_flag(newsk, SOCK_DONE);
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_send_head = NULL;
- rwlock_init(&newsk->sk_callback_lock);
- skb_queue_head_init(&newsk->sk_error_queue);
- newsk->sk_write_space = sk_stream_write_space;
-
- if ((filter = newsk->sk_filter) != NULL)
- sk_filter_charge(newsk, filter);
-
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- return NULL;
- }
/* Now setup tcp_sock */
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->snd_nxt = treq->snt_isn + 1;
- newtp->snd_una = treq->snt_isn + 1;
- newtp->snd_sml = treq->snt_isn + 1;
+ newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
- newtp->retransmits = 0;
- newtp->backoff = 0;
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
- newtp->rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
newtp->left_out = 0;
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
- newtp->ca_ops = &tcp_reno;
+ newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
- tcp_set_ca_state(newtp, TCP_CA_Open);
+ tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
newtp->rcv_wup = treq->rcv_isn + 1;
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.dsack = 0;
newtp->rx_opt.eff_sacks = 0;
- newtp->probes_out = 0;
newtp->rx_opt.num_sacks = 0;
newtp->urg_data = 0;
- /* Deinitialize accept_queue to trap illegal accesses. */
- memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
-
- /* Back to base struct sock members. */
- newsk->sk_err = 0;
- newsk->sk_priority = 0;
- atomic_set(&newsk->sk_refcnt, 2);
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
- atomic_inc(&tcp_sockets_allocated);
if (sock_flag(newsk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
- newsk->sk_socket = NULL;
- newsk->sk_sleep = NULL;
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->tcp_header_len = sizeof(struct tcphdr);
}
if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
- newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
if (newtp->ecn_flags&TCP_ECN_OK)
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
- If tp->defer_accept, we silently drop this bare ACK. Otherwise,
- we create an established connection. Both ends (listening sockets)
- accept the new incoming connection and try to talk to each other. 8-)
+ If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+ bare ACK. Otherwise, we create an established connection. Both
+ ends (listening sockets) accept the new incoming connection and try
+ to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
return NULL;
/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
- if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
return NULL;
}
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (child == NULL)
goto listen_overflow;
- tcp_synq_unlink(tp, req, prev);
- tcp_synq_removed(sk, req);
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
- tcp_acceptq_queue(sk, req, child);
+ inet_csk_reqsk_queue_add(sk, req, child);
return child;
listen_overflow:
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_reset(skb);
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);
-EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3f8ea1bfa9c..b907456a79f4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
* This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
+ struct tcp_sock *tp = tcp_sk(sk);
s32 delta = tcp_time_stamp - tp->lsndtime;
u32 restart_cwnd = tcp_init_cwnd(tp, dst);
u32 cwnd = tp->snd_cwnd;
- tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
+ tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
restart_cwnd = min(restart_cwnd, cwnd);
- while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+ while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
static inline void tcp_event_data_sent(struct tcp_sock *tp,
struct sk_buff *skb, struct sock *sk)
{
- u32 now = tcp_time_stamp;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const u32 now = tcp_time_stamp;
- if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
- tcp_cwnd_restart(tp, __sk_dst_get(sk));
+ if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+ tcp_cwnd_restart(sk, __sk_dst_get(sk));
tp->lsndtime = now;
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
- tp->ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_dec_quickack_mode(tp, pkts);
- tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+ tcp_dec_quickack_mode(sk, pkts);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/* Determine a window scaling and initial window to offer.
@@ -190,7 +190,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
/* Set initial window to value enough for senders,
- * following RFC1414. Senders, not following this RFC,
+ * following RFC2414. Senders, not following this RFC,
* will be satisfied with 2.
*/
if (mss > (1<<*rcv_wscale)) {
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (skb != NULL) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
#define SYSCTL_FLAG_SACK 0x4
/* If congestion control is doing timestamping */
- if (tp->ca_ops->rtt_sample)
- do_gettimeofday(&skb->stamp);
+ if (icsk->icsk_ca_ops->rtt_sample)
+ __net_timestamp(skb);
sysctl_flags = 0;
if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
}
if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(tp, CA_EVENT_TX_START);
+ tcp_ca_event(sk, CA_EVENT_TX_START);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
if (err <= 0)
return err;
- tcp_enter_cwr(tp);
+ tcp_enter_cwr(sk);
/* NET_XMIT_CN is special. It does not guarantee,
* that this packet is lost. It tells that device
@@ -403,11 +404,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
sk->sk_send_head = skb;
}
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (skb->len <= tp->mss_cache ||
+ if (skb->len <= mss_now ||
!(sk->sk_route_caps & NETIF_F_TSO)) {
/* Avoid the costly divide in the normal
* non-TSO case.
@@ -417,10 +416,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
} else {
unsigned int factor;
- factor = skb->len + (tp->mss_cache - 1);
- factor /= tp->mss_cache;
+ factor = skb->len + (mss_now - 1);
+ factor /= mss_now;
skb_shinfo(skb)->tso_segs = factor;
- skb_shinfo(skb)->tso_size = tp->mss_cache;
+ skb_shinfo(skb)->tso_size = mss_now;
}
}
@@ -429,13 +428,14 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int nsize;
+ int nsize, old_factor;
u16 flags;
+ BUG_ON(len > skb->len);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -460,9 +460,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
flags = TCP_SKB_CB(skb)->flags;
TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
TCP_SKB_CB(buff)->flags = flags;
- TCP_SKB_CB(buff)->sacked =
- (TCP_SKB_CB(skb)->sacked &
- (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
+ TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
@@ -484,30 +482,51 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
* skbs, which it never sent before. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
- buff->stamp = skb->stamp;
+ buff->tstamp = skb->tstamp;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->left_out -= tcp_skb_pcount(skb);
- }
+ old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb);
- tcp_set_skb_tso_segs(sk, buff);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(skb);
- tp->left_out += tcp_skb_pcount(skb);
- }
+ /* If this packet has been sent out already, we must
+ * adjust the various packet counters.
+ */
+ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+ int diff = old_factor - tcp_skb_pcount(skb) -
+ tcp_skb_pcount(buff);
+
+ tp->packets_out -= diff;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= diff;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= diff;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+ tp->lost_out -= diff;
+ tp->left_out -= diff;
+ }
+
+ if (diff > 0) {
+ /* Adjust Reno SACK estimate. */
+ if (!tp->rx_opt.sack_ok) {
+ tp->sacked_out -= diff;
+ if ((int)tp->sacked_out < 0)
+ tp->sacked_out = 0;
+ tcp_sync_left_out(tp);
+ }
- if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(buff);
- tp->left_out += tcp_skb_pcount(buff);
+ tp->fackets_out -= diff;
+ if ((int)tp->fackets_out < 0)
+ tp->fackets_out = 0;
+ }
}
/* Link BUFF into the send queue. */
skb_header_release(buff);
- __skb_append(skb, buff);
+ __skb_append(skb, buff, &sk->sk_write_queue);
return 0;
}
@@ -569,7 +588,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
return 0;
}
@@ -698,7 +717,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
- if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
}
}
@@ -734,12 +753,14 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
- if (!tso_segs) {
- tcp_set_skb_tso_segs(sk, skb);
+ if (!tso_segs ||
+ (tso_segs > 1 &&
+ skb_shinfo(skb)->tso_size != mss_now)) {
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
@@ -817,7 +838,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
- tcp_init_tso_segs(sk, skb);
+ tcp_init_tso_segs(sk, skb, cur_mss);
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
return 0;
@@ -854,14 +875,15 @@ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
{
struct sk_buff *buff;
int nlen = skb->len - len;
u16 flags;
/* All of a TSO frame must be composed of paged data. */
- BUG_ON(skb->len != skb->data_len);
+ if (skb->len != skb->data_len)
+ return tcp_fragment(sk, skb, len, mss_now);
buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
if (unlikely(buff == NULL))
@@ -887,12 +909,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
skb_split(skb, buff, len);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb);
- tcp_set_skb_tso_segs(sk, buff);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
/* Link BUFF into the send queue. */
skb_header_release(buff);
- __skb_append(skb, buff);
+ __skb_append(skb, buff, &sk->sk_write_queue);
return 0;
}
@@ -904,12 +926,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
*/
static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
return 0;
- if (tp->ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open)
return 0;
in_flight = tcp_packets_in_flight(tp);
@@ -924,10 +947,6 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
limit = min(send_win, cong_win);
- /* If sk_send_head can be sent fully now, just do it. */
- if (skb->len <= limit)
- return 0;
-
if (sysctl_tcp_tso_win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -972,19 +991,20 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
if (unlikely(sk->sk_state == TCP_CLOSE))
return 0;
- skb = sk->sk_send_head;
- if (unlikely(!skb))
- return 0;
-
- tso_segs = tcp_init_tso_segs(sk, skb);
- cwnd_quota = tcp_cwnd_test(tp, skb);
- if (unlikely(!cwnd_quota))
- goto out;
-
sent_pkts = 0;
- while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+ while ((skb = sk->sk_send_head)) {
+ unsigned int limit;
+
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (!cwnd_quota)
+ break;
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ break;
+
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
@@ -995,9 +1015,10 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
break;
}
+ limit = mss_now;
if (tso_segs > 1) {
- u32 limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
+ limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
if (skb->len < limit) {
unsigned int trim = skb->len % mss_now;
@@ -1005,15 +1026,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
if (trim)
limit = skb->len - trim;
}
- if (skb->len > limit) {
- if (tso_fragment(sk, skb, limit))
- break;
- }
- } else if (unlikely(skb->len > mss_now)) {
- if (unlikely(tcp_fragment(sk, skb, mss_now)))
- break;
}
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ break;
+
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
@@ -1026,27 +1044,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
tcp_minshall_update(tp, mss_now, skb);
sent_pkts++;
-
- /* Do not optimize this to use tso_segs. If we chopped up
- * the packet above, tso_segs will no longer be valid.
- */
- cwnd_quota -= tcp_skb_pcount(skb);
-
- BUG_ON(cwnd_quota < 0);
- if (!cwnd_quota)
- break;
-
- skb = sk->sk_send_head;
- if (!skb)
- break;
- tso_segs = tcp_init_tso_segs(sk, skb);
}
if (likely(sent_pkts)) {
tcp_cwnd_validate(sk, tp);
return 0;
}
-out:
return !tp->packets_out && sk->sk_send_head;
}
@@ -1076,15 +1079,18 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
BUG_ON(!skb || skb->len < mss_now);
- tso_segs = tcp_init_tso_segs(sk, skb);
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
if (likely(cwnd_quota)) {
+ unsigned int limit;
+
BUG_ON(!tso_segs);
+ limit = mss_now;
if (tso_segs > 1) {
- u32 limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
+ limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
if (skb->len < limit) {
unsigned int trim = skb->len % mss_now;
@@ -1092,15 +1098,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
if (trim)
limit = skb->len - trim;
}
- if (skb->len > limit) {
- if (unlikely(tso_fragment(sk, skb, limit)))
- return;
- }
- } else if (unlikely(skb->len > mss_now)) {
- if (unlikely(tcp_fragment(sk, skb, mss_now)))
- return;
}
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ return;
+
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -1166,6 +1169,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
*/
u32 __tcp_select_window(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* MSS for the peer's data. Previous verions used mss_clamp
* here. I don't know if the value based on our guesses
@@ -1173,7 +1177,7 @@ u32 __tcp_select_window(struct sock *sk)
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
- int mss = tp->ack.rcv_mss;
+ int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
int window;
@@ -1182,7 +1186,7 @@ u32 __tcp_select_window(struct sock *sk)
mss = full_space;
if (free_space < full_space/2) {
- tp->ack.quick = 0;
+ icsk->icsk_ack.quick = 0;
if (tcp_memory_pressure)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1257,7 +1261,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
tcp_skb_pcount(next_skb) != 1);
/* Ok. We will be able to collapse the packet. */
- __skb_unlink(next_skb, next_skb->list);
+ __skb_unlink(next_skb, &sk->sk_write_queue);
memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1305,6 +1309,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
*/
void tcp_simple_retransmit(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int mss = tcp_current_mss(sk, 0);
@@ -1335,12 +1340,12 @@ void tcp_simple_retransmit(struct sock *sk)
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
- if (tp->ca_state != TCP_CA_Loss) {
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->prior_ssthresh = 0;
tp->undo_marker = 0;
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
}
tcp_xmit_retransmit_queue(sk);
}
@@ -1365,12 +1370,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
-
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- }
-
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return -ENOMEM;
}
@@ -1385,16 +1384,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
return -EAGAIN;
if (skb->len > cur_mss) {
- int old_factor = tcp_skb_pcount(skb);
- int new_factor;
-
- if (tcp_fragment(sk, skb, cur_mss))
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
-
- /* New SKB created, account for it. */
- new_factor = tcp_skb_pcount(skb);
- tp->packets_out -= old_factor - new_factor;
- tp->packets_out += tcp_skb_pcount(skb->next);
}
/* Collapse two adjacent packets if worthwhile and we can. */
@@ -1474,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int packet_cnt = tp->lost_out;
@@ -1497,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
if (tcp_retransmit_skb(sk, skb))
return;
- if (tp->ca_state != TCP_CA_Loss)
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
if (skb ==
skb_peek(&sk->sk_write_queue))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
}
packet_cnt -= tcp_skb_pcount(skb);
@@ -1517,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
/* OK, demanded retransmission is finished. */
/* Forward retransmissions are possible only during Recovery. */
- if (tp->ca_state != TCP_CA_Recovery)
+ if (icsk->icsk_ca_state != TCP_CA_Recovery)
return;
/* No forward retransmissions in Reno are possible. */
@@ -1557,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
break;
if (skb == skb_peek(&sk->sk_write_queue))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
}
@@ -1586,7 +1582,7 @@ void tcp_send_fin(struct sock *sk)
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
- skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
if (skb)
break;
yield();
@@ -1613,7 +1609,7 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1793,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk)
tp->rcv_wup = 0;
tp->copied_seq = 0;
- tp->rto = TCP_TIMEOUT_INIT;
- tp->retransmits = 0;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
@@ -1808,7 +1804,7 @@ int tcp_connect(struct sock *sk)
tcp_connect_init(sk);
- buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+ buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
@@ -1837,7 +1833,8 @@ int tcp_connect(struct sock *sk)
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
@@ -1847,20 +1844,21 @@ int tcp_connect(struct sock *sk)
*/
void tcp_send_delayed_ack(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int ato = tp->ack.ato;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int ato = icsk->icsk_ack.ato;
unsigned long timeout;
if (ato > TCP_DELACK_MIN) {
+ const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ/2;
- if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+ if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
- * Do not use tp->rto here, use results of rtt measurements
+ * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
if (tp->srtt) {
@@ -1877,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk)
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
- if (tp->ack.pending&TCP_ACK_TIMER) {
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer was blocked or is about to expire,
* send ACK now.
*/
- if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+ if (icsk->icsk_ack.blocked ||
+ time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, tp->ack.timeout))
- timeout = tp->ack.timeout;
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
}
- tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
- tp->ack.timeout = timeout;
- sk_reset_timer(sk, &tp->delack_timer, timeout);
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
@@ -1908,9 +1907,10 @@ void tcp_send_ack(struct sock *sk)
*/
buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (buff == NULL) {
- tcp_schedule_ack(tp);
- tp->ack.ato = TCP_ATO_MIN;
- tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
return;
}
@@ -1991,16 +1991,10 @@ int tcp_write_wakeup(struct sock *sk)
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- if (tcp_fragment(sk, skb, seg_size))
+ if (tcp_fragment(sk, skb, seg_size, mss))
return -1;
- /* SWS override triggered forced fragmentation.
- * Disable TSO, the connection is too sick. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- sk->sk_route_caps &= ~NETIF_F_TSO;
- }
} else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, skb, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2024,6 +2018,7 @@ int tcp_write_wakeup(struct sock *sk)
*/
void tcp_send_probe0(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err;
@@ -2031,28 +2026,31 @@ void tcp_send_probe0(struct sock *sk)
if (tp->packets_out || !sk->sk_send_head) {
/* Cancel probe timer, if it is not required. */
- tp->probes_out = 0;
- tp->backoff = 0;
+ icsk->icsk_probes_out = 0;
+ icsk->icsk_backoff = 0;
return;
}
if (err <= 0) {
- if (tp->backoff < sysctl_tcp_retries2)
- tp->backoff++;
- tp->probes_out++;
- tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ icsk->icsk_backoff++;
+ icsk->icsk_probes_out++;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+ TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember probes_out.
+ * do not backoff and do not remember icsk_probes_out.
* Let local senders to fight for local resources.
*
* Use accumulated backoff yet.
*/
- if (!tp->probes_out)
- tp->probes_out=1;
- tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+ if (!icsk->icsk_probes_out)
+ icsk->icsk_probes_out = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff,
+ TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
}
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 70e108e15c71..327770bf5522 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -16,9 +16,10 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (in_flight < tp->snd_cwnd)
return;
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+static u32 tcp_scalable_ssthresh(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0084227438c2..415ee47ac1c5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);
-#ifdef TCP_DEBUG
-const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
-EXPORT_SYMBOL(tcp_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies
- * to optimize.
- */
-
void tcp_init_xmit_timers(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- init_timer(&tp->retransmit_timer);
- tp->retransmit_timer.function=&tcp_write_timer;
- tp->retransmit_timer.data = (unsigned long) sk;
- tp->pending = 0;
-
- init_timer(&tp->delack_timer);
- tp->delack_timer.function=&tcp_delack_timer;
- tp->delack_timer.data = (unsigned long) sk;
- tp->ack.pending = 0;
-
- init_timer(&sk->sk_timer);
- sk->sk_timer.function = &tcp_keepalive_timer;
- sk->sk_timer.data = (unsigned long)sk;
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
}
-void tcp_clear_xmit_timers(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->pending = 0;
- sk_stop_timer(sk, &tp->retransmit_timer);
-
- tp->ack.pending = 0;
- tp->ack.blocked = 0;
- sk_stop_timer(sk, &tp->delack_timer);
-
- sk_stop_timer(sk, &sk->sk_timer);
-}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
static void tcp_write_err(struct sock *sk)
{
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (tp->retransmits)
+ if (icsk->icsk_retransmits)
dst_negative_advice(&sk->sk_dst_cache);
- retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else {
- if (tp->retransmits >= sysctl_tcp_retries1) {
+ if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
hole detection. :-(
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- int alive = (tp->rto < TCP_RTO_MAX);
+ const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
retry_until = tcp_orphan_retries(sk, alive);
- if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+ if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
return 1;
}
}
- if (tp->retransmits >= retry_until) {
+ if (icsk->icsk_retransmits >= retry_until) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- tp->ack.blocked = 1;
+ icsk->icsk_ack.blocked = 1;
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
goto out_unlock;
}
sk_stream_mem_reclaim(sk);
- if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+ if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
- if (time_after(tp->ack.timeout, jiffies)) {
- sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
goto out;
}
- tp->ack.pending &= ~TCP_ACK_TIMER;
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
tp->ucopy.memory = 0;
}
- if (tcp_ack_scheduled(tp)) {
- if (!tp->ack.pingpong) {
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
- tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
- tp->ack.pingpong = 0;
- tp->ack.ato = TCP_ATO_MIN;
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_send_ack(sk);
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -268,11 +233,12 @@ out_unlock:
static void tcp_probe_timer(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
if (tp->packets_out || !sk->sk_send_head) {
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
return;
}
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
* FIXME: We ought not to do it, Solaris 2.5 actually has fixing
* this behaviour in Solaris down as a bug fix. [AC]
*
- * Let me to explain. probes_out is zeroed by incoming ACKs
+ * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
* even if they advertise zero window. Hence, connection is killed only
* if we received no ACKs for normal connection timeout. It is not killed
* only because window stays zero for some time, window may be zero
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+ const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
max_probes = tcp_orphan_retries(sk, alive);
- if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+ if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
return;
}
- if (tp->probes_out > max_probes) {
+ if (icsk->icsk_probes_out > max_probes) {
tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
static void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!tp->packets_out)
goto out;
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
if (tcp_write_timeout(sk))
goto out;
- if (tp->retransmits == 0) {
- if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+ if (icsk->icsk_retransmits == 0) {
+ if (icsk->icsk_ca_state == TCP_CA_Disorder ||
+ icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tp->rx_opt.sack_ok) {
- if (tp->ca_state == TCP_CA_Recovery)
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
} else {
- if (tp->ca_state == TCP_CA_Recovery)
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
else
NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
}
- } else if (tp->ca_state == TCP_CA_Loss) {
+ } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
} else {
NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
/* Retransmission failed because of local congestion,
* do not backoff.
*/
- if (!tp->retransmits)
- tp->retransmits=1;
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
- min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+ if (!icsk->icsk_retransmits)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
goto out;
}
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
* implemented ftp to mars will work nicely. We will have to fix
* the 120 second clamps though!
*/
- tp->backoff++;
- tp->retransmits++;
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
out_reset_timer:
- tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
- if (tp->retransmits > sysctl_tcp_retries1)
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ if (icsk->icsk_retransmits > sysctl_tcp_retries1)
__sk_dst_reset(sk);
out:;
@@ -418,32 +387,32 @@ out:;
static void tcp_write_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int event;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later */
- sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
goto out_unlock;
}
- if (sk->sk_state == TCP_CLOSE || !tp->pending)
+ if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
goto out;
- if (time_after(tp->timeout, jiffies)) {
- sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
goto out;
}
- event = tp->pending;
- tp->pending = 0;
+ event = icsk->icsk_pending;
+ icsk->icsk_pending = 0;
switch (event) {
- case TCP_TIME_RETRANS:
+ case ICSK_TIME_RETRANS:
tcp_retransmit_timer(sk);
break;
- case TCP_TIME_PROBE0:
+ case ICSK_TIME_PROBE0:
tcp_probe_timer(sk);
break;
}
@@ -462,96 +431,8 @@ out_unlock:
static void tcp_synack_timer(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
- unsigned long now = jiffies;
- struct request_sock **reqp, *req;
- int i, budget;
-
- if (lopt == NULL || lopt->qlen == 0)
- return;
-
- /* Normally all the openreqs are young and become mature
- * (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
- * one of the following things: synack was lost, ack was lost,
- * rtt is high or nobody planned to ack (i.e. synflood).
- * When server is a bit loaded, queue is populated with old
- * open requests, reducing effective size of queue.
- * When server is well loaded, queue size reduces to zero
- * after several minutes of work. It is not synflood,
- * it is normal operation. The solution is pruning
- * too old entries overriding normal timeout, when
- * situation becomes dangerous.
- *
- * Essentially, we reserve half of room for young
- * embrions; and abort old ones without pity, if old
- * ones are about to clog our table.
- */
- if (lopt->qlen>>(lopt->max_qlen_log-1)) {
- int young = (lopt->qlen_young<<1);
-
- while (thresh > 2) {
- if (lopt->qlen < young)
- break;
- thresh--;
- young <<= 1;
- }
- }
-
- if (tp->defer_accept)
- max_retries = tp->defer_accept;
-
- budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
- i = lopt->clock_hand;
-
- do {
- reqp=&lopt->syn_table[i];
- while ((req = *reqp) != NULL) {
- if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (inet_rsk(req)->acked && req->retrans < max_retries))
- && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
- unsigned long timeo;
-
- if (req->retrans++ == 0)
- lopt->qlen_young--;
- timeo = min((TCP_TIMEOUT_INIT << req->retrans),
- TCP_RTO_MAX);
- req->expires = now + timeo;
- reqp = &req->dl_next;
- continue;
- }
-
- /* Drop this request */
- tcp_synq_unlink(tp, req, reqp);
- reqsk_queue_removed(&tp->accept_queue, req);
- reqsk_free(req);
- continue;
- }
- reqp = &req->dl_next;
- }
-
- i = (i+1)&(TCP_SYNQ_HSIZE-1);
-
- } while (--budget > 0);
-
- lopt->clock_hand = i;
-
- if (lopt->qlen)
- tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
-}
-
-void tcp_delete_keepalive_timer (struct sock *sk)
-{
- sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
-{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+ inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
void tcp_set_keepalive(struct sock *sk, int val)
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
- tcp_delete_keepalive_timer(sk);
+ inet_csk_delete_keepalive_timer(sk);
}
static void tcp_keepalive_timer (unsigned long data)
{
struct sock *sk = (struct sock *) data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u32 elapsed;
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- tcp_reset_keepalive_timer (sk, HZ/20);
+ inet_csk_reset_keepalive_timer (sk, HZ/20);
goto out;
}
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
- int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = tcp_time_stamp - tp->rcv_tstamp;
if (elapsed >= keepalive_time_when(tp)) {
- if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
- (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+ if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
+ (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
goto out;
}
if (tcp_write_wakeup(sk) <= 0) {
- tp->probes_out++;
+ icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
sk_stream_mem_reclaim(sk);
resched:
- tcp_reset_keepalive_timer (sk, elapsed);
+ inet_csk_reset_keepalive_timer (sk, elapsed);
goto out;
death:
@@ -644,8 +526,3 @@ out:
bh_unlock_sock(sk);
sock_put(sk);
}
-
-EXPORT_SYMBOL(tcp_clear_xmit_timers);
-EXPORT_SYMBOL(tcp_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9bd443db5193..93c5f92070f9 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -35,7 +35,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
#include <net/tcp.h>
@@ -82,9 +82,10 @@ struct vegas {
* Instead we must wait until the completion of an RTT during
* which we actually receive ACKs.
*/
-static inline void vegas_enable(struct tcp_sock *tp)
+static inline void vegas_enable(struct sock *sk)
{
- struct vegas *vegas = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
/* Begin taking Vegas samples next time we send something. */
vegas->doing_vegas_now = 1;
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp)
}
/* Stop taking Vegas samples for now. */
-static inline void vegas_disable(struct tcp_sock *tp)
+static inline void vegas_disable(struct sock *sk)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct vegas *vegas = inet_csk_ca(sk);
vegas->doing_vegas_now = 0;
}
-static void tcp_vegas_init(struct tcp_sock *tp)
+static void tcp_vegas_init(struct sock *sk)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct vegas *vegas = inet_csk_ca(sk);
vegas->baseRTT = 0x7fffffff;
- vegas_enable(tp);
+ vegas_enable(sk);
}
/* Do RTT sampling needed for Vegas.
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp)
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
-static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct vegas *vegas = inet_csk_ca(sk);
u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
/* Filter to find propagation delay: */
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
vegas->cntRTT++;
}
-static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
{
if (ca_state == TCP_CA_Open)
- vegas_enable(tp);
+ vegas_enable(sk);
else
- vegas_disable(tp);
+ vegas_disable(sk);
}
/*
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
* packets, _then_ we can make Vegas calculations
* again.
*/
-static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_CWND_RESTART ||
event == CA_EVENT_TX_START)
- tcp_vegas_init(tp);
+ tcp_vegas_init(sk);
}
-static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
u32 seq_rtt, u32 in_flight, int flag)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now)
- return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
/* The key players are v_beg_snd_una and v_beg_snd_nxt.
*
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
* but that's not too awful, since we're taking the min,
* rather than averaging.
*/
- tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+ tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
}
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+static void tcp_vegas_get_info(struct sock *sk, u32 ext,
struct sk_buff *skb)
{
- const struct vegas *ca = tcp_ca(tp);
- if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+ const struct vegas *ca = inet_csk_ca(sk);
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
struct tcpvegas_info *info;
- info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+ info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
sizeof(*info)));
info->tcpv_enabled = ca->doing_vegas_now;
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = {
static int __init tcp_vegas_register(void)
{
- BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
tcp_register_congestion_control(&tcp_vegas);
return 0;
}
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index ef827242c940..0c340c3756c2 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -8,7 +8,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
#include <net/tcp.h>
/* TCP Westwood structure */
@@ -40,9 +40,9 @@ struct westwood {
* way as soon as possible. It will reasonably happen within the first
* RTT period of the connection lifetime.
*/
-static void tcp_westwood_init(struct tcp_sock *tp)
+static void tcp_westwood_init(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ struct westwood *w = inet_csk_ca(sk);
w->bk = 0;
w->bw_ns_est = 0;
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp)
w->cumul_ack = 0;
w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
w->rtt_win_sx = tcp_time_stamp;
- w->snd_una = tp->snd_una;
+ w->snd_una = tcp_sk(sk)->snd_una;
}
/*
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta)
* Called after processing group of packets.
* but all westwood needs is the last sample of srtt.
*/
-static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
{
- struct westwood *w = tcp_ca(tp);
+ struct westwood *w = inet_csk_ca(sk);
if (cnt > 0)
- w->rtt = tp->srtt >> 3;
+ w->rtt = tcp_sk(sk)->srtt >> 3;
}
/*
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
* It updates RTT evaluation window if it is the right moment to do
* it. If so it calls filter for evaluating bandwidth.
*/
-static void westwood_update_window(struct tcp_sock *tp)
+static void westwood_update_window(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ struct westwood *w = inet_csk_ca(sk);
s32 delta = tcp_time_stamp - w->rtt_win_sx;
/*
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp)
* header prediction is successful. In such case in fact update is
* straight forward and doesn't need any particular care.
*/
-static inline void westwood_fast_bw(struct tcp_sock *tp)
+static inline void westwood_fast_bw(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
- westwood_update_window(tp);
+ westwood_update_window(sk);
w->bk += tp->snd_una - w->snd_una;
w->snd_una = tp->snd_una;
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp)
* This function evaluates cumul_ack for evaluating bk in case of
* delayed or partial acks.
*/
-static inline u32 westwood_acked_count(struct tcp_sock *tp)
+static inline u32 westwood_acked_count(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
w->cumul_ack = tp->snd_una - w->snd_una;
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp)
return w->cumul_ack;
}
-static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
+static inline u32 westwood_bw_rttmin(const struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct westwood *w = inet_csk_ca(sk);
return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
}
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
* in packets we use mss_cache). Rttmin is guaranteed to be >= 2
* so avoids ever returning 0.
*/
-static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
+static u32 tcp_westwood_cwnd_min(struct sock *sk)
{
- return westwood_bw_rttmin(tp);
+ return westwood_bw_rttmin(sk);
}
-static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
{
- struct westwood *w = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
switch(event) {
case CA_EVENT_FAST_ACK:
- westwood_fast_bw(tp);
+ westwood_fast_bw(sk);
break;
case CA_EVENT_COMPLETE_CWR:
- tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
+ tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
break;
case CA_EVENT_FRTO:
- tp->snd_ssthresh = westwood_bw_rttmin(tp);
+ tp->snd_ssthresh = westwood_bw_rttmin(sk);
break;
case CA_EVENT_SLOW_ACK:
- westwood_update_window(tp);
- w->bk += westwood_acked_count(tp);
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
w->rtt_min = min(w->rtt, w->rtt_min);
break;
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
+static void tcp_westwood_info(struct sock *sk, u32 ext,
struct sk_buff *skb)
{
- const struct westwood *ca = tcp_ca(tp);
- if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+ const struct westwood *ca = inet_csk_ca(sk);
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
struct rtattr *rta;
struct tcpvegas_info *info;
- rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
+ rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
info = RTA_DATA(rta);
info->tcpv_enabled = 1;
info->tcpv_rttcnt = 0;
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = {
static int __init tcp_westwood_register(void)
{
- BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_westwood);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64b443f..e0bd1013cb0d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
-#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
* Snmp MIB for the UDP layer
*/
-DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
err = -EINVAL;
goto out;
}
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
return -EINVAL;
}
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
return 0;
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1140,7 +1141,7 @@ int udp_rcv(struct sk_buff *skb)
if (ulen > len || ulen < sizeof(*uh))
goto short_packet;
- if (pskb_trim(skb, ulen))
+ if (pskb_trim_rcsum(skb, ulen))
goto short_packet;
if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
@@ -1181,14 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
return(0);
short_packet:
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
- NIPQUAD(saddr),
- ntohs(uh->source),
- ulen,
- len,
- NIPQUAD(daddr),
- ntohs(uh->dest)));
+ LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ ulen,
+ len,
+ NIPQUAD(daddr),
+ ntohs(uh->dest));
no_header:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
kfree_skb(skb);
@@ -1199,13 +1199,12 @@ csum_error:
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
- NIPQUAD(saddr),
- ntohs(uh->source),
- NIPQUAD(daddr),
- ntohs(uh->dest),
- ulen));
+ LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ NIPQUAD(daddr),
+ ntohs(uh->dest),
+ ulen);
drop:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
kfree_skb(skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 050611d7a967..d23e07fc81fa 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void)
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
+#if 0
void __exit xfrm4_state_fini(void)
{
xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
}
+#endif /* 0 */