aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@g5.osdl.org>2006-01-04 16:27:41 -0800
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-04 16:27:41 -0800
commitd347da0deffa1d8f88f0d270eab040e4707c9916 (patch)
treee0911f2ef4d36a7b44f7a5379feabebbd37dcfc4
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb-2.6 (diff)
parent[TCP] tcp_vegas: Fix slow start (diff)
downloadlinux-dev-d347da0deffa1d8f88f0d270eab040e4707c9916.tar.xz
linux-dev-d347da0deffa1d8f88f0d270eab040e4707c9916.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
-rw-r--r--Documentation/networking/ip-sysctl.txt23
-rw-r--r--drivers/char/random.c10
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c2
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c2
-rw-r--r--drivers/net/ns83820.c1
-rw-r--r--drivers/net/pppoe.c31
-rw-r--r--drivers/net/pppox.c10
-rw-r--r--drivers/net/sk98lin/skge.c1
-rw-r--r--drivers/net/skge.c1
-rw-r--r--drivers/net/tg3.c3
-rw-r--r--drivers/net/wireless/ipw2200.c15
-rw-r--r--fs/9p/trans_sock.c1
-rw-r--r--fs/nfs/callback.c3
-rw-r--r--include/asm-alpha/bitops.h1
-rw-r--r--include/asm-arm/bitops.h2
-rw-r--r--include/asm-arm26/bitops.h1
-rw-r--r--include/asm-cris/bitops.h1
-rw-r--r--include/asm-frv/bitops.h1
-rw-r--r--include/asm-generic/bitops.h1
-rw-r--r--include/asm-h8300/bitops.h1
-rw-r--r--include/asm-i386/bitops.h1
-rw-r--r--include/asm-ia64/bitops.h1
-rw-r--r--include/asm-m32r/bitops.h1
-rw-r--r--include/asm-m68k/bitops.h1
-rw-r--r--include/asm-m68knommu/bitops.h1
-rw-r--r--include/asm-mips/bitops.h2
-rw-r--r--include/asm-parisc/bitops.h1
-rw-r--r--include/asm-powerpc/bitops.h1
-rw-r--r--include/asm-s390/bitops.h1
-rw-r--r--include/asm-sh/bitops.h1
-rw-r--r--include/asm-sh64/bitops.h1
-rw-r--r--include/asm-sparc/bitops.h1
-rw-r--r--include/asm-sparc64/bitops.h1
-rw-r--r--include/asm-v850/bitops.h1
-rw-r--r--include/asm-x86_64/bitops.h27
-rw-r--r--include/asm-xtensa/bitops.h1
-rw-r--r--include/linux/bitops.h9
-rw-r--r--include/linux/dccp.h7
-rw-r--r--include/linux/etherdevice.h3
-rw-r--r--include/linux/if_pppox.h3
-rw-r--r--include/linux/ip.h121
-rw-r--r--include/linux/ipv6.h79
-rw-r--r--include/linux/net.h4
-rw-r--r--include/linux/pfkeyv2.h13
-rw-r--r--include/linux/pkt_sched.h7
-rw-r--r--include/linux/random.h6
-rw-r--r--include/linux/security.h132
-rw-r--r--include/linux/skbuff.h5
-rw-r--r--include/linux/socket.h1
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--include/linux/tcp.h21
-rw-r--r--include/linux/udp.h6
-rw-r--r--include/linux/xfrm.h29
-rw-r--r--include/net/af_unix.h12
-rw-r--r--include/net/atmclip.h2
-rw-r--r--include/net/dst.h1
-rw-r--r--include/net/flow.h7
-rw-r--r--include/net/genetlink.h2
-rw-r--r--include/net/icmp.h9
-rw-r--r--include/net/ieee80211_crypt.h9
-rw-r--r--include/net/inet6_connection_sock.h42
-rw-r--r--include/net/inet6_hashtables.h32
-rw-r--r--include/net/inet_common.h4
-rw-r--r--include/net/inet_connection_sock.h45
-rw-r--r--include/net/inet_ecn.h2
-rw-r--r--include/net/inet_hashtables.h24
-rw-r--r--include/net/inet_sock.h193
-rw-r--r--include/net/inet_timewait_sock.h8
-rw-r--r--include/net/inetpeer.h1
-rw-r--r--include/net/ip.h19
-rw-r--r--include/net/ip_fib.h2
-rw-r--r--include/net/ip_vs.h12
-rw-r--r--include/net/ipv6.h12
-rw-r--r--include/net/ndisc.h17
-rw-r--r--include/net/neighbour.h2
-rw-r--r--include/net/pkt_act.h1
-rw-r--r--include/net/protocol.h3
-rw-r--r--include/net/raw.h2
-rw-r--r--include/net/request_sock.h2
-rw-r--r--include/net/sctp/structs.h76
-rw-r--r--include/net/sctp/user.h30
-rw-r--r--include/net/sock.h32
-rw-r--r--include/net/tcp.h246
-rw-r--r--include/net/tcp_states.h16
-rw-r--r--include/net/timewait_sock.h31
-rw-r--r--include/net/transp_v6.h2
-rw-r--r--include/net/udp.h4
-rw-r--r--include/net/xfrm.h30
-rw-r--r--init/main.c4
-rw-r--r--net/appletalk/ddp.c23
-rw-r--r--net/atm/pvc.c2
-rw-r--r--net/atm/svc.c2
-rw-r--r--net/ax25/af_ax25.c6
-rw-r--r--net/bluetooth/af_bluetooth.c5
-rw-r--r--net/bluetooth/bnep/sock.c2
-rw-r--r--net/bluetooth/cmtp/sock.c2
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hidp/sock.c2
-rw-r--r--net/bluetooth/l2cap.c9
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c9
-rw-r--r--net/bridge/br.c1
-rw-r--r--net/bridge/br_device.c91
-rw-r--r--net/bridge/br_if.c55
-rw-r--r--net/bridge/br_input.c11
-rw-r--r--net/bridge/br_netfilter.c4
-rw-r--r--net/bridge/br_notify.c14
-rw-r--r--net/bridge/br_private.h7
-rw-r--r--net/bridge/br_stp_if.c5
-rw-r--r--net/bridge/netfilter/Kconfig6
-rw-r--r--net/bridge/netfilter/ebt_log.c73
-rw-r--r--net/bridge/netfilter/ebt_ulog.c53
-rw-r--r--net/core/datagram.c36
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/filter.c112
-rw-r--r--net/core/flow.c8
-rw-r--r--net/core/netpoll.c1
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/skbuff.c27
-rw-r--r--net/core/sock.c21
-rw-r--r--net/core/stream.c10
-rw-r--r--net/dccp/Makefile4
-rw-r--r--net/dccp/ackvec.c33
-rw-r--r--net/dccp/ackvec.h12
-rw-r--r--net/dccp/ccid.h2
-rw-r--r--net/dccp/dccp.h24
-rw-r--r--net/dccp/diag.c2
-rw-r--r--net/dccp/input.c79
-rw-r--r--net/dccp/ipv4.c305
-rw-r--r--net/dccp/ipv6.c1261
-rw-r--r--net/dccp/ipv6.h37
-rw-r--r--net/dccp/minisocks.c23
-rw-r--r--net/dccp/output.c47
-rw-r--r--net/dccp/proto.c56
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/decnet/dn_neigh.c13
-rw-r--r--net/decnet/dn_nsp_in.c17
-rw-r--r--net/econet/af_econet.c9
-rw-r--r--net/ieee80211/ieee80211_rx.c5
-rw-r--r--net/ipv4/Kconfig8
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c19
-rw-r--r--net/ipv4/ah4.c1
-rw-r--r--net/ipv4/arp.c1
-rw-r--r--net/ipv4/devinet.c1
-rw-r--r--net/ipv4/esp4.c1
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_hash.c1
-rw-r--r--net/ipv4/fib_rules.c1
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fib_trie.c8
-rw-r--r--net/ipv4/icmp.c1
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c25
-rw-r--r--net/ipv4/inet_diag.c14
-rw-r--r--net/ipv4/inet_hashtables.c178
-rw-r--r--net/ipv4/inet_timewait_sock.c5
-rw-r--r--net/ipv4/inetpeer.c1
-rw-r--r--net/ipv4/ip_fragment.c68
-rw-r--r--net/ipv4/ip_input.c1
-rw-r--r--net/ipv4/ip_options.c1
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv4/ip_sockglue.c14
-rw-r--r--net/ipv4/ipcomp.c1
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipmr.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c28
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c21
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c10
-rw-r--r--net/ipv4/ipvs/ip_vs_dh.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c29
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c29
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c24
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c2
-rw-r--r--net/ipv4/netfilter/arp_tables.c175
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c1
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c199
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/netfilter/ipt_physdev.c1
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c10
-rw-r--r--net/ipv4/tcp_bic.c85
-rw-r--r--net/ipv4/tcp_cong.c28
-rw-r--r--net/ipv4/tcp_cubic.c411
-rw-r--r--net/ipv4/tcp_input.c99
-rw-r--r--net/ipv4/tcp_ipv4.c269
-rw-r--r--net/ipv4/tcp_minisocks.c16
-rw-r--r--net/ipv4/tcp_output.c118
-rw-r--r--net/ipv4/tcp_vegas.c4
-rw-r--r--net/ipv4/udp.c22
-rw-r--r--net/ipv6/Makefile3
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/af_inet6.c90
-rw-r--r--net/ipv6/ah6.c1
-rw-r--r--net/ipv6/esp6.c1
-rw-r--r--net/ipv6/exthdrs.c4
-rw-r--r--net/ipv6/inet6_connection_sock.c199
-rw-r--r--net/ipv6/inet6_hashtables.c183
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ipcomp6.c1
-rw-r--r--net/ipv6/ipv6_sockglue.c24
-rw-r--r--net/ipv6/mcast.c2
-rw-r--r--net/ipv6/netfilter/ip6_tables.c191
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c1
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c1
-rw-r--r--net/ipv6/netfilter/ip6t_esp.c1
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c4
-rw-r--r--net/ipv6/raw.c16
-rw-r--r--net/ipv6/tcp_ipv6.c639
-rw-r--r--net/ipv6/udp.c16
-rw-r--r--net/ipx/af_ipx.c6
-rw-r--r--net/irda/af_irda.c23
-rw-r--r--net/key/af_key.c201
-rw-r--r--net/llc/af_llc.c11
-rw-r--r--net/netfilter/nfnetlink_log.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netlink/af_netlink.c4
-rw-r--r--net/netlink/genetlink.c2
-rw-r--r--net/netrom/af_netrom.c16
-rw-r--r--net/nonet.c5
-rw-r--r--net/packet/af_packet.c10
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/sched/sch_netem.c49
-rw-r--r--net/sched/sch_teql.c1
-rw-r--r--net/sctp/associola.c81
-rw-r--r--net/sctp/input.c36
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/output.c17
-rw-r--r--net/sctp/protocol.c3
-rw-r--r--net/sctp/sm_sideeffect.c26
-rw-r--r--net/sctp/sm_statefuns.c20
-rw-r--r--net/sctp/socket.c691
-rw-r--r--net/sctp/transport.c32
-rw-r--r--net/socket.c243
-rw-r--r--net/sunrpc/svcsock.c2
-rw-r--r--net/unix/af_unix.c55
-rw-r--r--net/unix/garbage.c4
-rw-r--r--net/wanrouter/af_wanpipe.c6
-rw-r--r--net/x25/af_x25.c6
-rw-r--r--net/xfrm/xfrm_policy.c88
-rw-r--r--net/xfrm/xfrm_state.c9
-rw-r--r--net/xfrm/xfrm_user.c148
-rw-r--r--security/Kconfig13
-rw-r--r--security/dummy.c45
-rw-r--r--security/selinux/Makefile2
-rw-r--r--security/selinux/hooks.c39
-rw-r--r--security/selinux/include/av_perm_to_string.h2
-rw-r--r--security/selinux/include/av_permissions.h2
-rw-r--r--security/selinux/include/xfrm.h54
-rw-r--r--security/selinux/xfrm.c311
263 files changed, 6860 insertions, 2857 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebc09a159f62..2b7cf19a06ad 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
for the hash secret) for IP fragments.
Default: 600
+ipfrag_max_dist - INTEGER
+ ipfrag_max_dist is a non-negative integer value which defines the
+ maximum "disorder" which is allowed among fragments which share a
+ common IP source address. Note that reordering of packets is
+ not unusual, but if a large number of fragments arrive from a source
+ IP address while a particular fragment queue remains incomplete, it
+ probably indicates that one or more fragments belonging to that queue
+ have been lost. When ipfrag_max_dist is positive, an additional check
+ is done on fragments before they are added to a reassembly queue - if
+ ipfrag_max_dist (or more) fragments have arrived from a particular IP
+ address between additions to any IP fragment queue using that source
+ address, it's presumed that one or more fragments in the queue are
+ lost. The existing fragment queue will be dropped, and a new one
+ started. An ipfrag_max_dist value of zero disables this check.
+
+ Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
+ result in unnecessarily dropping fragment queues when normal
+ reordering of packets occurs, which could lead to poor application
+ performance. Using a very large value, e.g. 50000, increases the
+ likelihood of incorrectly reassembling IP fragments that originate
+ from different IP datagrams, which could result in data corruption.
+ Default: 64
+
INET peer storage:
inet_peer_threshold - INTEGER
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7999da25fe40..bdfdfd28594d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
EXPORT_SYMBOL(secure_tcp_sequence_number);
-
-
-/* Generate secure starting point for ephemeral TCP port search */
-u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+/* Generate secure starting point for ephemeral IPV4 transport port search */
+u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
{
struct keydata *keyptr = get_keyptr();
u32 hash[4];
@@ -1575,7 +1573,7 @@ u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
+u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
{
struct keydata *keyptr = get_keyptr();
u32 hash[12];
@@ -1586,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp
return twothirdsMD4Transform(daddr, hash);
}
-EXPORT_SYMBOL(secure_tcpv6_port_ephemeral);
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 475d98fa9e26..780009c7eaa6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -47,6 +47,8 @@
#include <linux/ip.h>
#include <linux/in.h>
+#include <net/dst.h>
+
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ef3ee035bbc8..ed0c2ead8bc1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -43,6 +43,8 @@
#include <linux/delay.h>
#include <linux/completion.h>
+#include <net/dst.h>
+
#include "ipoib.h"
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index f857ae94d261..b0c3b6ab6263 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -115,6 +115,7 @@
#include <linux/ethtool.h>
#include <linux/timer.h>
#include <linux/if_vlan.h>
+#include <linux/rtnetlink.h>
#include <asm/io.h>
#include <asm/uaccess.h>
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index a842ecc60a34..9369f811075d 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -85,7 +85,7 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb);
static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb);
-static struct proto_ops pppoe_ops;
+static const struct proto_ops pppoe_ops;
static DEFINE_RWLOCK(pppoe_hash_lock);
static struct ppp_channel_ops pppoe_chan_ops;
@@ -383,8 +383,6 @@ static int pppoe_rcv(struct sk_buff *skb,
{
struct pppoe_hdr *ph;
struct pppox_sock *po;
- struct sock *sk;
- int ret;
if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
goto drop;
@@ -395,24 +393,8 @@ static int pppoe_rcv(struct sk_buff *skb,
ph = (struct pppoe_hdr *) skb->nh.raw;
po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source);
- if (!po)
- goto drop;
-
- sk = sk_pppox(po);
- bh_lock_sock(sk);
-
- /* Socket state is unknown, must put skb into backlog. */
- if (sock_owned_by_user(sk) != 0) {
- sk_add_backlog(sk, skb);
- ret = NET_RX_SUCCESS;
- } else {
- ret = pppoe_rcv_core(sk, skb);
- }
-
- bh_unlock_sock(sk);
- sock_put(sk);
-
- return ret;
+ if (po != NULL)
+ return sk_receive_skb(sk_pppox(po), skb);
drop:
kfree_skb(skb);
out:
@@ -1081,9 +1063,7 @@ static int __init pppoe_proc_init(void)
static inline int pppoe_proc_init(void) { return 0; }
#endif /* CONFIG_PROC_FS */
-/* ->ioctl are set at pppox_create */
-
-static struct proto_ops pppoe_ops = {
+static const struct proto_ops pppoe_ops = {
.family = AF_PPPOX,
.owner = THIS_MODULE,
.release = pppoe_release,
@@ -1099,7 +1079,8 @@ static struct proto_ops pppoe_ops = {
.getsockopt = sock_no_getsockopt,
.sendmsg = pppoe_sendmsg,
.recvmsg = pppoe_recvmsg,
- .mmap = sock_no_mmap
+ .mmap = sock_no_mmap,
+ .ioctl = pppox_ioctl,
};
static struct pppox_proto pppoe_proto = {
diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c
index 0c1e114527fb..9315046b3f55 100644
--- a/drivers/net/pppox.c
+++ b/drivers/net/pppox.c
@@ -68,8 +68,7 @@ EXPORT_SYMBOL(register_pppox_proto);
EXPORT_SYMBOL(unregister_pppox_proto);
EXPORT_SYMBOL(pppox_unbind_sock);
-static int pppox_ioctl(struct socket* sock, unsigned int cmd,
- unsigned long arg)
+int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
struct pppox_sock *po = pppox_sk(sk);
@@ -105,6 +104,7 @@ static int pppox_ioctl(struct socket* sock, unsigned int cmd,
return rc;
}
+EXPORT_SYMBOL(pppox_ioctl);
static int pppox_create(struct socket *sock, int protocol)
{
@@ -119,11 +119,7 @@ static int pppox_create(struct socket *sock, int protocol)
goto out;
rc = pppox_protos[protocol]->create(sock);
- if (!rc) {
- /* We get to set the ioctl handler. */
- /* For everything else, pppox is just a shell. */
- sock->ops->ioctl = pppox_ioctl;
- }
+
module_put(pppox_protos[protocol]->owner);
out:
return rc;
diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c
index ae7343934758..e1a2d52cc1fe 100644
--- a/drivers/net/sk98lin/skge.c
+++ b/drivers/net/sk98lin/skge.c
@@ -107,6 +107,7 @@
#include "h/skversion.h"
+#include <linux/in.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 00d683063c01..d8cc3aea032a 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -25,6 +25,7 @@
*/
#include <linux/config.h>
+#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 2fc9893d69e1..eb86b059809b 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -24,6 +24,7 @@
#include <linux/compiler.h>
#include <linux/slab.h>
#include <linux/delay.h>
+#include <linux/in.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/pci.h>
@@ -3650,7 +3651,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
TXD_FLAG_CPU_POST_DMA);
skb->nh.iph->check = 0;
- skb->nh.iph->tot_len = ntohs(mss + ip_tcp_len + tcp_opt_len);
+ skb->nh.iph->tot_len = htons(mss + ip_tcp_len + tcp_opt_len);
if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) {
skb->h.th->check = 0;
base_flags &= ~TXD_FLAG_TCPUDP_CSUM;
diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 5e7c7e944c9d..64f6d1f25753 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -7456,8 +7456,7 @@ static void ipw_handle_data_packet(struct ipw_priv *priv,
/* HW decrypt will not clear the WEP bit, MIC, PN, etc. */
hdr = (struct ieee80211_hdr_4addr *)rxb->skb->data;
if (priv->ieee->iw_mode != IW_MODE_MONITOR &&
- ((is_multicast_ether_addr(hdr->addr1) ||
- is_broadcast_ether_addr(hdr->addr1)) ?
+ (is_multicast_ether_addr(hdr->addr1) ?
!priv->ieee->host_mc_decrypt : !priv->ieee->host_decrypt))
ipw_rebuild_decrypted_skb(priv, rxb->skb);
@@ -7648,8 +7647,7 @@ static inline int is_network_packet(struct ipw_priv *priv,
return 0;
/* {broad,multi}cast packets to our BSSID go through */
- if (is_multicast_ether_addr(header->addr1) ||
- is_broadcast_ether_addr(header->addr1))
+ if (is_multicast_ether_addr(header->addr1))
return !memcmp(header->addr3, priv->bssid, ETH_ALEN);
/* packets to our adapter go through */
@@ -7662,8 +7660,7 @@ static inline int is_network_packet(struct ipw_priv *priv,
return 0;
/* {broad,multi}cast packets to our BSS go through */
- if (is_multicast_ether_addr(header->addr1) ||
- is_broadcast_ether_addr(header->addr1))
+ if (is_multicast_ether_addr(header->addr1))
return !memcmp(header->addr2, priv->bssid, ETH_ALEN);
/* packets to our adapter go through */
@@ -9657,8 +9654,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb,
switch (priv->ieee->iw_mode) {
case IW_MODE_ADHOC:
hdr_len = IEEE80211_3ADDR_LEN;
- unicast = !(is_multicast_ether_addr(hdr->addr1) ||
- is_broadcast_ether_addr(hdr->addr1));
+ unicast = !is_multicast_ether_addr(hdr->addr1);
id = ipw_find_station(priv, hdr->addr1);
if (id == IPW_INVALID_STATION) {
id = ipw_add_station(priv, hdr->addr1);
@@ -9673,8 +9669,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb,
case IW_MODE_INFRA:
default:
- unicast = !(is_multicast_ether_addr(hdr->addr3) ||
- is_broadcast_ether_addr(hdr->addr3));
+ unicast = !is_multicast_ether_addr(hdr->addr3);
hdr_len = IEEE80211_3ADDR_LEN;
id = 0;
break;
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
index a93c2bf94c33..6a9a75d40f73 100644
--- a/fs/9p/trans_sock.c
+++ b/fs/9p/trans_sock.c
@@ -26,6 +26,7 @@
*/
#include <linux/config.h>
+#include <linux/in.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/ipv6.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f2ca782aba33..30cae3602867 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,9 @@
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/nfs_fs.h>
+
+#include <net/inet_sock.h>
+
#include "nfs4_fs.h"
#include "callback.h"
diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h
index 578ed3f1a607..302201f1a097 100644
--- a/include/asm-alpha/bitops.h
+++ b/include/asm-alpha/bitops.h
@@ -321,6 +321,7 @@ static inline int fls(int word)
#else
#define fls generic_fls
#endif
+#define fls64 generic_fls64
/* Compute powers of two for the given integer. */
static inline long floor_log2(unsigned long word)
diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h
index 7399d431edfe..d02de721ecc1 100644
--- a/include/asm-arm/bitops.h
+++ b/include/asm-arm/bitops.h
@@ -332,6 +332,7 @@ static inline unsigned long __ffs(unsigned long word)
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
/*
* ffs: find first bit set. This is defined the same way as
@@ -351,6 +352,7 @@ static inline unsigned long __ffs(unsigned long word)
#define fls(x) \
( __builtin_constant_p(x) ? generic_fls(x) : \
({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) )
+#define fls64(x) generic_fls64(x)
#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
#define __ffs(x) (ffs(x) - 1)
#define ffz(x) __ffs( ~(x) )
diff --git a/include/asm-arm26/bitops.h b/include/asm-arm26/bitops.h
index 7d062fb2e343..15cc6f2da792 100644
--- a/include/asm-arm26/bitops.h
+++ b/include/asm-arm26/bitops.h
@@ -259,6 +259,7 @@ static inline unsigned long __ffs(unsigned long word)
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
/*
* ffs: find first bit set. This is defined the same way as
diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h
index 1bddb3f3a289..d3eb0f1e4208 100644
--- a/include/asm-cris/bitops.h
+++ b/include/asm-cris/bitops.h
@@ -240,6 +240,7 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
/*
* hweightN - returns the hamming weight of a N-bit word
diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index b664bd5b6663..02be7b3a8a83 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -228,6 +228,7 @@ found_middle:
\
bit ? 33 - bit : bit; \
})
+#define fls64(x) generic_fls64(x)
/*
* Every architecture must define this function. It's the fastest
diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index ce31b739fd80..0e6d9852008c 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -56,6 +56,7 @@ extern __inline__ int test_bit(int nr, const unsigned long * addr)
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#ifdef __KERNEL__
diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h
index 5036f595f8c9..c0411ec9d651 100644
--- a/include/asm-h8300/bitops.h
+++ b/include/asm-h8300/bitops.h
@@ -406,5 +406,6 @@ found_middle:
#endif /* __KERNEL__ */
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#endif /* _H8300_BITOPS_H */
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index ddf1739dc7fd..4807aa1d2e3d 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -372,6 +372,7 @@ static inline unsigned long ffz(unsigned long word)
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#ifdef __KERNEL__
diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index 7232528e2d0c..36d0fb95ea89 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -345,6 +345,7 @@ fls (int t)
x |= x >> 16;
return ia64_popcnt(x);
}
+#define fls64(x) generic_fls64(x)
/*
* ffs: find first bit set. This is defined the same way as the libc and compiler builtin
diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h
index e78443981349..abea2fdd8689 100644
--- a/include/asm-m32r/bitops.h
+++ b/include/asm-m32r/bitops.h
@@ -465,6 +465,7 @@ static __inline__ unsigned long __ffs(unsigned long word)
* fls: find last bit set.
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#ifdef __KERNEL__
diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h
index b1bcf7c66516..13f4c0048463 100644
--- a/include/asm-m68k/bitops.h
+++ b/include/asm-m68k/bitops.h
@@ -310,6 +310,7 @@ static inline int fls(int x)
return 32 - cnt;
}
+#define fls64(x) generic_fls64(x)
/*
* Every architecture must define this function. It's the fastest
diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h
index c42f88a9b9f9..4058dd086a02 100644
--- a/include/asm-m68knommu/bitops.h
+++ b/include/asm-m68knommu/bitops.h
@@ -499,5 +499,6 @@ found_middle:
* fls: find last bit set.
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#endif /* _M68KNOMMU_BITOPS_H */
diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h
index 5496f9064a6a..3b0c8aaf6e8b 100644
--- a/include/asm-mips/bitops.h
+++ b/include/asm-mips/bitops.h
@@ -695,7 +695,7 @@ static inline unsigned long fls(unsigned long word)
return flz(~word) + 1;
}
-
+#define fls64(x) generic_fls64(x)
/*
* find_next_zero_bit - find the first zero bit in a memory region
diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h
index 55b98c67fd82..15d8c2b51584 100644
--- a/include/asm-parisc/bitops.h
+++ b/include/asm-parisc/bitops.h
@@ -263,6 +263,7 @@ static __inline__ int fls(int x)
return ret;
}
+#define fls64(x) generic_fls64(x)
/*
* hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h
index 5727229b0444..1996eaa8aeae 100644
--- a/include/asm-powerpc/bitops.h
+++ b/include/asm-powerpc/bitops.h
@@ -310,6 +310,7 @@ static __inline__ int fls(unsigned int x)
asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x));
return 32 - lz;
}
+#define fls64(x) generic_fls64(x)
/*
* hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index b07c578b22ea..61232760cc3b 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -839,6 +839,7 @@ static inline int sched_find_first_bit(unsigned long *b)
* fls: find last bit set.
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
/*
* hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h
index 5163d1ff2f1b..1c5260860045 100644
--- a/include/asm-sh/bitops.h
+++ b/include/asm-sh/bitops.h
@@ -470,6 +470,7 @@ found_middle:
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#endif /* __KERNEL__ */
diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h
index e1ff63e09227..ce9c3ad45fe0 100644
--- a/include/asm-sh64/bitops.h
+++ b/include/asm-sh64/bitops.h
@@ -510,6 +510,7 @@ found_middle:
#define ffs(x) generic_ffs(x)
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#endif /* __KERNEL__ */
diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h
index bfbd795a0a80..41722b5e45ef 100644
--- a/include/asm-sparc/bitops.h
+++ b/include/asm-sparc/bitops.h
@@ -298,6 +298,7 @@ static inline int ffs(int x)
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
/*
* hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h
index 6388b8376c50..6efc0162fb09 100644
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -119,6 +119,7 @@ static inline unsigned long __ffs(unsigned long word)
*/
#define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
#ifdef __KERNEL__
diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h
index b91e799763fd..8955d2376ac8 100644
--- a/include/asm-v850/bitops.h
+++ b/include/asm-v850/bitops.h
@@ -276,6 +276,7 @@ found_middle:
#define ffs(x) generic_ffs (x)
#define fls(x) generic_fls (x)
+#define fls64(x) generic_fls64(x)
#define __ffs(x) ffs(x)
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 05a0d374404b..a4d5d0909453 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -340,6 +340,20 @@ static __inline__ unsigned long __ffs(unsigned long word)
return word;
}
+/*
+ * __fls: find last bit set.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static __inline__ unsigned long __fls(unsigned long word)
+{
+ __asm__("bsrq %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
#ifdef __KERNEL__
static inline int sched_find_first_bit(const unsigned long *b)
@@ -370,6 +384,19 @@ static __inline__ int ffs(int x)
}
/**
+ * fls64 - find last bit set in 64 bit word
+ * @x: the word to search
+ *
+ * This is defined the same way as fls.
+ */
+static __inline__ int fls64(__u64 x)
+{
+ if (x == 0)
+ return 0;
+ return __fls(x) + 1;
+}
+
+/**
* hweightN - returns the hamming weight of a N-bit word
* @x: the word to weigh
*
diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h
index e76ee889e21d..0a2065f1a372 100644
--- a/include/asm-xtensa/bitops.h
+++ b/include/asm-xtensa/bitops.h
@@ -245,6 +245,7 @@ static __inline__ int fls (unsigned int x)
{
return __cntlz(x);
}
+#define fls64(x) generic_fls64(x)
static __inline__ int
find_next_bit(const unsigned long *addr, int size, int offset)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 38c2fb7ebe09..6a2a19f14bb2 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -76,6 +76,15 @@ static __inline__ int generic_fls(int x)
*/
#include <asm/bitops.h>
+
+static inline int generic_fls64(__u64 x)
+{
+ __u32 h = x >> 32;
+ if (h)
+ return fls(x) + 32;
+ return fls(x);
+}
+
static __inline__ int get_bitmask_order(unsigned int count)
{
int order;
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 71fab4311e92..088529f54965 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -192,10 +192,9 @@ enum {
#include <linux/workqueue.h>
#include <net/inet_connection_sock.h>
+#include <net/inet_sock.h>
#include <net/inet_timewait_sock.h>
-#include <net/sock.h>
#include <net/tcp_states.h>
-#include <net/tcp.h>
enum dccp_state {
DCCP_OPEN = TCP_ESTABLISHED,
@@ -408,8 +407,6 @@ struct dccp_ackvec;
* @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss
* @dccps_timestamp_time - time of latest TIMESTAMP option
* @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option
- * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options)
- * @dccps_pmtu_cookie - Last pmtu seen by socket
* @dccps_packet_size - Set thru setsockopt
* @dccps_role - Role of this sock, one of %dccp_role
* @dccps_ndp_count - number of Non Data Packets since last data packet
@@ -434,8 +431,6 @@ struct dccp_sock {
__u32 dccps_timestamp_echo;
__u32 dccps_packet_size;
unsigned long dccps_ndp_count;
- __u16 dccps_ext_header_len;
- __u32 dccps_pmtu_cookie;
__u32 dccps_mss_cache;
struct dccp_options dccps_options;
struct dccp_ackvec *dccps_hc_rx_ackvec;
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 5f49a30eb6f2..745c988359c0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -63,10 +63,11 @@ static inline int is_zero_ether_addr(const u8 *addr)
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if the address is a multicast address.
+ * By definition the broadcast address is also a multicast address.
*/
static inline int is_multicast_ether_addr(const u8 *addr)
{
- return ((addr[0] != 0xff) && (0x01 & addr[0]));
+ return (0x01 & addr[0]);
}
/**
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index e677f73f13dd..4fab3d0a4bce 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -157,8 +157,7 @@ struct pppox_proto {
extern int register_pppox_proto(int proto_num, struct pppox_proto *pp);
extern void unregister_pppox_proto(int proto_num);
extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */
-extern int pppox_channel_ioctl(struct ppp_channel *pc, unsigned int cmd,
- unsigned long arg);
+extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
/* PPPoX socket states */
enum {
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 33e8a19a1a0f..9e2eb9a602eb 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -16,6 +16,7 @@
*/
#ifndef _LINUX_IP_H
#define _LINUX_IP_H
+#include <linux/types.h>
#include <asm/byteorder.h>
#define IPTOS_TOS_MASK 0x1E
@@ -78,126 +79,6 @@
#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */
#define IPOPT_TS_PRESPEC 3 /* specified modules only */
-#ifdef __KERNEL__
-#include <linux/config.h>
-#include <linux/types.h>
-#include <net/request_sock.h>
-#include <net/sock.h>
-#include <linux/igmp.h>
-#include <net/flow.h>
-
-struct ip_options {
- __u32 faddr; /* Saved first hop address */
- unsigned char optlen;
- unsigned char srr;
- unsigned char rr;
- unsigned char ts;
- unsigned char is_setbyuser:1, /* Set by setsockopt? */
- is_data:1, /* Options in __data, rather than skb */
- is_strictroute:1, /* Strict source route */
- srr_is_hit:1, /* Packet destination addr was our one */
- is_changed:1, /* IP checksum more not valid */
- rr_needaddr:1, /* Need to record addr of outgoing dev */
- ts_needtime:1, /* Need to record timestamp */
- ts_needaddr:1; /* Need to record addr of outgoing dev */
- unsigned char router_alert;
- unsigned char __pad1;
- unsigned char __pad2;
- unsigned char __data[0];
-};
-
-#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
-
-struct inet_request_sock {
- struct request_sock req;
- u32 loc_addr;
- u32 rmt_addr;
- u16 rmt_port;
- u16 snd_wscale : 4,
- rcv_wscale : 4,
- tstamp_ok : 1,
- sack_ok : 1,
- wscale_ok : 1,
- ecn_ok : 1,
- acked : 1;
- struct ip_options *opt;
-};
-
-static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
-{
- return (struct inet_request_sock *)sk;
-}
-
-struct ipv6_pinfo;
-
-struct inet_sock {
- /* sk and pinet6 has to be the first two members of inet_sock */
- struct sock sk;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- struct ipv6_pinfo *pinet6;
-#endif
- /* Socket demultiplex comparisons on incoming packets. */
- __u32 daddr; /* Foreign IPv4 addr */
- __u32 rcv_saddr; /* Bound local IPv4 addr */
- __u16 dport; /* Destination port */
- __u16 num; /* Local port */
- __u32 saddr; /* Sending source */
- __s16 uc_ttl; /* Unicast TTL */
- __u16 cmsg_flags;
- struct ip_options *opt;
- __u16 sport; /* Source port */
- __u16 id; /* ID counter for DF pkts */
- __u8 tos; /* TOS */
- __u8 mc_ttl; /* Multicasting TTL */
- __u8 pmtudisc;
- unsigned recverr : 1,
- freebind : 1,
- hdrincl : 1,
- mc_loop : 1;
- int mc_index; /* Multicast device index */
- __u32 mc_addr;
- struct ip_mc_socklist *mc_list; /* Group array */
- /*
- * Following members are used to retain the infomation to build
- * an ip header on each ip fragmentation while the socket is corked.
- */
- struct {
- unsigned int flags;
- unsigned int fragsize;
- struct ip_options *opt;
- struct rtable *rt;
- int length; /* Total length of all frames */
- u32 addr;
- struct flowi fl;
- } cork;
-};
-
-#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
-#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
-
-static inline struct inet_sock *inet_sk(const struct sock *sk)
-{
- return (struct inet_sock *)sk;
-}
-
-static inline void __inet_sk_copy_descendant(struct sock *sk_to,
- const struct sock *sk_from,
- const int ancestor_size)
-{
- memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
- sk_from->sk_prot->obj_size - ancestor_size);
-}
-#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
-static inline void inet_sk_copy_descendant(struct sock *sk_to,
- const struct sock *sk_from)
-{
- __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
-}
-#endif
-#endif
-
-extern int inet_sk_rebuild_header(struct sock *sk);
-
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4,
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e0b922785d98..93bbed5c6cf4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -171,12 +171,13 @@ enum {
};
#ifdef __KERNEL__
-#include <linux/in6.h> /* struct sockaddr_in6 */
#include <linux/icmpv6.h>
-#include <net/if_inet6.h> /* struct ipv6_mc_socklist */
#include <linux/tcp.h>
#include <linux/udp.h>
+#include <net/if_inet6.h> /* struct ipv6_mc_socklist */
+#include <net/inet_sock.h>
+
/*
This structure contains results of exthdrs parsing
as offsets from skb->nh.
@@ -199,18 +200,17 @@ static inline int inet6_iif(const struct sk_buff *skb)
return IP6CB(skb)->iif;
}
-struct tcp6_request_sock {
- struct tcp_request_sock req;
+struct inet6_request_sock {
struct in6_addr loc_addr;
struct in6_addr rmt_addr;
struct sk_buff *pktopts;
int iif;
};
-static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk)
-{
- return (struct tcp6_request_sock *)sk;
-}
+struct tcp6_request_sock {
+ struct tcp_request_sock tcp6rsk_tcp;
+ struct inet6_request_sock tcp6rsk_inet6;
+};
/**
* struct ipv6_pinfo - ipv6 private area
@@ -298,12 +298,36 @@ struct tcp6_sock {
struct ipv6_pinfo inet6;
};
+extern int inet6_sk_rebuild_header(struct sock *sk);
+
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
{
return inet_sk(__sk)->pinet6;
}
+static inline struct inet6_request_sock *
+ inet6_rsk(const struct request_sock *rsk)
+{
+ return (struct inet6_request_sock *)(((u8 *)rsk) +
+ inet_rsk(rsk)->inet6_rsk_offset);
+}
+
+static inline u32 inet6_rsk_offset(struct request_sock *rsk)
+{
+ return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock);
+}
+
+static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
+{
+ struct request_sock *req = reqsk_alloc(ops);
+
+ if (req != NULL)
+ inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req);
+
+ return req;
+}
+
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
return (struct raw6_sock *)sk;
@@ -323,28 +347,37 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
#define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only)
#define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
-#include <linux/tcp.h>
+struct inet6_timewait_sock {
+ struct in6_addr tw_v6_daddr;
+ struct in6_addr tw_v6_rcv_saddr;
+};
struct tcp6_timewait_sock {
- struct tcp_timewait_sock tw_v6_sk;
- struct in6_addr tw_v6_daddr;
- struct in6_addr tw_v6_rcv_saddr;
+ struct tcp_timewait_sock tcp6tw_tcp;
+ struct inet6_timewait_sock tcp6tw_inet6;
};
-static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
+static inline u16 inet6_tw_offset(const struct proto *prot)
{
- return (struct tcp6_timewait_sock *)sk;
+ return prot->twsk_prot->twsk_obj_size -
+ sizeof(struct inet6_timewait_sock);
}
-static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
+{
+ return (struct inet6_timewait_sock *)(((u8 *)sk) +
+ inet_twsk(sk)->tw_ipv6_offset);
+}
+
+static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
- &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
+ &inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr;
}
-static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
{
- return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
+ return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL;
}
static inline int inet_v6_ipv6only(const struct sock *sk)
@@ -361,13 +394,19 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
return NULL;
}
+static inline struct inet6_request_sock *
+ inet6_rsk(const struct request_sock *rsk)
+{
+ return NULL;
+}
+
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
return NULL;
}
-#define __tcp_v6_rcv_saddr(__sk) NULL
-#define tcp_v6_rcv_saddr(__sk) NULL
+#define __inet6_rcv_saddr(__sk) NULL
+#define inet6_rcv_saddr(__sk) NULL
#define tcp_twsk_ipv6only(__sk) 0
#define inet_v6_ipv6only(__sk) 0
#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
diff --git a/include/linux/net.h b/include/linux/net.h
index d6a41e6577f6..28195a2d8ff0 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -107,7 +107,7 @@ enum sock_type {
struct socket {
socket_state state;
unsigned long flags;
- struct proto_ops *ops;
+ const struct proto_ops *ops;
struct fasync_struct *fasync_list;
struct file *file;
struct sock *sk;
@@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms
SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \
(file, sock, vma)) \
\
-static struct proto_ops name##_ops = { \
+static const struct proto_ops name##_ops = { \
.family = fam, \
.owner = THIS_MODULE, \
.release = __lock_##name##_release, \
diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index 724066778aff..6351c4055ace 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -216,6 +216,16 @@ struct sadb_x_nat_t_port {
} __attribute__((packed));
/* sizeof(struct sadb_x_nat_t_port) == 8 */
+/* Generic LSM security context */
+struct sadb_x_sec_ctx {
+ uint16_t sadb_x_sec_len;
+ uint16_t sadb_x_sec_exttype;
+ uint8_t sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */
+ uint8_t sadb_x_ctx_doi;
+ uint16_t sadb_x_ctx_len;
+} __attribute__((packed));
+/* sizeof(struct sadb_sec_ctx) = 8 */
+
/* Message types */
#define SADB_RESERVED 0
#define SADB_GETSPI 1
@@ -325,7 +335,8 @@ struct sadb_x_nat_t_port {
#define SADB_X_EXT_NAT_T_SPORT 21
#define SADB_X_EXT_NAT_T_DPORT 22
#define SADB_X_EXT_NAT_T_OA 23
-#define SADB_EXT_MAX 23
+#define SADB_X_EXT_SEC_CTX 24
+#define SADB_EXT_MAX 24
/* Identity Extension values */
#define SADB_IDENTTYPE_RESERVED 0
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e87b233615b3..d10f35338507 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -429,6 +429,7 @@ enum
TCA_NETEM_CORR,
TCA_NETEM_DELAY_DIST,
TCA_NETEM_REORDER,
+ TCA_NETEM_CORRUPT,
__TCA_NETEM_MAX,
};
@@ -457,6 +458,12 @@ struct tc_netem_reorder
__u32 correlation;
};
+struct tc_netem_corrupt
+{
+ __u32 probability;
+ __u32 correlation;
+};
+
#define NETEM_DIST_SCALE 8192
#endif
diff --git a/include/linux/random.h b/include/linux/random.h
index 7b2adb3322d5..5d6456bcdeba 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -52,9 +52,9 @@ extern void get_random_bytes(void *buf, int nbytes);
void generate_random_uuid(unsigned char uuid_out[16]);
extern __u32 secure_ip_id(__u32 daddr);
-extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
-extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr,
- __u16 dport);
+extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
+extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr,
+ __u16 dport);
extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
__u16 sport, __u16 dport);
extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr,
diff --git a/include/linux/security.h b/include/linux/security.h
index f7e0ae018712..ef753654daa5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -59,6 +59,12 @@ struct sk_buff;
struct sock;
struct sockaddr;
struct socket;
+struct flowi;
+struct dst_entry;
+struct xfrm_selector;
+struct xfrm_policy;
+struct xfrm_state;
+struct xfrm_user_sec_ctx;
extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
extern int cap_netlink_recv(struct sk_buff *skb);
@@ -788,6 +794,52 @@ struct swap_info_struct;
* which is used to copy security attributes between local stream sockets.
* @sk_free_security:
* Deallocate security structure.
+ * @sk_getsid:
+ * Retrieve the LSM-specific sid for the sock to enable caching of network
+ * authorizations.
+ *
+ * Security hooks for XFRM operations.
+ *
+ * @xfrm_policy_alloc_security:
+ * @xp contains the xfrm_policy being added to Security Policy Database
+ * used by the XFRM system.
+ * @sec_ctx contains the security context information being provided by
+ * the user-level policy update program (e.g., setkey).
+ * Allocate a security structure to the xp->selector.security field.
+ * The security field is initialized to NULL when the xfrm_policy is
+ * allocated.
+ * Return 0 if operation was successful (memory to allocate, legal context)
+ * @xfrm_policy_clone_security:
+ * @old contains an existing xfrm_policy in the SPD.
+ * @new contains a new xfrm_policy being cloned from old.
+ * Allocate a security structure to the new->selector.security field
+ * that contains the information from the old->selector.security field.
+ * Return 0 if operation was successful (memory to allocate).
+ * @xfrm_policy_free_security:
+ * @xp contains the xfrm_policy
+ * Deallocate xp->selector.security.
+ * @xfrm_state_alloc_security:
+ * @x contains the xfrm_state being added to the Security Association
+ * Database by the XFRM system.
+ * @sec_ctx contains the security context information being provided by
+ * the user-level SA generation program (e.g., setkey or racoon).
+ * Allocate a security structure to the x->sel.security field. The
+ * security field is initialized to NULL when the xfrm_state is
+ * allocated.
+ * Return 0 if operation was successful (memory to allocate, legal context).
+ * @xfrm_state_free_security:
+ * @x contains the xfrm_state.
+ * Deallocate x>sel.security.
+ * @xfrm_policy_lookup:
+ * @xp contains the xfrm_policy for which the access control is being
+ * checked.
+ * @sk_sid contains the sock security label that is used to authorize
+ * access to the policy xp.
+ * @dir contains the direction of the flow (input or output).
+ * Check permission when a sock selects a xfrm_policy for processing
+ * XFRMs on a packet. The hook is called when selecting either a
+ * per-socket policy or a generic xfrm policy.
+ * Return 0 if permission is granted.
*
* Security hooks affecting all Key Management operations
*
@@ -1237,8 +1289,18 @@ struct security_operations {
int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
void (*sk_free_security) (struct sock *sk);
+ unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
#endif /* CONFIG_SECURITY_NETWORK */
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+ int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+ int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
+ void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
+ int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+ void (*xfrm_state_free_security) (struct xfrm_state *x);
+ int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+#endif /* CONFIG_SECURITY_NETWORK_XFRM */
+
/* key management security hooks */
#ifdef CONFIG_KEYS
int (*key_alloc)(struct key *key);
@@ -2679,6 +2741,11 @@ static inline void security_sk_free(struct sock *sk)
{
return security_ops->sk_free_security(sk);
}
+
+static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+ return security_ops->sk_getsid(sk, fl, dir);
+}
#else /* CONFIG_SECURITY_NETWORK */
static inline int security_unix_stream_connect(struct socket * sock,
struct socket * other,
@@ -2795,8 +2862,73 @@ static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
static inline void security_sk_free(struct sock *sk)
{
}
+
+static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+ return 0;
+}
#endif /* CONFIG_SECURITY_NETWORK */
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+ return security_ops->xfrm_policy_alloc_security(xp, sec_ctx);
+}
+
+static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+ return security_ops->xfrm_policy_clone_security(old, new);
+}
+
+static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
+{
+ security_ops->xfrm_policy_free_security(xp);
+}
+
+static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+ return security_ops->xfrm_state_alloc_security(x, sec_ctx);
+}
+
+static inline void security_xfrm_state_free(struct xfrm_state *x)
+{
+ security_ops->xfrm_state_free_security(x);
+}
+
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+ return security_ops->xfrm_policy_lookup(xp, sk_sid, dir);
+}
+#else /* CONFIG_SECURITY_NETWORK_XFRM */
+static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+ return 0;
+}
+
+static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+ return 0;
+}
+
+static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
+{
+}
+
+static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+ return 0;
+}
+
+static inline void security_xfrm_state_free(struct xfrm_state *x)
+{
+}
+
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+ return 0;
+}
+#endif /* CONFIG_SECURITY_NETWORK_XFRM */
+
#ifdef CONFIG_KEYS
#ifdef CONFIG_SECURITY
static inline int security_key_alloc(struct key *key)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d6001a923..483cfc47ec34 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -32,7 +32,6 @@
#define HAVE_ALLOC_SKB /* For the drivers to know */
#define HAVE_ALIGNABLE_SKB /* Ditto 8) */
-#define SLAB_SKB /* Slabified skbuffs */
#define CHECKSUM_NONE 0
#define CHECKSUM_HW 1
@@ -134,7 +133,7 @@ struct skb_frag_struct {
*/
struct skb_shared_info {
atomic_t dataref;
- unsigned int nr_frags;
+ unsigned short nr_frags;
unsigned short tso_size;
unsigned short tso_segs;
unsigned short ufo_size;
@@ -1239,6 +1238,8 @@ extern int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
int hlen,
struct iovec *iov);
extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
+extern void skb_kill_datagram(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags);
extern unsigned int skb_checksum(const struct sk_buff *skb, int offset,
int len, unsigned int csum);
extern int skb_copy_bits(const struct sk_buff *skb, int offset,
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 1739c2d5b95b..9f4019156fd8 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -27,7 +27,6 @@ struct __kernel_sockaddr_storage {
#include <linux/compiler.h> /* __user */
extern int sysctl_somaxconn;
-extern void sock_init(void);
#ifdef CONFIG_PROC_FS
struct seq_file;
extern void socket_seq_show(struct seq_file *seq);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f7..93fa765e47d3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
NET_TCP_CONG_CONTROL=110,
NET_TCP_ABC=111,
+ NET_IPV4_IPFRAG_MAX_DIST=112,
};
enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0e1da6602e05..f2bb2396853f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -55,22 +55,6 @@ struct tcphdr {
__u16 urg_ptr;
};
-#define TCP_ACTION_FIN (1 << 7)
-
-enum {
- TCPF_ESTABLISHED = (1 << 1),
- TCPF_SYN_SENT = (1 << 2),
- TCPF_SYN_RECV = (1 << 3),
- TCPF_FIN_WAIT1 = (1 << 4),
- TCPF_FIN_WAIT2 = (1 << 5),
- TCPF_TIME_WAIT = (1 << 6),
- TCPF_CLOSE = (1 << 7),
- TCPF_CLOSE_WAIT = (1 << 8),
- TCPF_LAST_ACK = (1 << 9),
- TCPF_LISTEN = (1 << 10),
- TCPF_CLOSING = (1 << 11)
-};
-
/*
* The union cast uses a gcc extension to avoid aliasing problems
* (union is compatible to any of its members)
@@ -254,10 +238,9 @@ struct tcp_sock {
__u32 snd_wl1; /* Sequence for window update */
__u32 snd_wnd; /* The window we expect to receive */
__u32 max_window; /* Maximal window ever seen from peer */
- __u32 pmtu_cookie; /* Last pmtu seen by socket */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u16 xmit_size_goal; /* Goal for segmenting output packets */
- __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
+ /* XXX Two bytes hole, try to pack */
__u32 window_clamp; /* Maximal window to advertise */
__u32 rcv_ssthresh; /* Current window clamp */
@@ -295,8 +278,6 @@ struct tcp_sock {
struct sk_buff_head out_of_order_queue; /* Out of order segments go here */
- struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
-
__u32 rcv_wnd; /* Current receiver window */
__u32 rcv_wup; /* rcv_nxt on last window update sent */
__u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
diff --git a/include/linux/udp.h b/include/linux/udp.h
index b60e0b4a25c4..85a55658831c 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,10 +35,10 @@ struct udphdr {
#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */
#ifdef __KERNEL__
-
#include <linux/config.h>
-#include <net/sock.h>
-#include <linux/ip.h>
+#include <linux/types.h>
+
+#include <net/inet_sock.h>
struct udp_sock {
/* inet_sock has to be the first member */
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 0fb077d68441..82fbb758e28f 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -27,6 +27,22 @@ struct xfrm_id
__u8 proto;
};
+struct xfrm_sec_ctx {
+ __u8 ctx_doi;
+ __u8 ctx_alg;
+ __u16 ctx_len;
+ __u32 ctx_sid;
+ char ctx_str[0];
+};
+
+/* Security Context Domains of Interpretation */
+#define XFRM_SC_DOI_RESERVED 0
+#define XFRM_SC_DOI_LSM 1
+
+/* Security Context Algorithms */
+#define XFRM_SC_ALG_RESERVED 0
+#define XFRM_SC_ALG_SELINUX 1
+
/* Selector, used as selector both on policy rules (SPD) and SAs. */
struct xfrm_selector
@@ -146,6 +162,18 @@ enum {
#define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)
+/*
+ * Generic LSM security context for comunicating to user space
+ * NOTE: Same format as sadb_x_sec_ctx
+ */
+struct xfrm_user_sec_ctx {
+ __u16 len;
+ __u16 exttype;
+ __u8 ctx_alg; /* LSMs: e.g., selinux == 1 */
+ __u8 ctx_doi;
+ __u16 ctx_len;
+};
+
struct xfrm_user_tmpl {
struct xfrm_id id;
__u16 family;
@@ -176,6 +204,7 @@ enum xfrm_attr_type_t {
XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */
XFRMA_SA,
XFRMA_POLICY,
+ XFRMA_SEC_CTX, /* struct xfrm_sec_ctx */
__XFRMA_MAX
#define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index b5d785ab4a0e..bfc1779fc753 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -13,7 +13,7 @@ extern void unix_gc(void);
#define UNIX_HASH_SIZE 256
extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-extern rwlock_t unix_table_lock;
+extern spinlock_t unix_table_lock;
extern atomic_t unix_tot_inflight;
@@ -58,10 +58,10 @@ struct unix_skb_parms {
#define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb))
#define UNIXCREDS(skb) (&UNIXCB((skb)).creds)
-#define unix_state_rlock(s) read_lock(&unix_sk(s)->lock)
-#define unix_state_runlock(s) read_unlock(&unix_sk(s)->lock)
-#define unix_state_wlock(s) write_lock(&unix_sk(s)->lock)
-#define unix_state_wunlock(s) write_unlock(&unix_sk(s)->lock)
+#define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock)
+#define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock)
+#define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock)
+#define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock)
#ifdef __KERNEL__
/* The AF_UNIX socket */
@@ -76,7 +76,7 @@ struct unix_sock {
struct sock *other;
struct sock *gc_tree;
atomic_t inflight;
- rwlock_t lock;
+ spinlock_t lock;
wait_queue_head_t peer_wait;
};
#define unix_sk(__sk) ((struct unix_sock *)__sk)
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
index 47048b1d179a..90fcc98e676f 100644
--- a/include/net/atmclip.h
+++ b/include/net/atmclip.h
@@ -7,7 +7,6 @@
#define _ATMCLIP_H
#include <linux/netdevice.h>
-#include <linux/skbuff.h>
#include <linux/atm.h>
#include <linux/atmdev.h>
#include <linux/atmarp.h>
@@ -18,6 +17,7 @@
#define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back))
#define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key)
+struct sk_buff;
struct clip_vcc {
struct atm_vcc *vcc; /* VCC descriptor */
diff --git a/include/net/dst.h b/include/net/dst.h
index 6c196a5baf24..bee8b84d329d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -9,6 +9,7 @@
#define _NET_DST_H
#include <linux/config.h>
+#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/jiffies.h>
diff --git a/include/net/flow.h b/include/net/flow.h
index 9a5c94b1a0ec..ec7eb86eb203 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -84,11 +84,12 @@ struct flowi {
#define FLOW_DIR_OUT 1
#define FLOW_DIR_FWD 2
-typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir,
+struct sock;
+typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
void **objp, atomic_t **obj_refp);
-extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
- flow_resolve_t resolver);
+extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
+ flow_resolve_t resolver);
extern void flow_cache_flush(void);
extern atomic_t flow_cache_genid;
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 52d8b1a73d52..c5b96b2b8155 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -60,7 +60,7 @@ struct genl_info
*/
struct genl_ops
{
- unsigned int cmd;
+ u8 cmd;
unsigned int flags;
struct nla_policy *policy;
int (*doit)(struct sk_buff *skb,
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 6cdebeee5f96..e7c3f20fbafc 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -20,12 +20,9 @@
#include <linux/config.h>
#include <linux/icmp.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/protocol.h>
+#include <net/inet_sock.h>
#include <net/snmp.h>
-#include <linux/ip.h>
struct icmp_err {
int errno;
@@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field)
#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field)
+struct dst_entry;
+struct net_proto_family;
+struct sk_buff;
+
extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info);
extern int icmp_rcv(struct sk_buff *skb);
extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg);
diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h
index 225fc751d464..03b766afdc39 100644
--- a/include/net/ieee80211_crypt.h
+++ b/include/net/ieee80211_crypt.h
@@ -23,12 +23,17 @@
#ifndef IEEE80211_CRYPT_H
#define IEEE80211_CRYPT_H
-#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
enum {
IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0),
};
+struct sk_buff;
+struct module;
+
struct ieee80211_crypto_ops {
const char *name;
struct list_head list;
@@ -87,6 +92,8 @@ struct ieee80211_crypt_data {
atomic_t refcnt;
};
+struct ieee80211_device;
+
int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
new file mode 100644
index 000000000000..b33b438bffcc
--- /dev/null
+++ b/include/net/inet6_connection_sock.h
@@ -0,0 +1,42 @@
+/*
+ * NET Generic infrastructure for INET6 connection oriented protocols.
+ *
+ * Authors: Many people, see the TCPv6 sources
+ *
+ * From code originally in TCPv6
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET6_CONNECTION_SOCK_H
+#define _INET6_CONNECTION_SOCK_H
+
+#include <linux/types.h>
+
+struct in6_addr;
+struct inet_bind_bucket;
+struct request_sock;
+struct sk_buff;
+struct sock;
+struct sockaddr;
+
+extern int inet6_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb);
+
+extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __u16 rport,
+ const struct in6_addr *raddr,
+ const struct in6_addr *laddr,
+ const int iif);
+
+extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+ struct request_sock *req,
+ const unsigned long timeout);
+
+extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
+
+extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok);
+#endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 5a2beed5a770..25f708ff020e 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -48,6 +48,32 @@ static inline int inet6_sk_ehashfn(const struct sock *sk)
return inet6_ehashfn(laddr, lport, faddr, fport);
}
+static inline void __inet6_hash(struct inet_hashinfo *hashinfo,
+ struct sock *sk)
+{
+ struct hlist_head *list;
+ rwlock_t *lock;
+
+ BUG_TRAP(sk_unhashed(sk));
+
+ if (sk->sk_state == TCP_LISTEN) {
+ list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &hashinfo->lhash_lock;
+ inet_listen_wlock(hashinfo);
+ } else {
+ unsigned int hash;
+ sk->sk_hash = hash = inet6_sk_ehashfn(sk);
+ hash &= (hashinfo->ehash_size - 1);
+ list = &hashinfo->ehash[hash].chain;
+ lock = &hashinfo->ehash[hash].lock;
+ write_lock(lock);
+ }
+
+ __sk_add_node(sk, list);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(lock);
+}
+
/*
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
@@ -84,10 +110,10 @@ static inline struct sock *
if(*((__u32 *)&(tw->tw_dport)) == ports &&
sk->sk_family == PF_INET6) {
- const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+ const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
- if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
+ if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
(!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
goto hit;
}
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index f943306ce5ff..227adcbdfec8 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -1,8 +1,8 @@
#ifndef _INET_COMMON_H
#define _INET_COMMON_H
-extern struct proto_ops inet_stream_ops;
-extern struct proto_ops inet_dgram_ops;
+extern const struct proto_ops inet_stream_ops;
+extern const struct proto_ops inet_dgram_ops;
/*
* INET4 prototypes used by INET6
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b0c99060b78d..50234fa56a68 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -15,9 +15,11 @@
#ifndef _INET_CONNECTION_SOCK_H
#define _INET_CONNECTION_SOCK_H
-#include <linux/ip.h>
+#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/timer.h>
+
+#include <net/inet_sock.h>
#include <net/request_sock.h>
#define INET_CSK_DEBUG 1
@@ -29,6 +31,29 @@ struct inet_bind_bucket;
struct inet_hashinfo;
struct tcp_congestion_ops;
+/*
+ * Pointers to address related TCP functions
+ * (i.e. things that depend on the address family)
+ */
+struct inet_connection_sock_af_ops {
+ int (*queue_xmit)(struct sk_buff *skb, int ipfragok);
+ void (*send_check)(struct sock *sk, int len,
+ struct sk_buff *skb);
+ int (*rebuild_header)(struct sock *sk);
+ int (*conn_request)(struct sock *sk, struct sk_buff *skb);
+ struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+ int (*remember_stamp)(struct sock *sk);
+ __u16 net_header_len;
+ int (*setsockopt)(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen);
+ int (*getsockopt)(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+ void (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
+ int sockaddr_len;
+};
+
/** inet_connection_sock - INET connection oriented sock
*
* @icsk_accept_queue: FIFO of established children
@@ -36,13 +61,16 @@ struct tcp_congestion_ops;
* @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout
+ * @icsk_pmtu_cookie Last pmtu seen by socket
* @icsk_ca_ops Pluggable congestion control hook
+ * @icsk_af_ops Operations which are AF_INET{4,6} specific
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
* @icsk_backoff: Backoff
* @icsk_syn_retries: Number of allowed SYN (or equivalent) retries
* @icsk_probes_out: unanswered 0 window probes
+ * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
* @icsk_ack: Delayed ACK control data
*/
struct inet_connection_sock {
@@ -54,14 +82,17 @@ struct inet_connection_sock {
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
__u32 icsk_rto;
+ __u32 icsk_pmtu_cookie;
struct tcp_congestion_ops *icsk_ca_ops;
+ struct inet_connection_sock_af_ops *icsk_af_ops;
+ unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
__u8 icsk_syn_retries;
__u8 icsk_probes_out;
- /* 2 BYTES HOLE, TRY TO PACK! */
+ __u16 icsk_ext_hdr_len;
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
@@ -192,8 +223,12 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk,
const __u16 rport,
const __u32 raddr,
const __u32 laddr);
+extern int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb);
extern int inet_csk_get_port(struct inet_hashinfo *hashinfo,
- struct sock *sk, unsigned short snum);
+ struct sock *sk, unsigned short snum,
+ int (*bind_conflict)(const struct sock *sk,
+ const struct inet_bind_bucket *tb));
extern struct dst_entry* inet_csk_route_req(struct sock *sk,
const struct request_sock *req);
@@ -207,7 +242,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk,
extern void inet_csk_reqsk_queue_hash_add(struct sock *sk,
struct request_sock *req,
- const unsigned timeout);
+ unsigned long timeout);
static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
struct request_sock *req)
@@ -273,4 +308,6 @@ static inline unsigned int inet_csk_listen_poll(const struct sock *sk)
extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries);
extern void inet_csk_listen_stop(struct sock *sk);
+extern void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
+
#endif /* _INET_CONNECTION_SOCK_H */
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index b0c47e2eccf1..d599c6bfbb86 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -3,6 +3,8 @@
#include <linux/ip.h>
#include <linux/skbuff.h>
+
+#include <net/inet_sock.h>
#include <net/dsfield.h>
enum {
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 07840baa9341..135d80fd658e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -26,6 +26,7 @@
#include <linux/wait.h>
#include <net/inet_connection_sock.h>
+#include <net/inet_sock.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -128,26 +129,6 @@ struct inet_hashinfo {
kmem_cache_t *bind_bucket_cachep;
};
-static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
- const __u32 faddr, const __u16 fport)
-{
- unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
- h ^= h >> 16;
- h ^= h >> 8;
- return h;
-}
-
-static inline int inet_sk_ehashfn(const struct sock *sk)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const __u32 laddr = inet->rcv_saddr;
- const __u16 lport = inet->num;
- const __u32 faddr = inet->daddr;
- const __u16 fport = inet->dport;
-
- return inet_ehashfn(laddr, lport, faddr, fport);
-}
-
static inline struct inet_ehash_bucket *inet_ehash_bucket(
struct inet_hashinfo *hashinfo,
unsigned int hash)
@@ -434,4 +415,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
return sk;
}
+
+extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk);
#endif /* _INET_HASHTABLES_H */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
new file mode 100644
index 000000000000..883eb529ef8e
--- /dev/null
+++ b/include/net/inet_sock.h
@@ -0,0 +1,193 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Definitions for inet_sock
+ *
+ * Authors: Many, reorganised here by
+ * Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET_SOCK_H
+#define _INET_SOCK_H
+
+#include <linux/config.h>
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <net/flow.h>
+#include <net/sock.h>
+#include <net/request_sock.h>
+
+/** struct ip_options - IP Options
+ *
+ * @faddr - Saved first hop address
+ * @is_setbyuser - Set by setsockopt?
+ * @is_data - Options in __data, rather than skb
+ * @is_strictroute - Strict source route
+ * @srr_is_hit - Packet destination addr was our one
+ * @is_changed - IP checksum more not valid
+ * @rr_needaddr - Need to record addr of outgoing dev
+ * @ts_needtime - Need to record timestamp
+ * @ts_needaddr - Need to record addr of outgoing dev
+ */
+struct ip_options {
+ __u32 faddr;
+ unsigned char optlen;
+ unsigned char srr;
+ unsigned char rr;
+ unsigned char ts;
+ unsigned char is_setbyuser:1,
+ is_data:1,
+ is_strictroute:1,
+ srr_is_hit:1,
+ is_changed:1,
+ rr_needaddr:1,
+ ts_needtime:1,
+ ts_needaddr:1;
+ unsigned char router_alert;
+ unsigned char __pad1;
+ unsigned char __pad2;
+ unsigned char __data[0];
+};
+
+#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
+
+struct inet_request_sock {
+ struct request_sock req;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ u16 inet6_rsk_offset;
+ /* 2 bytes hole, try to pack */
+#endif
+ u32 loc_addr;
+ u32 rmt_addr;
+ u16 rmt_port;
+ u16 snd_wscale : 4,
+ rcv_wscale : 4,
+ tstamp_ok : 1,
+ sack_ok : 1,
+ wscale_ok : 1,
+ ecn_ok : 1,
+ acked : 1;
+ struct ip_options *opt;
+};
+
+static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
+{
+ return (struct inet_request_sock *)sk;
+}
+
+struct ip_mc_socklist;
+struct ipv6_pinfo;
+struct rtable;
+
+/** struct inet_sock - representation of INET sockets
+ *
+ * @sk - ancestor class
+ * @pinet6 - pointer to IPv6 control block
+ * @daddr - Foreign IPv4 addr
+ * @rcv_saddr - Bound local IPv4 addr
+ * @dport - Destination port
+ * @num - Local port
+ * @saddr - Sending source
+ * @uc_ttl - Unicast TTL
+ * @sport - Source port
+ * @id - ID counter for DF pkts
+ * @tos - TOS
+ * @mc_ttl - Multicasting TTL
+ * @is_icsk - is this an inet_connection_sock?
+ * @mc_index - Multicast device index
+ * @mc_list - Group array
+ * @cork - info to build ip hdr on each ip frag while socket is corked
+ */
+struct inet_sock {
+ /* sk and pinet6 has to be the first two members of inet_sock */
+ struct sock sk;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ struct ipv6_pinfo *pinet6;
+#endif
+ /* Socket demultiplex comparisons on incoming packets. */
+ __u32 daddr;
+ __u32 rcv_saddr;
+ __u16 dport;
+ __u16 num;
+ __u32 saddr;
+ __s16 uc_ttl;
+ __u16 cmsg_flags;
+ struct ip_options *opt;
+ __u16 sport;
+ __u16 id;
+ __u8 tos;
+ __u8 mc_ttl;
+ __u8 pmtudisc;
+ __u8 recverr:1,
+ is_icsk:1,
+ freebind:1,
+ hdrincl:1,
+ mc_loop:1;
+ int mc_index;
+ __u32 mc_addr;
+ struct ip_mc_socklist *mc_list;
+ struct {
+ unsigned int flags;
+ unsigned int fragsize;
+ struct ip_options *opt;
+ struct rtable *rt;
+ int length; /* Total length of all frames */
+ u32 addr;
+ struct flowi fl;
+ } cork;
+};
+
+#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
+#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
+
+static inline struct inet_sock *inet_sk(const struct sock *sk)
+{
+ return (struct inet_sock *)sk;
+}
+
+static inline void __inet_sk_copy_descendant(struct sock *sk_to,
+ const struct sock *sk_from,
+ const int ancestor_size)
+{
+ memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
+ sk_from->sk_prot->obj_size - ancestor_size);
+}
+#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
+static inline void inet_sk_copy_descendant(struct sock *sk_to,
+ const struct sock *sk_from)
+{
+ __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
+}
+#endif
+
+extern int inet_sk_rebuild_header(struct sock *sk);
+
+static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
+ const __u32 faddr, const __u16 fport)
+{
+ unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
+ h ^= h >> 16;
+ h ^= h >> 8;
+ return h;
+}
+
+static inline int inet_sk_ehashfn(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const __u32 laddr = inet->rcv_saddr;
+ const __u16 lport = inet->num;
+ const __u32 faddr = inet->daddr;
+ const __u16 fport = inet->dport;
+
+ return inet_ehashfn(laddr, lport, faddr, fport);
+}
+
+#endif /* _INET_SOCK_H */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 28f7b2103505..1da294c47522 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -17,15 +17,16 @@
#include <linux/config.h>
-#include <linux/ip.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/tcp_states.h>
+#include <net/timewait_sock.h>
#include <asm/atomic.h>
@@ -127,7 +128,8 @@ struct inet_timewait_sock {
__u16 tw_num;
/* And these are ours. */
__u8 tw_ipv6only:1;
- /* 31 bits hole, try to pack */
+ /* 15 bits hole, try to pack */
+ __u16 tw_ipv6_offset;
int tw_timeout;
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
@@ -199,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw)
printk(KERN_DEBUG "%s timewait_sock %p released\n",
tw->tw_prot->name, tw);
#endif
- kmem_cache_free(tw->tw_prot->twsk_slab, tw);
+ kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
}
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 7fda471002b6..0965515f40cf 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -25,6 +25,7 @@ struct inet_peer
__u32 v4daddr; /* peer's address */
__u16 avl_height;
__u16 ip_id_count; /* IP ID for the next packet */
+ atomic_t rid; /* Frag reception counter */
__u32 tcp_ts;
unsigned long tcp_ts_stamp;
};
diff --git a/include/net/ip.h b/include/net/ip.h
index e4563bbee6ea..f7e7fd728b67 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -24,14 +24,10 @@
#include <linux/config.h>
#include <linux/types.h>
-#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/in.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/in_route.h>
-#include <net/route.h>
-#include <net/arp.h>
+
+#include <net/inet_sock.h>
#include <net/snmp.h>
struct sock;
@@ -45,6 +41,7 @@ struct inet_skb_parm
#define IPSKB_TRANSLATED 2
#define IPSKB_FORWARDED 4
#define IPSKB_XFRM_TUNNEL_SIZE 8
+#define IPSKB_FRAG_COMPLETE 16
};
struct ipcm_cookie
@@ -74,6 +71,13 @@ extern rwlock_t ip_ra_lock;
#define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */
+struct msghdr;
+struct net_device;
+struct packet_type;
+struct rtable;
+struct sk_buff;
+struct sockaddr;
+
extern void ip_mc_dropsocket(struct sock *);
extern void ip_mc_dropdevice(struct net_device *dev);
extern int igmp_mc_proc_init(void);
@@ -168,6 +172,7 @@ extern int sysctl_ipfrag_high_thresh;
extern int sysctl_ipfrag_low_thresh;
extern int sysctl_ipfrag_time;
extern int sysctl_ipfrag_secret_interval;
+extern int sysctl_ipfrag_max_dist;
/* From inetpeer.c */
extern int inet_peer_threshold;
@@ -182,6 +187,8 @@ extern int sysctl_ip_dynaddr;
extern void ipfrag_init(void);
#ifdef CONFIG_INET
+#include <net/dst.h>
+
/* The function in 2.2 was invalid, producing wrong result for
* check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
static inline
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 14de4ebd1211..e000fa2cd5f6 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
struct net_device *dev, u32 *spec_dst, u32 *itag);
extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res);
+struct rtentry;
+
/* Exported by fib_semantics.c */
extern int ip_fib_check_default(u32 gw, struct net_device *dev);
extern int fib_sync_down(u32 local, struct net_device *dev, int force);
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 3b5559a023a4..7d2674fde19a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -251,16 +251,15 @@ struct ip_vs_daemon_user {
#include <linux/config.h>
#include <linux/list.h> /* for struct list_head */
#include <linux/spinlock.h> /* for struct rwlock_t */
-#include <linux/skbuff.h> /* for struct sk_buff */
-#include <linux/ip.h> /* for struct iphdr */
#include <asm/atomic.h> /* for struct atomic_t */
-#include <linux/netdevice.h> /* for struct neighbour */
-#include <net/dst.h> /* for struct dst_entry */
-#include <net/udp.h>
#include <linux/compiler.h>
+#include <linux/timer.h>
+#include <net/checksum.h>
#ifdef CONFIG_IP_VS_DEBUG
+#include <linux/net.h>
+
extern int ip_vs_get_debug_level(void);
#define IP_VS_DBG(level, msg...) \
do { \
@@ -429,8 +428,11 @@ struct ip_vs_stats
spinlock_t lock; /* spin lock */
};
+struct dst_entry;
+struct iphdr;
struct ip_vs_conn;
struct ip_vs_app;
+struct sk_buff;
struct ip_vs_protocol {
struct ip_vs_protocol *next;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0a2ad51cff82..860bbac4c4ee 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -240,6 +240,8 @@ extern struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_t
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
struct ipv6_txoptions *opt);
+extern int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb);
+
extern int ip6_frag_nqueues;
extern atomic_t ip6_frag_mem;
@@ -525,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
extern int inet6_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg);
+extern int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk);
+
/*
* reassembly.c
*/
@@ -533,8 +538,11 @@ extern int sysctl_ip6frag_low_thresh;
extern int sysctl_ip6frag_time;
extern int sysctl_ip6frag_secret_interval;
-extern struct proto_ops inet6_stream_ops;
-extern struct proto_ops inet6_dgram_ops;
+extern const struct proto_ops inet6_stream_ops;
+extern const struct proto_ops inet6_dgram_ops;
+
+struct group_source_req;
+struct group_filter;
extern int ip6_mc_source(int add, int omode, struct sock *sk,
struct group_source_req *pgsr);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index f85d6e4b7442..bbac87eeb422 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -35,11 +35,20 @@ enum {
#ifdef __KERNEL__
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include <linux/config.h>
+#include <linux/compiler.h>
#include <linux/icmpv6.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+
#include <net/neighbour.h>
-#include <asm/atomic.h>
+
+struct ctl_table;
+struct file;
+struct inet6_dev;
+struct net_device;
+struct net_proto_family;
+struct sk_buff;
extern struct neigh_table nd_tbl;
@@ -108,7 +117,7 @@ extern int igmp6_event_report(struct sk_buff *skb);
extern void igmp6_cleanup(void);
#ifdef CONFIG_SYSCTL
-extern int ndisc_ifinfo_sysctl_change(ctl_table *ctl,
+extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
int write,
struct file * filp,
void __user *buffer,
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 34c07731933d..6fa9ae190741 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -49,8 +49,8 @@
#ifdef __KERNEL__
#include <asm/atomic.h>
-#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <linux/skbuff.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h
index bd08964b72c0..b225d8472b7e 100644
--- a/include/net/pkt_act.h
+++ b/include/net/pkt_act.h
@@ -15,7 +15,6 @@
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
-#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 357691f6a45f..63f7db99c2a6 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -65,7 +65,7 @@ struct inet_protosw {
int protocol; /* This is the L4 protocol number. */
struct proto *prot;
- struct proto_ops *ops;
+ const struct proto_ops *ops;
int capability; /* Which (if any) capability do
* we need to use this socket
@@ -76,6 +76,7 @@ struct inet_protosw {
};
#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
+#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */
extern struct net_protocol *inet_protocol_base;
extern struct net_protocol *inet_protos[MAX_INET_PROTOS];
diff --git a/include/net/raw.h b/include/net/raw.h
index f47917469b12..e67b28a0248c 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -19,6 +19,8 @@
#include <linux/config.h>
+#include <net/protocol.h>
+
extern struct proto raw_prot;
extern void raw_err(struct sock *, struct sk_buff *, u32 info);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b52cc52ffe39..11641c9384f7 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -244,7 +244,7 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
- unsigned timeout)
+ unsigned long timeout)
{
struct listen_sock *lopt = queue->listen_opt;
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 8e7794ee27ff..f5c22d77feab 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -277,6 +277,24 @@ struct sctp_sock {
__u32 default_context;
__u32 default_timetolive;
+ /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
+ * the destination address every heartbeat interval. This value
+ * will be inherited by all new associations.
+ */
+ __u32 hbinterval;
+
+ /* This is the max_retrans value for new associations. */
+ __u16 pathmaxrxt;
+
+ /* The initial Path MTU to use for new associations. */
+ __u32 pathmtu;
+
+ /* The default SACK delay timeout for new associations. */
+ __u32 sackdelay;
+
+ /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */
+ __u32 param_flags;
+
struct sctp_initmsg initmsg;
struct sctp_rtoinfo rtoinfo;
struct sctp_paddrparams paddrparam;
@@ -845,9 +863,6 @@ struct sctp_transport {
/* Data that has been sent, but not acknowledged. */
__u32 flight_size;
- /* PMTU : The current known path MTU. */
- __u32 pmtu;
-
/* Destination */
struct dst_entry *dst;
/* Source address. */
@@ -862,7 +877,22 @@ struct sctp_transport {
/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
* the destination address every heartbeat interval.
*/
- int hb_interval;
+ __u32 hbinterval;
+
+ /* This is the max_retrans value for the transport and will
+ * be initialized from the assocs value. This can be changed
+ * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+ */
+ __u16 pathmaxrxt;
+
+ /* PMTU : The current known path MTU. */
+ __u32 pathmtu;
+
+ /* SACK delay timeout */
+ __u32 sackdelay;
+
+ /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */
+ __u32 param_flags;
/* When was the last time (in jiffies) that we heard from this
* transport? We use this to pick new active and retran paths.
@@ -882,22 +912,11 @@ struct sctp_transport {
*/
int state;
- /* hb_allowed : The current heartbeat state of this destination,
- * : i.e. ALLOW-HB, NO-HEARTBEAT, etc.
- */
- int hb_allowed;
-
/* These are the error stats for this destination. */
/* Error count : The current error count for this destination. */
unsigned short error_count;
- /* This is the max_retrans value for the transport and will
- * be initialized to proto.max_retrans.path. This can be changed
- * using SCTP_SET_PEER_ADDR_PARAMS socket option.
- */
- int max_retrans;
-
/* Per : A timer used by each destination.
* Destination :
* Timer :
@@ -1502,6 +1521,28 @@ struct sctp_association {
/* The largest timeout or RTO value to use in attempting an INIT */
__u16 max_init_timeo;
+ /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
+ * the destination address every heartbeat interval. This value
+ * will be inherited by all new transports.
+ */
+ __u32 hbinterval;
+
+ /* This is the max_retrans value for new transports in the
+ * association.
+ */
+ __u16 pathmaxrxt;
+
+ /* Association : The smallest PMTU discovered for all of the
+ * PMTU : peer's transport addresses.
+ */
+ __u32 pathmtu;
+
+ /* SACK delay timeout */
+ __u32 sackdelay;
+
+ /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */
+ __u32 param_flags;
+
int timeouts[SCTP_NUM_TIMEOUT_TYPES];
struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES];
@@ -1571,11 +1612,6 @@ struct sctp_association {
*/
wait_queue_head_t wait;
- /* Association : The smallest PMTU discovered for all of the
- * PMTU : peer's transport addresses.
- */
- __u32 pmtu;
-
/* The message size at which SCTP fragmentation will occur. */
__u32 frag_point;
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index f1c3bc54526a..8a6bef6f91eb 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,8 @@ enum sctp_optname {
#define SCTP_STATUS SCTP_STATUS
SCTP_GET_PEER_ADDR_INFO,
#define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO
+ SCTP_DELAYED_ACK_TIME,
+#define SCTP_DELAYED_ACK_TIME SCTP_DELAYED_ACK_TIME
/* Internal Socket Options. Some of the sctp library functions are
* implemented using these socket options.
@@ -503,13 +505,41 @@ struct sctp_setadaption {
* unreachable. The following structure is used to access and modify an
* address's parameters:
*/
+enum sctp_spp_flags {
+ SPP_HB_ENABLE = 1, /*Enable heartbeats*/
+ SPP_HB_DISABLE = 2, /*Disable heartbeats*/
+ SPP_HB = SPP_HB_ENABLE | SPP_HB_DISABLE,
+ SPP_HB_DEMAND = 4, /*Send heartbeat immediately*/
+ SPP_PMTUD_ENABLE = 8, /*Enable PMTU discovery*/
+ SPP_PMTUD_DISABLE = 16, /*Disable PMTU discovery*/
+ SPP_PMTUD = SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE,
+ SPP_SACKDELAY_ENABLE = 32, /*Enable SACK*/
+ SPP_SACKDELAY_DISABLE = 64, /*Disable SACK*/
+ SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE,
+};
+
struct sctp_paddrparams {
sctp_assoc_t spp_assoc_id;
struct sockaddr_storage spp_address;
__u32 spp_hbinterval;
__u16 spp_pathmaxrxt;
+ __u32 spp_pathmtu;
+ __u32 spp_sackdelay;
+ __u32 spp_flags;
} __attribute__((packed, aligned(4)));
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ * This options will get or set the delayed ack timer. The time is set
+ * in milliseconds. If the assoc_id is 0, then this sets or gets the
+ * endpoints default delayed ack timer value. If the assoc_id field is
+ * non-zero, then the set or get effects the specified association.
+ */
+struct sctp_assoc_value {
+ sctp_assoc_t assoc_id;
+ uint32_t assoc_value;
+};
+
/*
* 7.2.2 Peer Address Information
*
diff --git a/include/net/sock.h b/include/net/sock.h
index 982b4ecd187b..6961700ff3a0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk);
extern int sk_wait_data(struct sock *sk, long *timeo);
struct request_sock_ops;
+struct timewait_sock_ops;
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
@@ -557,11 +558,10 @@ struct proto {
kmem_cache_t *slab;
unsigned int obj_size;
- kmem_cache_t *twsk_slab;
- unsigned int twsk_obj_size;
atomic_t *orphan_count;
struct request_sock_ops *rsk_prot;
+ struct timewait_sock_ops *twsk_prot;
struct module *owner;
@@ -926,6 +926,29 @@ static inline void sock_put(struct sock *sk)
sk_free(sk);
}
+static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = NET_RX_SUCCESS;
+
+ if (sk_filter(sk, skb, 0))
+ goto discard_and_relse;
+
+ skb->dev = NULL;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk))
+ rc = sk->sk_backlog_rcv(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+out:
+ sock_put(sk);
+ return rc;
+discard_and_relse:
+ kfree_skb(skb);
+ goto out;
+}
+
/* Detach socket from process context.
* Announce socket dead, detach it from wait queue and inode.
* Note that parent inode held reference count on this struct sock,
@@ -1166,7 +1189,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
static inline int sock_error(struct sock *sk)
{
- int err = xchg(&sk->sk_err, 0);
+ int err;
+ if (likely(!sk->sk_err))
+ return 0;
+ err = xchg(&sk->sk_err, 0);
return -err;
}
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d78025f9fbea..77f21c65bbca 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -225,53 +225,6 @@ extern atomic_t tcp_sockets_allocated;
extern int tcp_memory_pressure;
/*
- * Pointers to address related TCP functions
- * (i.e. things that depend on the address family)
- */
-
-struct tcp_func {
- int (*queue_xmit) (struct sk_buff *skb,
- int ipfragok);
-
- void (*send_check) (struct sock *sk,
- struct tcphdr *th,
- int len,
- struct sk_buff *skb);
-
- int (*rebuild_header) (struct sock *sk);
-
- int (*conn_request) (struct sock *sk,
- struct sk_buff *skb);
-
- struct sock * (*syn_recv_sock) (struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst);
-
- int (*remember_stamp) (struct sock *sk);
-
- __u16 net_header_len;
-
- int (*setsockopt) (struct sock *sk,
- int level,
- int optname,
- char __user *optval,
- int optlen);
-
- int (*getsockopt) (struct sock *sk,
- int level,
- int optname,
- char __user *optval,
- int __user *optlen);
-
-
- void (*addr2sockaddr) (struct sock *sk,
- struct sockaddr *);
-
- int sockaddr_len;
-};
-
-/*
* The next routines deal with comparing 32 bit unsigned ints
* and worry about wraparound (automatic with unsigned arithmetic).
*/
@@ -334,6 +287,9 @@ extern int tcp_rcv_established(struct sock *sk,
extern void tcp_rcv_space_adjust(struct sock *sk);
+extern int tcp_twsk_unique(struct sock *sk,
+ struct sock *sktw, void *twp);
+
static inline void tcp_dec_quickack_mode(struct sock *sk,
const unsigned int pkts)
{
@@ -405,8 +361,7 @@ extern void tcp_parse_options(struct sk_buff *skb,
* TCP v4 functions exported for the inet6 API
*/
-extern void tcp_v4_send_check(struct sock *sk,
- struct tcphdr *th, int len,
+extern void tcp_v4_send_check(struct sock *sk, int len,
struct sk_buff *skb);
extern int tcp_v4_conn_request(struct sock *sk,
@@ -490,34 +445,16 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
-/* Initialize RCV_MSS value.
- * RCV_MSS is an our guess about MSS used by the peer.
- * We haven't any direct information about the MSS.
- * It's better to underestimate the RCV_MSS rather than overestimate.
- * Overestimations make us ACKing less frequently than needed.
- * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
- */
+extern void tcp_initialize_rcv_mss(struct sock *sk);
-static inline void tcp_initialize_rcv_mss(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
-
- hint = min(hint, tp->rcv_wnd/2);
- hint = min(hint, TCP_MIN_RCVMSS);
- hint = max(hint, TCP_MIN_MSS);
-
- inet_csk(sk)->icsk_ack.rcv_mss = hint;
-}
-
-static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
+static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
-static __inline__ void tcp_fast_path_on(struct tcp_sock *tp)
+static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}
@@ -535,7 +472,7 @@ static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp)
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
*/
-static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp)
+static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
@@ -707,6 +644,7 @@ extern void tcp_cleanup_congestion_control(struct sock *sk);
extern int tcp_set_default_congestion_control(const char *name);
extern void tcp_get_default_congestion_control(char *name);
extern int tcp_set_congestion_control(struct sock *sk, const char *name);
+extern void tcp_slow_start(struct tcp_sock *tp);
extern struct tcp_congestion_ops tcp_init_congestion_ops;
extern u32 tcp_reno_ssthresh(struct sock *sk);
@@ -746,7 +684,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
* "Packets left network, but not honestly ACKed yet" PLUS
* "Packets fast retransmitted"
*/
-static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
+static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
return (tp->packets_out - tp->left_out + tp->retrans_out);
}
@@ -766,33 +704,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
(tp->snd_cwnd >> 2)));
}
-/*
- * Linear increase during slow start
- */
-static inline void tcp_slow_start(struct tcp_sock *tp)
-{
- if (sysctl_tcp_abc) {
- /* RFC3465: Slow Start
- * TCP sender SHOULD increase cwnd by the number of
- * previously unacknowledged bytes ACKed by each incoming
- * acknowledgment, provided the increase is not more than L
- */
- if (tp->bytes_acked < tp->mss_cache)
- return;
-
- /* We MAY increase by 2 if discovered delayed ack */
- if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- }
- tp->bytes_acked = 0;
-
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
-}
-
-
static inline void tcp_sync_left_out(struct tcp_sock *tp)
{
if (tp->rx_opt.sack_ok &&
@@ -801,34 +712,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp)
tp->left_out = tp->sacked_out + tp->lost_out;
}
-/* Set slow start threshold and cwnd not falling to slow start */
-static inline void __tcp_enter_cwr(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->undo_marker = 0;
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + 1U);
- tp->snd_cwnd_cnt = 0;
- tp->high_seq = tp->snd_nxt;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- TCP_ECN_queue_cwr(tp);
-}
-
-static inline void tcp_enter_cwr(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->prior_ssthresh = 0;
- tp->bytes_acked = 0;
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
- __tcp_enter_cwr(sk);
- tcp_set_ca_state(sk, TCP_CA_CWR);
- }
-}
-
+extern void tcp_enter_cwr(struct sock *sk);
extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
/* Slow start with delack produces 3 packets of burst, so that
@@ -860,14 +744,14 @@ static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
return left <= tcp_max_burst(tp);
}
-static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
- const struct sk_buff *skb)
+static inline void tcp_minshall_update(struct tcp_sock *tp, int mss,
+ const struct sk_buff *skb)
{
if (skb->len < mss)
tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
-static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
+static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (!tp->packets_out && !icsk->icsk_pending)
@@ -875,18 +759,18 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t
icsk->icsk_rto, TCP_RTO_MAX);
}
-static __inline__ void tcp_push_pending_frames(struct sock *sk,
- struct tcp_sock *tp)
+static inline void tcp_push_pending_frames(struct sock *sk,
+ struct tcp_sock *tp)
{
__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
}
-static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
+static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
{
tp->snd_wl1 = seq;
}
-static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
+static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
{
tp->snd_wl1 = seq;
}
@@ -894,19 +778,19 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
/*
* Calculate(/check) TCP checksum
*/
-static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
- unsigned long saddr, unsigned long daddr,
- unsigned long base)
+static inline u16 tcp_v4_check(struct tcphdr *th, int len,
+ unsigned long saddr, unsigned long daddr,
+ unsigned long base)
{
return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
-static __inline__ int __tcp_checksum_complete(struct sk_buff *skb)
+static inline int __tcp_checksum_complete(struct sk_buff *skb)
{
return __skb_checksum_complete(skb);
}
-static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
+static inline int tcp_checksum_complete(struct sk_buff *skb)
{
return skb->ip_summed != CHECKSUM_UNNECESSARY &&
__tcp_checksum_complete(skb);
@@ -914,7 +798,7 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
/* Prequeue for VJ style copy to user, combined with checksumming. */
-static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
+static inline void tcp_prequeue_init(struct tcp_sock *tp)
{
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
@@ -930,7 +814,7 @@ static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
*
* NOTE: is this not too big to inline?
*/
-static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -971,7 +855,7 @@ static const char *statename[]={
};
#endif
-static __inline__ void tcp_set_state(struct sock *sk, int state)
+static inline void tcp_set_state(struct sock *sk, int state)
{
int oldstate = sk->sk_state;
@@ -1005,7 +889,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state)
#endif
}
-static __inline__ void tcp_done(struct sock *sk)
+static inline void tcp_done(struct sock *sk)
{
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
@@ -1018,81 +902,13 @@ static __inline__ void tcp_done(struct sock *sk)
inet_csk_destroy_sock(sk);
}
-static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt)
+static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
{
rx_opt->dsack = 0;
rx_opt->eff_sacks = 0;
rx_opt->num_sacks = 0;
}
-static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp)
-{
- if (tp->rx_opt.tstamp_ok) {
- *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_TIMESTAMP << 8) |
- TCPOLEN_TIMESTAMP);
- *ptr++ = htonl(tstamp);
- *ptr++ = htonl(tp->rx_opt.ts_recent);
- }
- if (tp->rx_opt.eff_sacks) {
- struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
- int this_sack;
-
- *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_SACK << 8) |
- (TCPOLEN_SACK_BASE +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)));
- for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
- *ptr++ = htonl(sp[this_sack].start_seq);
- *ptr++ = htonl(sp[this_sack].end_seq);
- }
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks--;
- }
- }
-}
-
-/* Construct a tcp options header for a SYN or SYN_ACK packet.
- * If this is every changed make sure to change the definition of
- * MAX_SYN_SIZE to match the new maximum number of options that you
- * can generate.
- */
-static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
- int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent)
-{
- /* We always get an MSS option.
- * The option bytes which will be seen in normal data
- * packets should timestamps be used, must be in the MSS
- * advertised. But we subtract them from tp->mss_cache so
- * that calculations in tcp_sendmsg are simpler etc.
- * So account for this fact here if necessary. If we
- * don't do this correctly, as a receiver we won't
- * recognize data packets as being full sized when we
- * should, and thus we won't abide by the delayed ACK
- * rules correctly.
- * SACKs don't matter, we never delay an ACK when we
- * have any of those going out.
- */
- *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
- if (ts) {
- if(sack)
- *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
- (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
- else
- *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
- (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
- *ptr++ = htonl(tstamp); /* TSVAL */
- *ptr++ = htonl(ts_recent); /* TSECR */
- } else if(sack)
- *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
- (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
- if (offer_wscale)
- *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
-}
-
/* Determine a window scaling and initial window to offer. */
extern void tcp_select_initial_window(int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
@@ -1117,9 +933,9 @@ static inline int tcp_full_space(const struct sock *sk)
return tcp_win_from_space(sk->sk_rcvbuf);
}
-static __inline__ void tcp_openreq_init(struct request_sock *req,
- struct tcp_options_received *rx_opt,
- struct sk_buff *skb)
+static inline void tcp_openreq_init(struct request_sock *req,
+ struct tcp_options_received *rx_opt,
+ struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index b9d4176b2d15..b0b645988bd8 100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -31,4 +31,20 @@ enum {
#define TCP_STATE_MASK 0xF
+#define TCP_ACTION_FIN (1 << 7)
+
+enum {
+ TCPF_ESTABLISHED = (1 << 1),
+ TCPF_SYN_SENT = (1 << 2),
+ TCPF_SYN_RECV = (1 << 3),
+ TCPF_FIN_WAIT1 = (1 << 4),
+ TCPF_FIN_WAIT2 = (1 << 5),
+ TCPF_TIME_WAIT = (1 << 6),
+ TCPF_CLOSE = (1 << 7),
+ TCPF_CLOSE_WAIT = (1 << 8),
+ TCPF_LAST_ACK = (1 << 9),
+ TCPF_LISTEN = (1 << 10),
+ TCPF_CLOSING = (1 << 11)
+};
+
#endif /* _LINUX_TCP_STATES_H */
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
new file mode 100644
index 000000000000..2544281e1d5e
--- /dev/null
+++ b/include/net/timewait_sock.h
@@ -0,0 +1,31 @@
+/*
+ * NET Generic infrastructure for Network protocols.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _TIMEWAIT_SOCK_H
+#define _TIMEWAIT_SOCK_H
+
+#include <linux/slab.h>
+#include <net/sock.h>
+
+struct timewait_sock_ops {
+ kmem_cache_t *twsk_slab;
+ unsigned int twsk_obj_size;
+ int (*twsk_unique)(struct sock *sk,
+ struct sock *sktw, void *twp);
+};
+
+static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+ if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
+ return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
+ return 0;
+}
+
+#endif /* _TIMEWAIT_SOCK_H */
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index 4e86f2de6638..61f724c1036f 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -44,7 +44,7 @@ extern int datagram_send_ctl(struct msghdr *msg,
/*
* address family specific functions
*/
-extern struct tcp_func ipv4_specific;
+extern struct inet_connection_sock_af_ops ipv4_specific;
extern int inet6_destroy_sock(struct sock *sk);
diff --git a/include/net/udp.h b/include/net/udp.h
index 107b9d791a1f..766fba1369ce 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -22,9 +22,8 @@
#ifndef _UDP_H
#define _UDP_H
-#include <linux/udp.h>
-#include <linux/ip.h>
#include <linux/list.h>
+#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <linux/seq_file.h>
@@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num)
extern struct proto udp_prot;
+struct sk_buff;
extern void udp_err(struct sk_buff *, u32);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1cdb87912137..07d7b50cdd76 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2,11 +2,12 @@
#define _NET_XFRM_H
#include <linux/compiler.h>
+#include <linux/in.h>
#include <linux/xfrm.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include <linux/socket.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
#include <linux/in6.h>
@@ -144,6 +145,9 @@ struct xfrm_state
* transformer. */
struct xfrm_type *type;
+ /* Security context */
+ struct xfrm_sec_ctx *security;
+
/* Private data of this transformer, format is opaque,
* interpreted by xfrm_type methods. */
void *data;
@@ -298,6 +302,7 @@ struct xfrm_policy
__u8 flags;
__u8 dead;
__u8 xfrm_nr;
+ struct xfrm_sec_ctx *security;
struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
};
@@ -510,6 +515,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
return 0;
}
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+/* If neither has a context --> match
+ * Otherwise, both must have a context and the sids, doi, alg must match
+ */
+static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
+{
+ return ((!s1 && !s2) ||
+ (s1 && s2 &&
+ (s1->ctx_sid == s2->ctx_sid) &&
+ (s1->ctx_doi == s2->ctx_doi) &&
+ (s1->ctx_alg == s2->ctx_alg)));
+}
+#else
+static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
+{
+ return 1;
+}
+#endif
+
/* A struct encoding bundle of transformations to apply to some set of flow.
*
* dst->child points to the next element of bundle.
@@ -878,8 +902,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig
struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
- int delete);
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx, int delete);
struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete);
void xfrm_policy_flush(void);
u32 xfrm_get_acqseq(void);
diff --git a/init/main.c b/init/main.c
index 27f97f9b4636..54aaf561cf66 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,7 +47,6 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
-#include <net/sock.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -614,9 +613,6 @@ static void __init do_basic_setup(void)
sysctl_init();
#endif
- /* Networking initialization needs a process context */
- sock_init();
-
do_initcalls();
}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7982656b9c83..a5144e43aae1 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -63,7 +63,7 @@
#include <linux/atalk.h>
struct datalink_proto *ddp_dl, *aarp_dl;
-static struct proto_ops atalk_dgram_ops;
+static const struct proto_ops atalk_dgram_ops;
/**************************************************************************\
* *
@@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
*/
static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- int rc = -EINVAL;
+ int rc = -ENOIOCTLCMD;
struct sock *sk = sock->sk;
void __user *argp = (void __user *)arg;
@@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
rc = atif_ioctl(cmd, argp);
rtnl_unlock();
break;
- /* Physical layer ioctl calls */
- case SIOCSIFLINK:
- case SIOCGIFHWADDR:
- case SIOCSIFHWADDR:
- case SIOCGIFFLAGS:
- case SIOCSIFFLAGS:
- case SIOCGIFTXQLEN:
- case SIOCSIFTXQLEN:
- case SIOCGIFMTU:
- case SIOCGIFCONF:
- case SIOCADDMULTI:
- case SIOCDELMULTI:
- case SIOCGIFCOUNT:
- case SIOCGIFINDEX:
- case SIOCGIFNAME:
- rc = dev_ioctl(cmd, argp);
- break;
}
return rc;
@@ -1841,7 +1824,7 @@ static struct net_proto_family atalk_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
.family = PF_APPLETALK,
.owner = THIS_MODULE,
.release = atalk_release,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2684a92da22b..f2c541774dcd 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
}
-static struct proto_ops pvc_proto_ops = {
+static const struct proto_ops pvc_proto_ops = {
.family = PF_ATMPVC,
.owner = THIS_MODULE,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d7b266136bf6..3a180cfd7b48 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return error;
}
-static struct proto_ops svc_proto_ops = {
+static const struct proto_ops svc_proto_ops = {
.family = PF_ATMSVC,
.owner = THIS_MODULE,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 1b683f302657..e8753c7fcad1 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -54,7 +54,7 @@
HLIST_HEAD(ax25_list);
DEFINE_SPINLOCK(ax25_list_lock);
-static struct proto_ops ax25_proto_ops;
+static const struct proto_ops ax25_proto_ops;
static void ax25_free_sock(struct sock *sk)
{
@@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
default:
- res = dev_ioctl(cmd, argp);
+ res = -ENOIOCTLCMD;
break;
}
release_sock(sk);
@@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops ax25_proto_ops = {
+static const struct proto_ops ax25_proto_ops = {
.family = PF_AX25,
.owner = THIS_MODULE,
.release = ax25_release,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ea616e3fc98e..fb031fe9be9e 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
timeo = schedule_timeout(timeo);
lock_sock(sk);
- if (sk->sk_err) {
- err = sock_error(sk);
+ err = sock_error(sk);
+ if (err)
break;
- }
}
set_current_state(TASK_RUNNING);
remove_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 9778c6acd53b..ccbaf69afc5b 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return 0;
}
-static struct proto_ops bnep_sock_ops = {
+static const struct proto_ops bnep_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = bnep_sock_release,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index beb045bf5714..5e22343b6090 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
-static struct proto_ops cmtp_sock_ops = {
+static const struct proto_ops cmtp_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = cmtp_sock_release,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1d6d0a15c099..84e6c93a044a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
return 0;
}
-static struct proto_ops hci_sock_ops = {
+static const struct proto_ops hci_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = hci_sock_release,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index f8986f881431..8f8dd931b294 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
-static struct proto_ops hidp_sock_ops = {
+static const struct proto_ops hidp_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = hidp_sock_release,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index e3bb11ca4235..7f0781e4326f 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -57,7 +57,7 @@
#define VERSION "2.8"
-static struct proto_ops l2cap_sock_ops;
+static const struct proto_ops l2cap_sock_ops;
static struct bt_sock_list l2cap_sk_list = {
.lock = RW_LOCK_UNLOCKED
@@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
BT_DBG("sock %p, sk %p", sock, sk);
- if (sk->sk_err)
- return sock_error(sk);
+ err = sock_error(sk);
+ if (err)
+ return err;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -2160,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
-static struct proto_ops l2cap_sock_ops = {
+static const struct proto_ops l2cap_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = l2cap_sock_release,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 6c34261b232e..757d2dd3b02f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -58,7 +58,7 @@
#define BT_DBG(D...)
#endif
-static struct proto_ops rfcomm_sock_ops;
+static const struct proto_ops rfcomm_sock_ops;
static struct bt_sock_list rfcomm_sk_list = {
.lock = RW_LOCK_UNLOCKED
@@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
-static struct proto_ops rfcomm_sock_ops = {
+static const struct proto_ops rfcomm_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = rfcomm_sock_release,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 9cb00dc6c08c..6b61323ce23c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -56,7 +56,7 @@
#define VERSION "0.5"
-static struct proto_ops sco_sock_ops;
+static const struct proto_ops sco_sock_ops;
static struct bt_sock_list sco_sk_list = {
.lock = RW_LOCK_UNLOCKED
@@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
BT_DBG("sock %p, sk %p", sock, sk);
- if (sk->sk_err)
- return sock_error(sk);
+ err = sock_error(sk);
+ if (err)
+ return err;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf)
static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
-static struct proto_ops sco_sock_ops = {
+static const struct proto_ops sco_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = sco_sock_release,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index f8f184942aaf..188cc1ac49eb 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook);
module_init(br_init)
module_exit(br_deinit)
MODULE_LICENSE("GPL");
+MODULE_VERSION(BR_VERSION);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f564ee99782d..0b33a7b3a00c 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -15,7 +15,9 @@
#include <linux/kernel.h>
#include <linux/netdevice.h>
-#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+
#include <asm/uaccess.h>
#include "br_private.h"
@@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+/* Allow setting mac address of pseudo-bridge to be same as
+ * any of the bound interfaces
+ */
+static int br_set_mac_address(struct net_device *dev, void *p)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct sockaddr *addr = p;
+ struct net_bridge_port *port;
+ int err = -EADDRNOTAVAIL;
+
+ spin_lock_bh(&br->lock);
+ list_for_each_entry(port, &br->port_list, list) {
+ if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) {
+ br_stp_change_bridge_id(br, addr->sa_data);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock_bh(&br->lock);
+
+ return err;
+}
+
+static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+ strcpy(info->driver, "bridge");
+ strcpy(info->version, BR_VERSION);
+ strcpy(info->fw_version, "N/A");
+ strcpy(info->bus_info, "N/A");
+}
+
+static int br_set_sg(struct net_device *dev, u32 data)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (data)
+ br->feature_mask |= NETIF_F_SG;
+ else
+ br->feature_mask &= ~NETIF_F_SG;
+
+ br_features_recompute(br);
+ return 0;
+}
+
+static int br_set_tso(struct net_device *dev, u32 data)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (data)
+ br->feature_mask |= NETIF_F_TSO;
+ else
+ br->feature_mask &= ~NETIF_F_TSO;
+
+ br_features_recompute(br);
+ return 0;
+}
+
+static int br_set_tx_csum(struct net_device *dev, u32 data)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (data)
+ br->feature_mask |= NETIF_F_IP_CSUM;
+ else
+ br->feature_mask &= ~NETIF_F_IP_CSUM;
+
+ br_features_recompute(br);
+ return 0;
+}
+
+static struct ethtool_ops br_ethtool_ops = {
+ .get_drvinfo = br_getinfo,
+ .get_link = ethtool_op_get_link,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = br_set_sg,
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = br_set_tx_csum,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = br_set_tso,
+};
+
void br_dev_setup(struct net_device *dev)
{
memset(dev->dev_addr, 0, ETH_ALEN);
@@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev)
dev->change_mtu = br_change_mtu;
dev->destructor = free_netdev;
SET_MODULE_OWNER(dev);
+ SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
dev->stop = br_dev_stop;
dev->tx_queue_len = 0;
- dev->set_mac_address = NULL;
+ dev->set_mac_address = br_set_mac_address;
dev->priv_flags = IFF_EBRIDGE;
+
+ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
+ | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 975abe254b7a..11321197338e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -32,9 +32,8 @@
* ethtool, use ethtool_ops. Also, since driver might sleep need to
* not be holding any locks.
*/
-static int br_initial_port_cost(struct net_device *dev)
+static int port_cost(struct net_device *dev)
{
-
struct ethtool_cmd ecmd = { ETHTOOL_GSET };
struct ifreq ifr;
mm_segment_t old_fs;
@@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev)
return 2;
case SPEED_10:
return 100;
- default:
- pr_info("bridge: can't decode speed from %s: %d\n",
- dev->name, ecmd.speed);
- return 100;
}
}
@@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev)
return 100; /* assume old 10Mbps */
}
+
+/*
+ * Check for port carrier transistions.
+ * Called from work queue to allow for calling functions that
+ * might sleep (such as speed check), and to debounce.
+ */
+static void port_carrier_check(void *arg)
+{
+ struct net_bridge_port *p = arg;
+
+ rtnl_lock();
+ if (netif_carrier_ok(p->dev)) {
+ u32 cost = port_cost(p->dev);
+
+ spin_lock_bh(&p->br->lock);
+ if (p->state == BR_STATE_DISABLED) {
+ p->path_cost = cost;
+ br_stp_enable_port(p);
+ }
+ spin_unlock_bh(&p->br->lock);
+ } else {
+ spin_lock_bh(&p->br->lock);
+ if (p->state != BR_STATE_DISABLED)
+ br_stp_disable_port(p);
+ spin_unlock_bh(&p->br->lock);
+ }
+ rtnl_unlock();
+}
+
static void destroy_nbp(struct net_bridge_port *p)
{
struct net_device *dev = p->dev;
@@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p)
dev->br_port = NULL;
dev_set_promiscuity(dev, -1);
+ cancel_delayed_work(&p->carrier_check);
+ flush_scheduled_work();
+
spin_lock_bh(&br->lock);
br_stp_disable_port(p);
spin_unlock_bh(&br->lock);
@@ -155,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name)
br->bridge_id.prio[1] = 0x00;
memset(br->bridge_id.addr, 0, ETH_ALEN);
+ br->feature_mask = dev->features;
br->stp_enabled = 0;
br->designated_root = br->bridge_id;
br->root_path_cost = 0;
@@ -195,10 +223,9 @@ static int find_portno(struct net_bridge *br)
return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}
-/* called with RTNL */
+/* called with RTNL but without bridge lock */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
- struct net_device *dev,
- unsigned long cost)
+ struct net_device *dev)
{
int index;
struct net_bridge_port *p;
@@ -215,12 +242,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
p->br = br;
dev_hold(dev);
p->dev = dev;
- p->path_cost = cost;
+ p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
dev->br_port = p;
p->port_no = index;
br_init_port(p);
p->state = BR_STATE_DISABLED;
+ INIT_WORK(&p->carrier_check, port_carrier_check, p);
kobject_init(&p->kobj);
return p;
@@ -322,9 +350,8 @@ void br_features_recompute(struct net_bridge *br)
struct net_bridge_port *p;
unsigned long features, checksum;
- features = NETIF_F_SG | NETIF_F_FRAGLIST
- | NETIF_F_HIGHDMA | NETIF_F_TSO;
- checksum = NETIF_F_IP_CSUM; /* least commmon subset */
+ features = br->feature_mask &~ NETIF_F_IP_CSUM;
+ checksum = br->feature_mask & NETIF_F_IP_CSUM;
list_for_each_entry(p, &br->port_list, list) {
if (!(p->dev->features
@@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (dev->br_port != NULL)
return -EBUSY;
- if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev))))
+ if (IS_ERR(p = new_nbp(br, dev)))
return PTR_ERR(p);
if ((err = br_fdb_insert(br, p, dev->dev_addr)))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index b88220a64cd8..c387852f753a 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb)
/* insert into forwarding database after filtering to avoid spoofing */
br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+ if (p->state == BR_STATE_LEARNING) {
+ kfree_skb(skb);
+ goto out;
+ }
+
if (br->dev->flags & IFF_PROMISC) {
struct sk_buff *skb2;
@@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto err;
- if (p->state == BR_STATE_LEARNING)
- br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
-
if (p->br->stp_enabled &&
!memcmp(dest, bridge_ula, 5) &&
!(dest[5] & 0xF0)) {
@@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
NULL, br_stp_handle_bpdu);
return 1;
}
+ goto err;
}
- else if (p->state == BR_STATE_FORWARDING) {
+ if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
if (br_should_route_hook) {
if (br_should_route_hook(pskb))
return 0;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 23422bd53a5e..223f8270daee 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -26,6 +26,7 @@
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/netfilter_bridge.h>
@@ -33,8 +34,11 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
+
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/route.h>
+
#include <asm/uaccess.h>
#include <asm/checksum.h>
#include "br_private.h"
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 917311c6828b..a43a9c1d50d7 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
br_stp_recalculate_bridge_id(br);
break;
- case NETDEV_CHANGE: /* device is up but carrier changed */
- if (!(br->dev->flags & IFF_UP))
- break;
-
- if (netif_carrier_ok(dev)) {
- if (p->state == BR_STATE_DISABLED)
- br_stp_enable_port(p);
- } else {
- if (p->state != BR_STATE_DISABLED)
- br_stp_disable_port(p);
- }
+ case NETDEV_CHANGE:
+ if (br->dev->flags & IFF_UP)
+ schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
break;
case NETDEV_FEAT_CHANGE:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index bdf95a74d8cd..c5bd631ffcd5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -27,6 +27,10 @@
#define BR_PORT_BITS 10
#define BR_MAX_PORTS (1<<BR_PORT_BITS)
+#define BR_PORT_DEBOUNCE (HZ/10)
+
+#define BR_VERSION "2.1"
+
typedef struct bridge_id bridge_id;
typedef struct mac_addr mac_addr;
typedef __u16 port_id;
@@ -78,6 +82,7 @@ struct net_bridge_port
struct timer_list hold_timer;
struct timer_list message_age_timer;
struct kobject kobj;
+ struct work_struct carrier_check;
struct rcu_head rcu;
};
@@ -90,6 +95,7 @@ struct net_bridge
spinlock_t hash_lock;
struct hlist_head hash[BR_HASH_SIZE];
struct list_head age_list;
+ unsigned long feature_mask;
/* STP */
bridge_id designated_root;
@@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br);
extern void br_stp_enable_port(struct net_bridge_port *p);
extern void br_stp_disable_port(struct net_bridge_port *p);
extern void br_stp_recalculate_bridge_id(struct net_bridge *br);
+extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
extern void br_stp_set_bridge_priority(struct net_bridge *br,
u16 newprio);
extern void br_stp_set_port_priority(struct net_bridge_port *p,
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index ac09b6a23523..cc047f7fb6ef 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p)
}
/* called under bridge lock */
-static void br_stp_change_bridge_id(struct net_bridge *br,
- const unsigned char *addr)
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
{
unsigned char oldaddr[6];
struct net_bridge_port *p;
@@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
list_for_each_entry(p, &br->port_list, list) {
if (addr == br_mac_zero ||
- compare_ether_addr(p->dev->dev_addr, addr) < 0)
+ memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
addr = p->dev->dev_addr;
}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c70b3be23026..b84fc6075fe1 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG
To compile it as a module, choose M here. If unsure, say N.
config BRIDGE_EBT_ULOG
- tristate "ebt: ulog support"
+ tristate "ebt: ulog support (OBSOLETE)"
depends on BRIDGE_NF_EBTABLES
help
+ This option enables the old bridge-specific "ebt_ulog" implementation
+ which has been obsoleted by the new "nfnetlink_log" code (see
+ CONFIG_NETFILTER_NETLINK_LOG).
+
This option adds the ulog watcher, that you can use in any rule
in any ebtables table. The packet is passed to a userspace
logging daemon using netlink multicast sockets. This differs
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 662975be3d1d..9f6e0193ae10 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -3,13 +3,16 @@
*
* Authors:
* Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
*
* April, 2002
*
*/
+#include <linux/in.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/if_arp.h>
@@ -55,27 +58,30 @@ static void print_MAC(unsigned char *p)
}
#define myNIPQUAD(a) a[0], a[1], a[2], a[3]
-static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static void
+ebt_log_packet(unsigned int pf, unsigned int hooknum,
+ const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct nf_loginfo *loginfo,
+ const char *prefix)
{
- struct ebt_log_info *info = (struct ebt_log_info *)data;
- char level_string[4] = "< >";
+ unsigned int bitmask;
- level_string[1] = '0' + info->loglevel;
spin_lock_bh(&ebt_log_lock);
- printk(level_string);
- printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "",
- out ? out->name : "");
+ printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level,
+ prefix, in ? in->name : "", out ? out->name : "");
- printk("MAC source = ");
print_MAC(eth_hdr(skb)->h_source);
printk("MAC dest = ");
print_MAC(eth_hdr(skb)->h_dest);
printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto));
- if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
+ if (loginfo->type == NF_LOG_TYPE_LOG)
+ bitmask = loginfo->u.log.logflags;
+ else
+ bitmask = NF_LOG_MASK;
+
+ if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
htons(ETH_P_IP)){
struct iphdr _iph, *ih;
@@ -84,10 +90,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
printk(" INCOMPLETE IP header");
goto out;
}
- printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
- NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
- printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
- ih->protocol);
+ printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP "
+ "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr), ih->tos, ih->protocol);
if (ih->protocol == IPPROTO_TCP ||
ih->protocol == IPPROTO_UDP) {
struct tcpudphdr _ports, *pptr;
@@ -104,7 +109,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
goto out;
}
- if ((info->bitmask & EBT_LOG_ARP) &&
+ if ((bitmask & EBT_LOG_ARP) &&
((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
(eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
struct arphdr _arph, *ah;
@@ -144,6 +149,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
out:
printk("\n");
spin_unlock_bh(&ebt_log_lock);
+
+}
+
+static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
+ const struct net_device *in, const struct net_device *out,
+ const void *data, unsigned int datalen)
+{
+ struct ebt_log_info *info = (struct ebt_log_info *)data;
+ struct nf_loginfo li;
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = info->loglevel;
+ li.u.log.logflags = info->bitmask;
+
+ nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix);
}
static struct ebt_watcher log =
@@ -154,13 +174,32 @@ static struct ebt_watcher log =
.me = THIS_MODULE,
};
+static struct nf_logger ebt_log_logger = {
+ .name = "ebt_log",
+ .logfn = &ebt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
- return ebt_register_watcher(&log);
+ int ret;
+
+ ret = ebt_register_watcher(&log);
+ if (ret < 0)
+ return ret;
+ if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) {
+ printk(KERN_WARNING "ebt_log: not logging via system console "
+ "since somebody else already registered for PF_INET\n");
+ /* we cannot make module load fail here, since otherwise
+ * ebtables userspace would abort */
+ }
+
+ return 0;
}
static void __exit fini(void)
{
+ nf_log_unregister_logger(&ebt_log_logger);
ebt_unregister_watcher(&log);
}
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index aae26ae2e61f..ce617b3dbbb8 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -3,6 +3,7 @@
*
* Authors:
* Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
*
* November, 2004
*
@@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
return skb;
}
-static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+ const struct ebt_ulog_info *uloginfo, const char *prefix)
{
ebt_ulog_packet_msg_t *pm;
size_t size, copy_len;
struct nlmsghdr *nlh;
- struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
unsigned int group = uloginfo->nlgroup;
ebt_ulog_buff_t *ub = &ulog_buffers[group];
spinlock_t *lock = &ub->lock;
@@ -216,6 +216,39 @@ alloc_failure:
goto unlock;
}
+/* this function is registered with the netfilter core */
+static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
+ const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct nf_loginfo *li,
+ const char *prefix)
+{
+ struct ebt_ulog_info loginfo;
+
+ if (!li || li->type != NF_LOG_TYPE_ULOG) {
+ loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
+ loginfo.cprange = 0;
+ loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
+ loginfo.prefix[0] = '\0';
+ } else {
+ loginfo.nlgroup = li->u.ulog.group;
+ loginfo.cprange = li->u.ulog.copy_len;
+ loginfo.qthreshold = li->u.ulog.qthreshold;
+ strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+ }
+
+ ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+ const struct net_device *in, const struct net_device *out,
+ const void *data, unsigned int datalen)
+{
+ struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
+
+ ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
+}
+
+
static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
const struct ebt_entry *e, void *data, unsigned int datalen)
{
@@ -240,6 +273,12 @@ static struct ebt_watcher ulog = {
.me = THIS_MODULE,
};
+static struct nf_logger ebt_ulog_logger = {
+ .name = EBT_ULOG_WATCHER,
+ .logfn = &ebt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
int i, ret = 0;
@@ -265,6 +304,13 @@ static int __init init(void)
else if ((ret = ebt_register_watcher(&ulog)))
sock_release(ebtulognl->sk_socket);
+ if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) {
+ printk(KERN_WARNING "ebt_ulog: not logging via ulog "
+ "since somebody else already registered for PF_BRIDGE\n");
+ /* we cannot make module load fail here, since otherwise
+ * ebtables userspace would abort */
+ }
+
return ret;
}
@@ -273,6 +319,7 @@ static void __exit fini(void)
ebt_ulog_buff_t *ub;
int i;
+ nf_log_unregister_logger(&ebt_ulog_logger);
ebt_unregister_watcher(&ulog);
for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
ub = &ulog_buffers[i];
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef51ac58..f8d322e1ea92 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -47,6 +47,7 @@
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
+#include <linux/spinlock.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
}
/**
+ * skb_kill_datagram - Free a datagram skbuff forcibly
+ * @sk: socket
+ * @skb: datagram skbuff
+ * @flags: MSG_ flags
+ *
+ * This function frees a datagram skbuff that was received by
+ * skb_recv_datagram. The flags argument must match the one
+ * used for skb_recv_datagram.
+ *
+ * If the MSG_PEEK flag is set, and the packet is still on the
+ * receive queue of the socket, it will be taken off the queue
+ * before it is freed.
+ *
+ * This function currently only disables BH when acquiring the
+ * sk_receive_queue lock. Therefore it must not be used in a
+ * context where that lock is acquired in an IRQ context.
+ */
+
+void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+ if (flags & MSG_PEEK) {
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ kfree_skb(skb);
+}
+
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
* skb_copy_datagram_iovec - Copy a datagram to an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
diff --git a/net/core/dev.c b/net/core/dev.c
index a5efc9ae010b..29ba109d3e54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
-EXPORT_SYMBOL(dev_ioctl);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
diff --git a/net/core/filter.c b/net/core/filter.c
index 3a10e0bc90e8..8964d3445588 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -13,6 +13,7 @@
* 2 of the License, or (at your option) any later version.
*
* Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
*/
#include <linux/module.h>
@@ -250,7 +251,7 @@ load_b:
mem[fentry->k] = X;
continue;
default:
- /* Invalid instruction counts as RET */
+ WARN_ON(1);
return 0;
}
@@ -283,8 +284,8 @@ load_b:
*
* Check the user's filter code. If we let some ugly
* filter code slip through kaboom! The filter must contain
- * no references or jumps that are out of range, no illegal instructions
- * and no backward jumps. It must end with a RET instruction
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
*
* Returns 0 if the rule set is legal or a negative errno code if not.
*/
@@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
for (pc = 0; pc < flen; pc++) {
/* all jumps are forward as they are not signed */
ftest = &filter[pc];
- if (BPF_CLASS(ftest->code) == BPF_JMP) {
- /* but they mustn't jump off the end */
- if (BPF_OP(ftest->code) == BPF_JA) {
- /*
- * Note, the large ftest->k might cause loops.
- * Compare this with conditional jumps below,
- * where offsets are limited. --ANK (981016)
- */
- if (ftest->k >= (unsigned)(flen-pc-1))
- return -EINVAL;
- } else {
- /* for conditionals both must be safe */
- if (pc + ftest->jt +1 >= flen ||
- pc + ftest->jf +1 >= flen)
- return -EINVAL;
- }
- }
- /* check for division by zero -Kris Katterjohn 2005-10-30 */
- if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0)
- return -EINVAL;
+ /* Only allow valid instructions */
+ switch (ftest->code) {
+ case BPF_ALU|BPF_ADD|BPF_K:
+ case BPF_ALU|BPF_ADD|BPF_X:
+ case BPF_ALU|BPF_SUB|BPF_K:
+ case BPF_ALU|BPF_SUB|BPF_X:
+ case BPF_ALU|BPF_MUL|BPF_K:
+ case BPF_ALU|BPF_MUL|BPF_X:
+ case BPF_ALU|BPF_DIV|BPF_X:
+ case BPF_ALU|BPF_AND|BPF_K:
+ case BPF_ALU|BPF_AND|BPF_X:
+ case BPF_ALU|BPF_OR|BPF_K:
+ case BPF_ALU|BPF_OR|BPF_X:
+ case BPF_ALU|BPF_LSH|BPF_K:
+ case BPF_ALU|BPF_LSH|BPF_X:
+ case BPF_ALU|BPF_RSH|BPF_K:
+ case BPF_ALU|BPF_RSH|BPF_X:
+ case BPF_ALU|BPF_NEG:
+ case BPF_LD|BPF_W|BPF_ABS:
+ case BPF_LD|BPF_H|BPF_ABS:
+ case BPF_LD|BPF_B|BPF_ABS:
+ case BPF_LD|BPF_W|BPF_LEN:
+ case BPF_LD|BPF_W|BPF_IND:
+ case BPF_LD|BPF_H|BPF_IND:
+ case BPF_LD|BPF_B|BPF_IND:
+ case BPF_LD|BPF_IMM:
+ case BPF_LDX|BPF_W|BPF_LEN:
+ case BPF_LDX|BPF_B|BPF_MSH:
+ case BPF_LDX|BPF_IMM:
+ case BPF_MISC|BPF_TAX:
+ case BPF_MISC|BPF_TXA:
+ case BPF_RET|BPF_K:
+ case BPF_RET|BPF_A:
+ break;
+
+ /* Some instructions need special checks */
- /* check that memory operations use valid addresses. */
- if (ftest->k >= BPF_MEMWORDS) {
- /* but it might not be a memory operation... */
- switch (ftest->code) {
- case BPF_ST:
- case BPF_STX:
- case BPF_LD|BPF_MEM:
- case BPF_LDX|BPF_MEM:
+ case BPF_ALU|BPF_DIV|BPF_K:
+ /* check for division by zero */
+ if (ftest->k == 0)
return -EINVAL;
- }
+ break;
+
+ case BPF_LD|BPF_MEM:
+ case BPF_LDX|BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* check for invalid memory addresses */
+ if (ftest->k >= BPF_MEMWORDS)
+ return -EINVAL;
+ break;
+
+ case BPF_JMP|BPF_JA:
+ /*
+ * Note, the large ftest->k might cause loops.
+ * Compare this with conditional jumps below,
+ * where offsets are limited. --ANK (981016)
+ */
+ if (ftest->k >= (unsigned)(flen-pc-1))
+ return -EINVAL;
+ break;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ case BPF_JMP|BPF_JGE|BPF_K:
+ case BPF_JMP|BPF_JGE|BPF_X:
+ case BPF_JMP|BPF_JGT|BPF_K:
+ case BPF_JMP|BPF_JGT|BPF_X:
+ case BPF_JMP|BPF_JSET|BPF_K:
+ case BPF_JMP|BPF_JSET|BPF_X:
+ /* for conditionals both must be safe */
+ if (pc + ftest->jt + 1 >= flen ||
+ pc + ftest->jf + 1 >= flen)
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
}
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 7e95b39de9fd..c4f25385029f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -23,6 +23,7 @@
#include <net/flow.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
+#include <linux/security.h>
struct flow_cache_entry {
struct flow_cache_entry *next;
@@ -30,6 +31,7 @@ struct flow_cache_entry {
u8 dir;
struct flowi key;
u32 genid;
+ u32 sk_sid;
void *object;
atomic_t *object_ref;
};
@@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
return 0;
}
-void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
+void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
flow_resolve_t resolver)
{
struct flow_cache_entry *fle, **head;
@@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
for (fle = *head; fle; fle = fle->next) {
if (fle->family == family &&
fle->dir == dir &&
+ fle->sk_sid == sk_sid &&
flow_key_compare(key, &fle->key) == 0) {
if (fle->genid == atomic_read(&flow_cache_genid)) {
void *ret = fle->object;
@@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
*head = fle;
fle->family = family;
fle->dir = dir;
+ fle->sk_sid = sk_sid;
memcpy(&fle->key, key, sizeof(*key));
fle->object = NULL;
flow_count(cpu)++;
@@ -221,7 +225,7 @@ nocache:
void *obj;
atomic_t *obj_ref;
- resolver(key, family, dir, &obj, &obj_ref);
+ resolver(key, sk_sid, family, dir, &obj, &obj_ref);
if (fle) {
fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 49424a42a2c0..281a632fa6a6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -13,6 +13,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
+#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7fc3e9e28c34..06cad2d63e8a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]);
/* Module parameters, defaults. */
static int pg_count_d = 1000; /* 1000 pkts by default */
-static int pg_delay_d = 0;
-static int pg_clone_skb_d = 0;
-static int debug = 0;
+static int pg_delay_d;
+static int pg_clone_skb_d;
+static int debug;
static DECLARE_MUTEX(pktgen_sem);
static struct pktgen_thread *pktgen_threads = NULL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83fee37de38e..070f91cfde59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int fclone)
{
+ struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
/* Get the HEAD */
- if (fclone)
- skb = kmem_cache_alloc(skbuff_fclone_cache,
- gfp_mask & ~__GFP_DMA);
- else
- skb = kmem_cache_alloc(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
-
+ skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA);
if (!skb)
goto out;
@@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->data = data;
skb->tail = data;
skb->end = data + size;
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ atomic_set(&shinfo->dataref, 1);
+ shinfo->nr_frags = 0;
+ shinfo->tso_size = 0;
+ shinfo->tso_segs = 0;
+ shinfo->ufo_size = 0;
+ shinfo->ip6_frag_id = 0;
+ shinfo->frag_list = NULL;
+
if (fclone) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->tso_size = 0;
- skb_shinfo(skb)->tso_segs = 0;
- skb_shinfo(skb)->frag_list = NULL;
- skb_shinfo(skb)->ufo_size = 0;
- skb_shinfo(skb)->ip6_frag_id = 0;
out:
return skb;
nodata:
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be4f056..6465b0e4c8cb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab)
}
}
- if (prot->twsk_obj_size) {
+ if (prot->twsk_prot != NULL) {
static const char mask[] = "tw_sock_%s";
timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
@@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab)
goto out_free_request_sock_slab;
sprintf(timewait_sock_slab_name, mask, prot->name);
- prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
- prot->twsk_obj_size,
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (prot->twsk_slab == NULL)
+ prot->twsk_prot->twsk_slab =
+ kmem_cache_create(timewait_sock_slab_name,
+ prot->twsk_prot->twsk_obj_size,
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
}
}
@@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot)
prot->rsk_prot->slab = NULL;
}
- if (prot->twsk_slab != NULL) {
- const char *name = kmem_cache_name(prot->twsk_slab);
+ if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+ const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
- kmem_cache_destroy(prot->twsk_slab);
+ kmem_cache_destroy(prot->twsk_prot->twsk_slab);
kfree(name);
- prot->twsk_slab = NULL;
+ prot->twsk_prot->twsk_slab = NULL;
}
}
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03e8024..35e25259fd95 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
int done;
do {
- if (sk->sk_err)
- return sock_error(sk);
+ int err = sock_error(sk);
+ if (err)
+ return err;
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
return -EPIPE;
if (!*timeo_p)
@@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
+ !sk->sk_err &&
!((1 << sk->sk_state) &
~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
finish_wait(sk->sk_sleep, &wait);
@@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) &&
+ sk_wait_event(sk, &current_timeo, !sk->sk_err &&
+ !(sk->sk_shutdown & SEND_SHUTDOWN) &&
+ sk_stream_memory_free(sk) &&
vm_wait);
sk->sk_write_pending--;
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 344a8da153fc..87b27fff6e3b 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,3 +1,7 @@
+obj-$(CONFIG_IPV6) += dccp_ipv6.o
+
+dccp_ipv6-y := ipv6.o
+
obj-$(CONFIG_IP_DCCP) += dccp.o
dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index c9a62cca22fc..ce9cb77c5c29 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
from = av->dccpav_buf + av->dccpav_buf_head;
/* Check if buf_head wraps */
- if (av->dccpav_buf_head + len > av->dccpav_vec_len) {
- const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head);
+ if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) {
+ const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head;
memcpy(to, from, tailsize);
to += tailsize;
@@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
const gfp_t priority)
{
- struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority);
+ struct dccp_ackvec *av;
+ BUG_ON(len == 0);
+
+ if (len > DCCP_MAX_ACKVEC_LEN)
+ return NULL;
+
+ av = kmalloc(sizeof(*av) + len, priority);
if (av != NULL) {
av->dccpav_buf_len = len;
av->dccpav_buf_head =
@@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
}
static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
- const unsigned int index)
+ const u8 index)
{
return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
}
static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
- const unsigned int index)
+ const u8 index)
{
return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
}
@@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
*/
static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
const unsigned int packets,
- const unsigned char state)
+ const unsigned char state)
{
unsigned int gap;
signed long new_head;
@@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
* could reduce the complexity of this scan.)
*/
u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
- unsigned int index = av->dccpav_buf_head;
+ u8 index = av->dccpav_buf_head;
while (1) {
const u8 len = dccp_ackvec_len(av, index);
@@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av)
}
#endif
-static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
+static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
{
/*
* As we're keeping track of the ack vector size (dccpav_vec_len) and
@@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
* draft-ietf-dccp-spec-11.txt Appendix A. -acme
*/
#if 0
- av->dccpav_buf_tail = av->dccpav_ack_ptr + 1;
- if (av->dccpav_buf_tail >= av->dccpav_vec_len)
- av->dccpav_buf_tail -= av->dccpav_vec_len;
+ u32 new_buf_tail = av->dccpav_ack_ptr + 1;
+ if (new_buf_tail >= av->dccpav_vec_len)
+ new_buf_tail -= av->dccpav_vec_len;
+ av->dccpav_buf_tail = new_buf_tail;
#endif
av->dccpav_vec_len -= av->dccpav_sent_len;
}
@@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
debug_prefix, 1,
(unsigned long long)av->dccpav_ack_seqno,
(unsigned long long)av->dccpav_ack_ackno);
- dccp_ackvec_trow_away_ack_record(av);
+ dccp_ackvec_throw_away_ack_record(av);
av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
}
}
@@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
av->dccpav_ack_seqno,
(unsigned long long)
av->dccpav_ack_ackno);
- dccp_ackvec_trow_away_ack_record(av);
+ dccp_ackvec_throw_away_ack_record(av);
}
/*
* If dccpav_ack_seqno was not received, no problem
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index d0fd6c60c574..f7dfb5f67b87 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -54,16 +54,16 @@
* @dccpav_buf - circular buffer of acknowledgeable packets
*/
struct dccp_ackvec {
- unsigned int dccpav_buf_head;
- unsigned int dccpav_buf_tail;
u64 dccpav_buf_ackno;
u64 dccpav_ack_seqno;
u64 dccpav_ack_ackno;
- unsigned int dccpav_ack_ptr;
- unsigned int dccpav_sent_len;
- unsigned int dccpav_vec_len;
- unsigned int dccpav_buf_len;
struct timeval dccpav_time;
+ u8 dccpav_buf_head;
+ u8 dccpav_buf_tail;
+ u8 dccpav_ack_ptr;
+ u8 dccpav_sent_len;
+ u8 dccpav_vec_len;
+ u8 dccpav_buf_len;
u8 dccpav_buf_nonce;
u8 dccpav_ack_nonce;
u8 dccpav_buf[0];
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c37eeeaf5c6e..de681c6ad081 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -21,6 +21,8 @@
#define CCID_MAX 255
+struct tcp_info;
+
struct ccid {
unsigned char ccid_id;
const char *ccid_name;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f97b85d55ad8..93f26dd6e6cb 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
-extern struct proto dccp_v4_prot;
+extern struct proto dccp_prot;
/* is seq1 < seq2 ? */
static inline int before48(const u64 seq1, const u64 seq2)
@@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned len);
+extern int dccp_v4_init_sock(struct sock *sk);
+extern int dccp_v4_destroy_sock(struct sock *sk);
+
extern void dccp_close(struct sock *sk, long timeout);
extern struct sk_buff *dccp_make_response(struct sock *sk,
struct dst_entry *dst,
@@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk,
extern int dccp_connect(struct sock *sk);
extern int dccp_disconnect(struct sock *sk, int flags);
+extern void dccp_unhash(struct sock *sk);
extern int dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
extern int dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
extern void dccp_shutdown(struct sock *sk, int how);
+extern int inet_dccp_listen(struct socket *sock, int backlog);
+extern unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait);
+extern void dccp_v4_send_check(struct sock *sk, int len,
+ struct sk_buff *skb);
+extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len);
extern int dccp_v4_checksum(const struct sk_buff *skb,
const u32 saddr, const u32 daddr);
@@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb,
extern int dccp_v4_send_reset(struct sock *sk,
enum dccp_reset_codes code);
extern void dccp_send_close(struct sock *sk, const int active);
+extern int dccp_invalid_packet(struct sk_buff *skb);
+
+static inline int dccp_bad_service_code(const struct sock *sk,
+ const __u32 service)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dp->dccps_service == service)
+ return 0;
+ return !dccp_list_has_service(dp->dccps_service_list, service);
+}
struct dccp_skb_cb {
__u8 dccpd_type:4;
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index f675d8e642d3..3f78c00e3822 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
info->tcpi_backoff = icsk->icsk_backoff;
- info->tcpi_pmtu = dp->dccps_pmtu_cookie;
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
if (dp->dccps_options.dccpo_send_ack_vector)
info->tcpi_options |= TCPI_OPT_SACK;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3454d5941900..b6cba72b44e8 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
return 0;
}
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned len)
{
struct dccp_sock *dp = dccp_sk(sk);
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- if (dccp_parse_options(sk, skb))
- goto discard;
-
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
-
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
switch (dccp_hdr(skb)->dccph_type) {
case DCCP_PKT_DATAACK:
case DCCP_PKT_DATA:
@@ -250,6 +233,37 @@ discard:
return 0;
}
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ if (dccp_parse_options(sk, skb))
+ goto discard;
+
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_event_ack_recv(sk, skb);
+
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKVEC_STATE_RECEIVED))
+ goto discard;
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+ return __dccp_rcv_established(sk, skb, dh, len);
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_established);
+
static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
@@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
goto out_invalid_packet;
}
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKVEC_STATE_RECEIVED))
+ goto out_invalid_packet; /* FIXME: change error code */
+
dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_update_gsr(sk, dp->dccps_isr);
/*
@@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
goto out_invalid_packet;
}
- dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+ dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
/*
* Step 10: Process REQUEST state (second part)
@@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
dccp_set_state(sk, DCCP_PARTOPEN);
/* Make sure socket is routed, for correct metrics. */
- inet_sk_rebuild_header(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
if (dh->dccph_type == DCCP_PKT_DATAACK ||
dh->dccph_type == DCCP_PKT_DATA) {
- dccp_rcv_established(sk, skb, dh, len);
+ __dccp_rcv_established(sk, skb, dh, len);
queued = 1; /* packet was queued
- (by dccp_rcv_established) */
+ (by __dccp_rcv_established) */
}
break;
}
@@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (sk->sk_state == DCCP_LISTEN) {
if (dh->dccph_type == DCCP_PKT_REQUEST) {
- if (dccp_v4_conn_request(sk, skb) < 0)
+ if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
+ skb) < 0)
return 1;
/* FIXME: do congestion control initialization */
@@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
dccp_event_ack_recv(sk, skb);
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
if (dp->dccps_options.dccpo_send_ack_vector &&
dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
DCCP_SKB_CB(skb)->dccpd_seq,
DCCP_ACKVEC_STATE_RECEIVED))
goto discard;
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
}
/*
@@ -566,3 +587,5 @@ discard:
}
return 0;
}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 656e13e38cfb..3f244670764a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -19,7 +19,9 @@
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
#include <net/sock.h>
+#include <net/timewait_sock.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
@@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
{
- return inet_csk_get_port(&dccp_hashinfo, sk, snum);
+ return inet_csk_get_port(&dccp_hashinfo, sk, snum,
+ inet_csk_bind_conflict);
}
static void dccp_v4_hash(struct sock *sk)
@@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk)
inet_hash(&dccp_hashinfo, sk);
}
-static void dccp_v4_unhash(struct sock *sk)
+void dccp_unhash(struct sock *sk)
{
inet_unhash(&dccp_hashinfo, sk);
}
-/* called with local bh disabled */
-static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
-{
- struct inet_sock *inet = inet_sk(sk);
- const u32 daddr = inet->rcv_saddr;
- const u32 saddr = inet->daddr;
- const int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
- const struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
- goto not_unique;
- }
- tw = NULL;
-
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
- goto not_unique;
- }
+EXPORT_SYMBOL_GPL(dccp_unhash);
- /* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
- sk->sk_hash = hash;
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
-
- if (twp != NULL) {
- *twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw != NULL) {
- /* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, &dccp_death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
- }
-
- return 0;
-
-not_unique:
- write_unlock(&head->lock);
- return -EADDRNOTAVAIL;
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static int dccp_v4_hash_connect(struct sock *sk)
-{
- const unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (snum == 0) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover = net_random() % (high - low) + low;
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- do {
- head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
- dccp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == rover) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__dccp_v4_check_established(sk,
- rover,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
- head, rover);
- if (tb == NULL) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- /* All locks still held and bhs disabled */
- inet_bind_hash(sk, tb, rover);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(rover);
- __inet_hash(&dccp_hashinfo, sk, 0);
- }
- spin_unlock(&head->lock);
-
- if (tw != NULL) {
- inet_twsk_deschedule(tw, &dccp_death_row);
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
- dccp_hashinfo.bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
- __inet_hash(&dccp_hashinfo, sk, 0);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __dccp_v4_check_established(sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
-
-static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len)
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
@@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
inet->dport = usin->sin_port;
inet->daddr = daddr;
- dp->dccps_ext_header_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt != NULL)
- dp->dccps_ext_header_len = inet->opt->optlen;
+ inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
/*
* Socket identity is still unknown (sport may be zero).
* However we set state to DCCP_REQUESTING and not releasing socket
@@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
* complete initialization after this.
*/
dccp_set_state(sk, DCCP_REQUESTING);
- err = dccp_v4_hash_connect(sk);
+ err = inet_hash_connect(&dccp_death_row, sk);
if (err != 0)
goto failure;
@@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
usin->sin_port);
dccp_update_gss(sk, dp->dccps_iss);
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
-
inet->id = dp->dccps_iss ^ jiffies;
err = dccp_connect(sk);
@@ -316,6 +152,8 @@ failure:
goto out;
}
+EXPORT_SYMBOL_GPL(dccp_v4_connect);
+
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
@@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- dp->dccps_pmtu_cookie > mtu) {
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
dccp_sync_mss(sk, mtu);
/*
@@ -606,6 +444,17 @@ out:
sock_put(sk);
}
+/* This routine computes an IPv4 DCCP checksum. */
+void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_send_check);
+
int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
{
struct sk_buff *skb;
@@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk,
dccp_hdr(skb)->dccph_sport);
}
-static inline int dccp_bad_service_code(const struct sock *sk,
- const __u32 service)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- if (dp->dccps_service == service)
- return 0;
- return !dccp_list_has_service(dp->dccps_service_list, service);
-}
-
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
@@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
__u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
- struct dst_entry *dst = NULL;
/* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
if (((struct rtable *)skb->dst)->rt_flags &
@@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
- /* FIXME: Merge Aristeu's option parsing code when ready */
req->rcv_wnd = 100; /* Fake, option parsing will get the
right value */
ireq->opt = NULL;
@@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
dreq->dreq_service = service;
- if (dccp_v4_send_response(sk, req, dst))
+ if (dccp_v4_send_response(sk, req, NULL))
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
return 0;
drop_and_free:
- /*
- * FIXME: should be reqsk_free after implementing req->rsk_ops
- */
- __reqsk_free(req);
+ reqsk_free(req);
drop:
DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
dcb->dccpd_reset_code = reset_code;
return -1;
}
+EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
+
/*
* The three way handshake has completed - we got a valid ACK or DATAACK -
* now create the new socket.
@@ -792,6 +628,8 @@ exit:
return NULL;
}
+EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
+
static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -1011,7 +849,9 @@ discard:
return 0;
}
-static inline int dccp_invalid_packet(struct sk_buff *skb)
+EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
+
+int dccp_invalid_packet(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
@@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb)
return 1;
}
- /* If the header checksum is incorrect, drop packet and return */
- if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
- skb->nh.iph->daddr) < 0) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
- "incorrect\n");
- return 1;
- }
-
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_invalid_packet);
+
/* this is called when real data arrives */
int dccp_v4_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
struct sock *sk;
- int rc;
/* Step 1: Check header basics: */
if (dccp_invalid_packet(skb))
goto discard_it;
+ /* If the header checksum is incorrect, drop packet and return */
+ if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
+ skb->nh.iph->daddr) < 0) {
+ LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n",
+ __FUNCTION__);
+ goto discard_it;
+ }
+
dh = dccp_hdr(skb);
DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
@@ -1143,28 +984,10 @@ int dccp_v4_rcv(struct sk_buff *skb)
goto do_time_wait;
}
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- dccp_pr_debug("xfrm4_policy_check failed\n");
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- }
-
- if (sk_filter(sk, skb, 0)) {
- dccp_pr_debug("sk_filter failed\n");
- goto discard_and_relse;
- }
-
- skb->dev = NULL;
-
- bh_lock_sock(sk);
- rc = 0;
- if (!sock_owned_by_user(sk))
- rc = dccp_v4_do_rcv(sk, skb);
- else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
- sock_put(sk);
- return rc;
+ return sk_receive_skb(sk, skb);
no_dccp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -1194,9 +1017,23 @@ do_time_wait:
goto no_dccp_socket;
}
-static int dccp_v4_init_sock(struct sock *sk)
+struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v4_conn_request,
+ .syn_recv_sock = dccp_v4_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+};
+
+int dccp_v4_init_sock(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
static int dccp_ctl_socket_init = 1;
dccp_options_init(&dp->dccps_options);
@@ -1236,9 +1073,11 @@ static int dccp_v4_init_sock(struct sock *sk)
dccp_ctl_socket_init = 0;
dccp_init_xmit_timers(sk);
- inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+ icsk->icsk_rto = DCCP_TIMEOUT_INIT;
sk->sk_state = DCCP_CLOSED;
sk->sk_write_space = dccp_write_space;
+ icsk->icsk_af_ops = &dccp_ipv4_af_ops;
+ icsk->icsk_sync_mss = dccp_sync_mss;
dp->dccps_mss_cache = 536;
dp->dccps_role = DCCP_ROLE_UNDEFINED;
dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
@@ -1246,7 +1085,9 @@ static int dccp_v4_init_sock(struct sock *sk)
return 0;
}
-static int dccp_v4_destroy_sock(struct sock *sk)
+EXPORT_SYMBOL_GPL(dccp_v4_init_sock);
+
+int dccp_v4_destroy_sock(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -1279,6 +1120,8 @@ static int dccp_v4_destroy_sock(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock);
+
static void dccp_v4_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->opt);
@@ -1293,7 +1136,11 @@ static struct request_sock_ops dccp_request_sock_ops = {
.send_reset = dccp_v4_ctl_send_reset,
};
-struct proto dccp_v4_prot = {
+static struct timewait_sock_ops dccp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct inet_timewait_sock),
+};
+
+struct proto dccp_prot = {
.name = "DCCP",
.owner = THIS_MODULE,
.close = dccp_close,
@@ -1307,7 +1154,7 @@ struct proto dccp_v4_prot = {
.recvmsg = dccp_recvmsg,
.backlog_rcv = dccp_v4_do_rcv,
.hash = dccp_v4_hash,
- .unhash = dccp_v4_unhash,
+ .unhash = dccp_unhash,
.accept = inet_csk_accept,
.get_port = dccp_v4_get_port,
.shutdown = dccp_shutdown,
@@ -1316,5 +1163,7 @@ struct proto dccp_v4_prot = {
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
.rsk_prot = &dccp_request_sock_ops,
- .twsk_obj_size = sizeof(struct inet_timewait_sock),
+ .twsk_prot = &dccp_timewait_sock_ops,
};
+
+EXPORT_SYMBOL_GPL(dccp_prot);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 000000000000..c609dc78f487
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1261 @@
+/*
+ * DCCP over IPv6
+ * Linux INET6 implementation
+ *
+ * Based on net/dccp6/ipv6.c
+ *
+ * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/xfrm.h>
+
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/xfrm.h>
+
+#include "dccp.h"
+#include "ipv6.h"
+
+static void dccp_v6_ctl_send_reset(struct sk_buff *skb);
+static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
+ struct request_sock *req);
+static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb);
+
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
+static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
+
+static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
+{
+ return inet_csk_get_port(&dccp_hashinfo, sk, snum,
+ inet6_csk_bind_conflict);
+}
+
+static void dccp_v6_hash(struct sock *sk)
+{
+ if (sk->sk_state != DCCP_CLOSED) {
+ if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
+ dccp_prot.hash(sk);
+ return;
+ }
+ local_bh_disable();
+ __inet6_hash(&dccp_hashinfo, sk);
+ local_bh_enable();
+ }
+}
+
+static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ unsigned long base)
+{
+ return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base);
+}
+
+static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
+ skb->nh.ipv6h->saddr.s6_addr32,
+ dh->dccph_dport,
+ dh->dccph_sport);
+ else
+ return secure_dccp_sequence_number(skb->nh.iph->daddr,
+ skb->nh.iph->saddr,
+ dh->dccph_dport,
+ dh->dccph_sport);
+}
+
+static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p = NULL, final;
+ struct flowi fl;
+ struct dst_entry *dst;
+ int addr_type;
+ int err;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT;
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ memset(&fl, 0, sizeof(fl));
+
+ if (np->sndflow) {
+ fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl.fl6_flowlabel);
+ if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+ struct ip6_flowlabel *flowlabel;
+ flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
+ fl6_sock_release(flowlabel);
+ }
+ }
+
+ /*
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+
+ if (ipv6_addr_any(&usin->sin6_addr))
+ usin->sin6_addr.s6_addr[15] = 0x1;
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+ if(addr_type & IPV6_ADDR_MULTICAST)
+ return -ENETUNREACH;
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ /* If interface is set while binding, indices
+ * must coincide.
+ */
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != usin->sin6_scope_id)
+ return -EINVAL;
+
+ sk->sk_bound_dev_if = usin->sin6_scope_id;
+ }
+
+ /* Connect to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if)
+ return -EINVAL;
+ }
+
+ ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
+ np->flow_label = fl.fl6_flowlabel;
+
+ /*
+ * DCCP over IPv4
+ */
+
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
+ struct sockaddr_in sin;
+
+ SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = usin->sin6_port;
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+ icsk->icsk_af_ops = &dccp_ipv6_mapped;
+ sk->sk_backlog_rcv = dccp_v4_do_rcv;
+
+ err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+
+ if (err) {
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &dccp_ipv6_af_ops;
+ sk->sk_backlog_rcv = dccp_v6_do_rcv;
+ goto failure;
+ } else {
+ ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
+ inet->saddr);
+ ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
+ inet->rcv_saddr);
+ }
+
+ return err;
+ }
+
+ if (!ipv6_addr_any(&np->rcv_saddr))
+ saddr = &np->rcv_saddr;
+
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = usin->sin6_port;
+ fl.fl_ip_sport = inet->sport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ err = ip6_dst_lookup(sk, &dst, &fl);
+ if (err)
+ goto failure;
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto failure;
+
+ if (saddr == NULL) {
+ saddr = &fl.fl6_src;
+ ipv6_addr_copy(&np->rcv_saddr, saddr);
+ }
+
+ /* set the source address */
+ ipv6_addr_copy(&np->saddr, saddr);
+ inet->rcv_saddr = LOOPBACK4_IPV6;
+
+ ip6_dst_store(sk, dst, NULL);
+
+ icsk->icsk_ext_hdr_len = 0;
+ if (np->opt)
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
+
+ inet->dport = usin->sin6_port;
+
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = inet6_hash_connect(&dccp_death_row, sk);
+ if (err)
+ goto late_failure;
+ /* FIXME */
+#if 0
+ dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32,
+ np->daddr.s6_addr32,
+ inet->sport,
+ inet->dport);
+#endif
+ err = dccp_connect(sk);
+ if (err)
+ goto late_failure;
+
+ return 0;
+
+late_failure:
+ dccp_set_state(sk, DCCP_CLOSED);
+ __sk_dst_reset(sk);
+failure:
+ inet->dport = 0;
+ sk->sk_route_caps = 0;
+ return err;
+}
+
+static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ int type, int code, int offset, __u32 info)
+{
+ struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ struct ipv6_pinfo *np;
+ struct sock *sk;
+ int err;
+ __u64 seq;
+
+ sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
+ &hdr->saddr, dh->dccph_sport, skb->dev->ifindex);
+
+ if (sk == NULL) {
+ ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ return;
+ }
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ return;
+ }
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
+ np = inet6_sk(sk);
+
+ if (type == ICMPV6_PKT_TOOBIG) {
+ struct dst_entry *dst = NULL;
+
+ if (sock_owned_by_user(sk))
+ goto out;
+ if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
+ goto out;
+
+ /* icmp should have updated the destination cache entry */
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi fl;
+
+ /* BUGGG_FUTURE: Again, it is not clear how
+ to handle rthdr case. Ignore this complexity
+ for now.
+ */
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet->dport;
+ fl.fl_ip_sport = inet->sport;
+
+ if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
+ sk->sk_err_soft = -err;
+ goto out;
+ }
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_err_soft = -err;
+ goto out;
+ }
+
+ } else
+ dst_hold(dst);
+
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ dccp_sync_mss(sk, dst_mtu(dst));
+ } /* else let the usual retransmit timer handle it */
+ dst_release(dst);
+ goto out;
+ }
+
+ icmpv6_err_convert(type, code, &err);
+
+ seq = DCCP_SKB_CB(skb)->dccpd_seq;
+ /* Might be for an request_sock */
+ switch (sk->sk_state) {
+ struct request_sock *req, **prev;
+ case DCCP_LISTEN:
+ if (sock_owned_by_user(sk))
+ goto out;
+
+ req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
+ &hdr->daddr, &hdr->saddr,
+ inet6_iif(skb));
+ if (!req)
+ goto out;
+
+ /* ICMPs are not backlogged, hence we cannot get
+ * an established socket here.
+ */
+ BUG_TRAP(req->sk == NULL);
+
+ if (seq != dccp_rsk(req)->dreq_iss) {
+ NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ inet_csk_reqsk_queue_drop(sk, req, prev);
+ goto out;
+
+ case DCCP_REQUESTING:
+ case DCCP_RESPOND: /* Cannot happen.
+ It can, it SYNs are crossed. --ANK */
+ if (!sock_owned_by_user(sk)) {
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ sk->sk_err = err;
+ /*
+ * Wake people up to see the error
+ * (see connect in sock.c)
+ */
+ sk->sk_error_report(sk);
+
+ dccp_done(sk);
+ } else
+ sk->sk_err_soft = err;
+ goto out;
+ }
+
+ if (!sock_owned_by_user(sk) && np->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else
+ sk->sk_err_soft = err;
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *skb;
+ struct ipv6_txoptions *opt = NULL;
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+ int err = -1;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+ fl.fl6_flowlabel = 0;
+ fl.oif = ireq6->iif;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+ fl.fl_ip_sport = inet_sk(sk)->sport;
+
+ if (dst == NULL) {
+ opt = np->opt;
+ if (opt == NULL &&
+ np->rxopt.bits.osrcrt == 2 &&
+ ireq6->pktopts) {
+ struct sk_buff *pktopts = ireq6->pktopts;
+ struct inet6_skb_parm *rxopt = IP6CB(pktopts);
+ if (rxopt->srcrt)
+ opt = ipv6_invert_rthdr(sk,
+ (struct ipv6_rt_hdr *)(pktopts->nh.raw +
+ rxopt->srcrt));
+ }
+
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ err = ip6_dst_lookup(sk, &dst, &fl);
+ if (err)
+ goto done;
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto done;
+ }
+
+ skb = dccp_make_response(sk, dst, req);
+ if (skb != NULL) {
+ struct dccp_hdr *dh = dccp_hdr(skb);
+ dh->dccph_checksum = dccp_v6_check(dh, skb->len,
+ &ireq6->loc_addr,
+ &ireq6->rmt_addr,
+ csum_partial((char *)dh,
+ skb->len,
+ skb->csum));
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ err = ip6_xmit(sk, skb, &fl, opt, 0);
+ if (err == NET_XMIT_CN)
+ err = 0;
+ }
+
+done:
+ if (opt && opt != np->opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ return err;
+}
+
+static void dccp_v6_reqsk_destructor(struct request_sock *req)
+{
+ if (inet6_rsk(req)->pktopts != NULL)
+ kfree_skb(inet6_rsk(req)->pktopts);
+}
+
+static struct request_sock_ops dccp6_request_sock_ops = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct dccp6_request_sock),
+ .rtx_syn_ack = dccp_v6_send_response,
+ .send_ack = dccp_v6_reqsk_send_ack,
+ .destructor = dccp_v6_reqsk_destructor,
+ .send_reset = dccp_v6_ctl_send_reset,
+};
+
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
+};
+
+static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr,
+ len, IPPROTO_DCCP,
+ csum_partial((char *)dh,
+ dh->dccph_doff << 2,
+ skb->csum));
+}
+
+static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
+{
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct sk_buff *skb;
+ struct flowi fl;
+ u64 seqno;
+
+ if (rxdh->dccph_type == DCCP_PKT_RESET)
+ return;
+
+ if (!ipv6_unicast_destination(rxskb))
+ return;
+
+ /*
+ * We need to grab some memory, and put together an RST,
+ * and then put it into the queue to be sent.
+ */
+
+ skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_reset_len, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_reset_len);
+
+ skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_reset_len);
+
+ /* Swap the send and the receive. */
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_reset_len / 4;
+ dh->dccph_x = 1;
+ dccp_hdr_reset(skb)->dccph_reset_code =
+ DCCP_SKB_CB(rxskb)->dccpd_reset_code;
+
+ /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
+ seqno = 0;
+ if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
+
+ dccp_hdr_set_seq(dh, seqno);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ memset(&fl, 0, sizeof(fl));
+ ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
+ dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
+ sizeof(*dh), IPPROTO_DCCP,
+ skb->csum);
+ fl.proto = IPPROTO_DCCP;
+ fl.oif = inet6_iif(rxskb);
+ fl.fl_ip_dport = dh->dccph_dport;
+ fl.fl_ip_sport = dh->dccph_sport;
+
+ /* sk = NULL, but it is safe for now. RST socket required. */
+ if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
+ if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
+ ip6_xmit(NULL, skb, &fl, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ return;
+ }
+ }
+
+ kfree_skb(skb);
+}
+
+static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb)
+{
+ struct flowi fl;
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_ack_bits);
+ struct sk_buff *skb;
+
+ skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_ack_len, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_ack_len);
+
+ skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_ack_len);
+
+ /* Build DCCP header and checksum it. */
+ dh->dccph_type = DCCP_PKT_ACK;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_ack_len / 4;
+ dh->dccph_x = 1;
+
+ dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ memset(&fl, 0, sizeof(fl));
+ ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
+
+ /* FIXME: calculate checksum, IPv4 also should... */
+
+ fl.proto = IPPROTO_DCCP;
+ fl.oif = inet6_iif(rxskb);
+ fl.fl_ip_dport = dh->dccph_dport;
+ fl.fl_ip_sport = dh->dccph_sport;
+
+ if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
+ if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
+ ip6_xmit(NULL, skb, &fl, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ return;
+ }
+ }
+
+ kfree_skb(skb);
+}
+
+static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
+ struct request_sock *req)
+{
+ dccp_v6_ctl_send_ack(skb);
+}
+
+static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const struct ipv6hdr *iph = skb->nh.ipv6h;
+ struct sock *nsk;
+ struct request_sock **prev;
+ /* Find possible connection requests. */
+ struct request_sock *req = inet6_csk_search_req(sk, &prev,
+ dh->dccph_sport,
+ &iph->saddr,
+ &iph->daddr,
+ inet6_iif(skb));
+ if (req != NULL)
+ return dccp_check_req(sk, skb, req, prev);
+
+ nsk = __inet6_lookup_established(&dccp_hashinfo,
+ &iph->saddr, dh->dccph_sport,
+ &iph->daddr, ntohs(dh->dccph_dport),
+ inet6_iif(skb));
+
+ if (nsk != NULL) {
+ if (nsk->sk_state != DCCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
+ return NULL;
+ }
+
+ return sk;
+}
+
+static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct dccp_sock dp;
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ struct inet6_request_sock *ireq6;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return dccp_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+ if (dccp_bad_service_code(sk, service)) {
+ reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ goto drop;
+ }
+ /*
+ * There are no SYN attacks on IPv6, yet...
+ */
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot);
+ if (req == NULL)
+ goto drop;
+
+ /* FIXME: process options */
+
+ dccp_openreq_init(req, &dp, skb);
+
+ ireq6 = inet6_rsk(req);
+ ireq = inet_rsk(req);
+ ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
+ req->rcv_wnd = 100; /* Fake, option parsing will get the
+ right value */
+ ireq6->pktopts = NULL;
+
+ if (ipv6_opt_accepted(sk, skb) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ atomic_inc(&skb->users);
+ ireq6->pktopts = skb;
+ }
+ ireq6->iif = sk->sk_bound_dev_if;
+
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq6->iif = inet6_iif(skb);
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * In fact we defer setting S.GSR, S.SWL, S.SWH to
+ * dccp_create_openreq_child.
+ */
+ dreq = dccp_rsk(req);
+ dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_iss = dccp_v6_init_sequence(sk, skb);
+ dreq->dreq_service = service;
+
+ if (dccp_v6_send_response(sk, req, NULL))
+ goto drop_and_free;
+
+ inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ dcb->dccpd_reset_code = reset_code;
+ return -1;
+}
+
+static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct inet_sock *newinet;
+ struct dccp_sock *newdp;
+ struct dccp6_sock *newdp6;
+ struct sock *newsk;
+ struct ipv6_txoptions *opt;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ /*
+ * v6 mapped
+ */
+
+ newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+ if (newsk == NULL)
+ return NULL;
+
+ newdp6 = (struct dccp6_sock *)newsk;
+ newdp = dccp_sk(newsk);
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = &newdp6->inet6;
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
+ newinet->daddr);
+
+ ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
+ newinet->saddr);
+
+ ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+
+ inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
+ newsk->sk_backlog_rcv = dccp_v4_do_rcv;
+ newnp->pktoptions = NULL;
+ newnp->opt = NULL;
+ newnp->mcast_oif = inet6_iif(skb);
+ newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+ * here, dccp_create_openreq_child now does this for us, see the comment in
+ * that function for the gory details. -acme
+ */
+
+ /* It is tricky place. Until this moment IPv4 tcp
+ worked with IPv6 icsk.icsk_af_ops.
+ Sync it now.
+ */
+ dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
+
+ return newsk;
+ }
+
+ opt = np->opt;
+
+ if (sk_acceptq_is_full(sk))
+ goto out_overflow;
+
+ if (np->rxopt.bits.osrcrt == 2 &&
+ opt == NULL && ireq6->pktopts) {
+ struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
+ if (rxopt->srcrt)
+ opt = ipv6_invert_rthdr(sk,
+ (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
+ rxopt->srcrt));
+ }
+
+ if (dst == NULL) {
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+ ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+ fl.fl_ip_sport = inet_sk(sk)->sport;
+
+ if (ip6_dst_lookup(sk, &dst, &fl))
+ goto out;
+
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto out;
+ }
+
+ newsk = dccp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+ goto out;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks
+ * count here, dccp_create_openreq_child now does this for us, see the
+ * comment in that function for the gory details. -acme
+ */
+
+ ip6_dst_store(newsk, dst, NULL);
+ newsk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+
+ newdp6 = (struct dccp6_sock *)newsk;
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = &newdp6->inet6;
+ newdp = dccp_sk(newsk);
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
+ ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
+ ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
+ newsk->sk_bound_dev_if = ireq6->iif;
+
+ /* Now IPv6 options...
+
+ First: no IPv4 options.
+ */
+ newinet->opt = NULL;
+
+ /* Clone RX bits */
+ newnp->rxopt.all = np->rxopt.all;
+
+ /* Clone pktoptions received with SYN */
+ newnp->pktoptions = NULL;
+ if (ireq6->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
+ kfree_skb(ireq6->pktopts);
+ ireq6->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
+ newnp->opt = NULL;
+ newnp->mcast_oif = inet6_iif(skb);
+ newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+
+ /* Clone native IPv6 options from listening socket (if any)
+
+ Yes, keeping reference count would be much more clever,
+ but we make one more one thing there: reattach optmem
+ to newsk.
+ */
+ if (opt) {
+ newnp->opt = ipv6_dup_options(newsk, opt);
+ if (opt != np->opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ }
+
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ if (newnp->opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+ newnp->opt->opt_flen);
+
+ dccp_sync_mss(newsk, dst_mtu(dst));
+
+ newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+
+ __inet6_hash(&dccp_hashinfo, newsk);
+ inet_inherit_port(&dccp_hashinfo, sk, newsk);
+
+ return newsk;
+
+out_overflow:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+out:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ if (opt && opt != np->opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ dst_release(dst);
+ return NULL;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *opt_skb = NULL;
+
+ /* Imagine: socket is IPv6. IPv4 packet arrives,
+ goes to IPv4 receive handler and backlogged.
+ From backlog it always goes here. Kerboom...
+ Fortunately, dccp_rcv_established and rcv_established
+ handle them correctly, but it is not case with
+ dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
+ */
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return dccp_v4_do_rcv(sk, skb);
+
+ if (sk_filter(sk, skb, 0))
+ goto discard;
+
+ /*
+ * socket locking is here for SMP purposes as backlog rcv
+ * is currently called with bh processing disabled.
+ */
+
+ /* Do Stevens' IPV6_PKTOPTIONS.
+
+ Yes, guys, it is the only place in our code, where we
+ may make it not affecting IPv4.
+ The rest of code is protocol independent,
+ and I do not like idea to uglify IPv4.
+
+ Actually, all the idea behind IPV6_PKTOPTIONS
+ looks not very well thought. For now we latch
+ options, received in the last packet, enqueued
+ by tcp. Feel free to propose better solution.
+ --ANK (980728)
+ */
+ if (np->rxopt.all)
+ opt_skb = skb_clone(skb, GFP_ATOMIC);
+
+ if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
+ goto reset;
+ return 0;
+ }
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ struct sock *nsk = dccp_v6_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ /*
+ * Queue it on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket..
+ */
+ if(nsk != sk) {
+ if (dccp_child_process(sk, nsk, skb))
+ goto reset;
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ return 0;
+ }
+ }
+
+ if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
+ goto reset;
+ return 0;
+
+reset:
+ dccp_v6_ctl_send_reset(skb);
+discard:
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+{
+ const struct dccp_hdr *dh;
+ struct sk_buff *skb = *pskb;
+ struct sock *sk;
+
+ /* Step 1: Check header basics: */
+
+ if (dccp_invalid_packet(skb))
+ goto discard_it;
+
+ dh = dccp_hdr(skb);
+
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
+ DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+ if (dccp_packet_without_ack(skb))
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+ else
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+
+ /* Step 2:
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
+ dh->dccph_sport,
+ &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
+ inet6_iif(skb));
+ /*
+ * Step 2:
+ * If no socket ...
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk == NULL)
+ goto no_dccp_socket;
+
+ /*
+ * Step 2:
+ * ... or S.state == TIMEWAIT,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+
+ if (sk->sk_state == DCCP_TIME_WAIT)
+ goto do_time_wait;
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+ return sk_receive_skb(sk, skb) ? -1 : 0;
+
+no_dccp_socket:
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+ /*
+ * Step 2:
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (dh->dccph_type != DCCP_PKT_RESET) {
+ DCCP_SKB_CB(skb)->dccpd_reset_code =
+ DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v6_ctl_send_reset(skb);
+ }
+discard_it:
+
+ /*
+ * Discard frame
+ */
+
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ goto no_dccp_socket;
+}
+
+static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+ .queue_xmit = inet6_csk_xmit,
+ .send_check = dccp_v6_send_check,
+ .rebuild_header = inet6_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct ipv6hdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6)
+};
+
+/*
+ * DCCP over IPv4 via INET6 API
+ */
+static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6)
+};
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+static int dccp_v6_init_sock(struct sock *sk)
+{
+ int err = dccp_v4_init_sock(sk);
+
+ if (err == 0)
+ inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+
+ return err;
+}
+
+static int dccp_v6_destroy_sock(struct sock *sk)
+{
+ dccp_v4_destroy_sock(sk);
+ return inet6_destroy_sock(sk);
+}
+
+static struct proto dccp_v6_prot = {
+ .name = "DCCPv6",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v6_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v6_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v6_do_rcv,
+ .hash = dccp_v6_hash,
+ .unhash = dccp_unhash,
+ .accept = inet_csk_accept,
+ .get_port = dccp_v6_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_v6_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp6_sock),
+ .rsk_prot = &dccp6_request_sock_ops,
+ .twsk_prot = &dccp6_timewait_sock_ops,
+};
+
+static struct inet6_protocol dccp_v6_protocol = {
+ .handler = dccp_v6_rcv,
+ .err_handler = dccp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+static struct proto_ops inet6_dccp_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet6_getname,
+ .poll = dccp_poll,
+ .ioctl = inet6_ioctl,
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct inet_protosw dccp_v6_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v6_prot,
+ .ops = &inet6_dccp_ops,
+ .capability = -1,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+static int __init dccp_v6_init(void)
+{
+ int err = proto_register(&dccp_v6_prot, 1);
+
+ if (err != 0)
+ goto out;
+
+ err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ if (err != 0)
+ goto out_unregister_proto;
+
+ inet6_register_protosw(&dccp_v6_protosw);
+out:
+ return err;
+out_unregister_proto:
+ proto_unregister(&dccp_v6_prot);
+ goto out;
+}
+
+static void __exit dccp_v6_exit(void)
+{
+ inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ inet6_unregister_protosw(&dccp_v6_protosw);
+ proto_unregister(&dccp_v6_prot);
+}
+
+module_init(dccp_v6_init);
+module_exit(dccp_v6_exit);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 000000000000..e4d4e9309270
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,37 @@
+#ifndef _DCCP_IPV6_H
+#define _DCCP_IPV6_H
+/*
+ * net/dccp/ipv6.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/ipv6.h>
+
+struct dccp6_sock {
+ struct dccp_sock dccp;
+ /*
+ * ipv6_pinfo has to be the last member of dccp6_sock,
+ * see inet6_sk_generic.
+ */
+ struct ipv6_pinfo inet6;
+};
+
+struct dccp6_request_sock {
+ struct dccp_request_sock dccp;
+ struct inet6_request_sock inet6;
+};
+
+struct dccp6_timewait_sock {
+ struct inet_timewait_sock inet;
+ struct inet6_timewait_sock tw6;
+};
+
+#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 1393461898bb..29261fc198e7 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = {
(unsigned long)&dccp_death_row),
};
+EXPORT_SYMBOL_GPL(dccp_death_row);
+
void dccp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
@@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (tw->tw_family == PF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_timewait_sock *tw6;
+
+ tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+ tw6 = inet6_twsk((struct sock *)tw);
+ ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6only = np->ipv6only;
+ }
+#endif
/* Linkage updates. */
__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
@@ -170,6 +183,8 @@ out_free:
return newsk;
}
+EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
+
/*
* Process an incoming packet for RESPOND sockets represented
* as an request_sock.
@@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
goto drop;
}
- child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
if (child == NULL)
goto listen_overflow;
@@ -236,6 +251,8 @@ drop:
goto out;
}
+EXPORT_SYMBOL_GPL(dccp_check_req);
+
/*
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
@@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child,
sock_put(child);
return ret;
}
+
+EXPORT_SYMBOL_GPL(dccp_child_process);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 74ff87025878..efd7ffb903a1 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <net/inet_sock.h>
#include <net/sock.h>
#include "ackvec.h"
@@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (likely(skb != NULL)) {
const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
struct dccp_hdr *dh;
@@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
break;
}
- dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
- inet->daddr);
+ icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (set_ack)
dccp_event_ack_sent(sk);
@@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = ip_queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(skb, 0);
if (err <= 0)
return err;
@@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- int mss_now;
-
- /*
- * FIXME: we really should be using the af_specific thing to support
- * IPv6.
- * mss_now = pmtu - tp->af_specific->net_header_len -
- * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
- */
- mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
- sizeof(struct dccp_hdr_ext);
+ int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
+ sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
/* Now subtract optional transport overhead */
- mss_now -= dp->dccps_ext_header_len;
+ mss_now -= icsk->icsk_ext_hdr_len;
/*
* FIXME: this should come from the CCID infrastructure, where, say,
@@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
/* And store cached results */
- dp->dccps_pmtu_cookie = pmtu;
+ icsk->icsk_pmtu_cookie = pmtu;
dp->dccps_mss_cache = mss_now;
return mss_now;
}
+EXPORT_SYMBOL_GPL(dccp_sync_mss);
+
void dccp_write_space(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
@@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
- if (inet_sk_rebuild_header(sk) != 0)
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
return -EHOSTUNREACH; /* Routing failure or similar. */
return dccp_transmit_skb(sk, (skb_cloned(skb) ?
@@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
return skb;
}
+EXPORT_SYMBOL_GPL(dccp_make_response);
+
struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
const enum dccp_reset_codes code)
@@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
*/
static inline void dccp_connect_init(struct sock *sk)
{
+ struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk)
dccp_sync_mss(sk, dst_mtu(dst));
- /*
- * FIXME: set dp->{dccps_swh,dccps_swl}, with
- * something like dccp_inc_seq
- */
+ dccp_update_gss(sk, dp->dccps_iss);
+ /*
+ * SWL and AWL are initially adjusted so that they are not less than
+ * the initial Sequence Numbers received and sent, respectively:
+ * SWL := max(GSR + 1 - floor(W/4), ISR),
+ * AWL := max(GSS - W' + 1, ISS).
+ * These adjustments MUST be applied only at the beginning of the
+ * connection.
+ */
+ dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
icsk->icsk_retransmits = 0;
}
@@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_connect);
+
void dccp_send_ack(struct sock *sk)
{
/* If we have been reset, we may not send again. */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8a6b2a9e4581..65b11ea90d85 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,7 +24,7 @@
#include <net/checksum.h>
#include <net/inet_common.h>
-#include <net/ip.h>
+#include <net/inet_sock.h>
#include <net/protocol.h>
#include <net/sock.h>
#include <net/xfrm.h>
@@ -34,15 +34,18 @@
#include <linux/timer.h>
#include <linux/delay.h>
#include <linux/poll.h>
-#include <linux/dccp.h>
#include "ccid.h"
#include "dccp.h"
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+EXPORT_SYMBOL_GPL(dccp_statistics);
+
atomic_t dccp_orphan_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(dccp_orphan_count);
+
static struct net_protocol dccp_protocol = {
.handler = dccp_v4_rcv,
.err_handler = dccp_v4_err,
@@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags)
return err;
}
+EXPORT_SYMBOL_GPL(dccp_disconnect);
+
/*
* Wait for a DCCP event.
*
@@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags)
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
-static unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
{
unsigned int mask;
struct sock *sk = sock->sk;
@@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock,
return mask;
}
+EXPORT_SYMBOL_GPL(dccp_poll);
+
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
dccp_pr_debug("entry\n");
return -ENOIOCTLCMD;
}
+EXPORT_SYMBOL_GPL(dccp_ioctl);
+
static int dccp_setsockopt_service(struct sock *sk, const u32 service,
char __user *optval, int optlen)
{
@@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
int val;
if (level != SOL_DCCP)
- return ip_setsockopt(sk, level, optname, optval, optlen);
+ return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+ optname, optval,
+ optlen);
if (optlen < sizeof(int))
return -EINVAL;
@@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
return err;
}
+EXPORT_SYMBOL_GPL(dccp_setsockopt);
+
static int dccp_getsockopt_service(struct sock *sk, int len,
u32 __user *optval,
int __user *optlen)
@@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
int val, len;
if (level != SOL_DCCP)
- return ip_getsockopt(sk, level, optname, optval, optlen);
-
+ return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+ optname, optval,
+ optlen);
if (get_user(len, optlen))
return -EFAULT;
@@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_getsockopt);
+
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -410,6 +426,8 @@ out_discard:
goto out_release;
}
+EXPORT_SYMBOL_GPL(dccp_sendmsg);
+
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags, int *addr_len)
{
@@ -507,7 +525,9 @@ out:
return len;
}
-static int inet_dccp_listen(struct socket *sock, int backlog)
+EXPORT_SYMBOL_GPL(dccp_recvmsg);
+
+int inet_dccp_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
@@ -543,6 +563,8 @@ out:
return err;
}
+EXPORT_SYMBOL_GPL(inet_dccp_listen);
+
static const unsigned char dccp_new_state[] = {
/* current state: new state: action: */
[0] = DCCP_CLOSED,
@@ -648,12 +670,16 @@ adjudge_to_death:
sock_put(sk);
}
+EXPORT_SYMBOL_GPL(dccp_close);
+
void dccp_shutdown(struct sock *sk, int how)
{
dccp_pr_debug("entry\n");
}
-static struct proto_ops inet_dccp_ops = {
+EXPORT_SYMBOL_GPL(dccp_shutdown);
+
+static const struct proto_ops inet_dccp_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops;
static struct inet_protosw dccp_v4_protosw = {
.type = SOCK_DCCP,
.protocol = IPPROTO_DCCP,
- .prot = &dccp_v4_prot,
+ .prot = &dccp_prot,
.ops = &inet_dccp_ops,
.capability = -1,
.no_check = 0,
- .flags = 0,
+ .flags = INET_PROTOSW_ICSK,
};
/*
@@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
int dccp_debug;
module_param(dccp_debug, int, 0444);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+
+EXPORT_SYMBOL_GPL(dccp_debug);
#endif
static int __init dccp_init(void)
{
unsigned long goal;
int ehash_order, bhash_order, i;
- int rc = proto_register(&dccp_v4_prot, 1);
+ int rc = proto_register(&dccp_prot, 1);
if (rc)
goto out;
@@ -869,7 +897,7 @@ out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
dccp_hashinfo.bind_bucket_cachep = NULL;
out_proto_unregister:
- proto_unregister(&dccp_v4_prot);
+ proto_unregister(&dccp_prot);
goto out;
}
@@ -892,7 +920,7 @@ static void __exit dccp_fini(void)
get_order(dccp_hashinfo.ehash_size *
sizeof(struct inet_ehash_bucket)));
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- proto_unregister(&dccp_v4_prot);
+ proto_unregister(&dccp_prot);
}
module_init(dccp_init);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d402e9020c68..78ec5344be86 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk);
#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
-static struct proto_ops dn_proto_ops;
+static const struct proto_ops dn_proto_ops;
static DEFINE_RWLOCK(dn_hash_lock);
static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
static struct hlist_head dn_wild_sk;
@@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
default:
- err = dev_ioctl(cmd, (void __user *)arg);
+ err = -ENOIOCTLCMD;
break;
}
@@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops dn_proto_ops = {
+static const struct proto_ops dn_proto_ops = {
.family = AF_DECnet,
.owner = THIS_MODULE,
.release = dn_release,
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 8d0cc3cf3e49..33ab256cfd4a 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb)
}
}
- if (!dn_db->router) {
- dn_db->router = neigh_clone(neigh);
- } else {
- if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
- neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+ /* Only use routers in our area */
+ if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) {
+ if (!dn_db->router) {
+ dn_db->router = neigh_clone(neigh);
+ } else {
+ if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
+ neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+ }
}
write_unlock(&neigh->lock);
neigh_release(neigh);
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 369f25b60f3f..44bda85e678f 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
got_it:
if (sk != NULL) {
struct dn_scp *scp = DN_SK(sk);
- int ret;
/* Reset backoff */
scp->nsp_rxtshift = 0;
@@ -807,21 +806,7 @@ got_it:
goto free_out;
}
- bh_lock_sock(sk);
- ret = NET_RX_SUCCESS;
- if (decnet_debug_level & 8)
- printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n",
- (int)cb->rt_flags, (int)cb->nsp_flags,
- (int)cb->src_port, (int)cb->dst_port,
- !!sock_owned_by_user(sk));
- if (!sock_owned_by_user(sk))
- ret = dn_nsp_backlog_rcv(sk, skb);
- else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
- sock_put(sk);
-
- return ret;
+ return sk_receive_skb(sk, skb);
}
return dn_nsp_no_socket(skb, reason);
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 34fdac51df96..c792994d7952 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
#include <linux/if_arp.h>
#include <linux/wireless.h>
#include <linux/skbuff.h>
+#include <linux/udp.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <linux/stat.h>
@@ -45,7 +46,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-static struct proto_ops econet_ops;
+static const struct proto_ops econet_ops;
static struct hlist_head econet_sklist;
static DEFINE_RWLOCK(econet_lock);
@@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256];
#define EC_PORT_IP 0xd2
#ifdef CONFIG_ECONET_AUNUDP
-static spinlock_t aun_queue_lock;
+static DEFINE_SPINLOCK(aun_queue_lock);
static struct socket *udpsock;
#define AUN_PORT 0x8000
@@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
break;
default:
- return dev_ioctl(cmd, argp);
+ return -ENOIOCTLCMD;
}
/*NOTREACHED*/
return 0;
@@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
.family = PF_ECONET,
.owner = THIS_MODULE,
.release = econet_release,
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 03efaacbdb73..4cc6f41c6930 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
return 1;
}
- if ((is_multicast_ether_addr(hdr->addr1) ||
- is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt :
- ieee->host_decrypt) {
+ if (is_multicast_ether_addr(hdr->addr1)
+ ? ieee->host_mc_decrypt : ieee->host_decrypt) {
int idx = 0;
if (skb->len >= hdrlen + 3)
idx = skb->data[hdrlen + 3] >> 6;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e55136ae09f4..011cca7ae02b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -456,6 +456,14 @@ config TCP_CONG_BIC
increase provides TCP friendliness.
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+config TCP_CONG_CUBIC
+ tristate "CUBIC TCP"
+ default m
+ ---help---
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f0435d00db6b..c54edd76de09 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf249000..966a071a408c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
#include <linux/smp_lock.h>
#include <linux/inet.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -302,6 +303,7 @@ lookup_protocol:
sk->sk_reuse = 1;
inet = inet_sk(sk);
+ inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
@@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
err = devinet_ioctl(cmd, (void __user *)arg);
break;
default:
- if (!sk->sk_prot->ioctl ||
- (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
- -ENOIOCTLCMD)
- err = dev_ioctl(cmd, (void __user *)arg);
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
break;
}
return err;
}
-struct proto_ops inet_stream_ops = {
+const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = {
.sendpage = tcp_sendpage
};
-struct proto_ops inet_dgram_ops = {
+const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = {
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
* udp_poll
*/
-static struct proto_ops inet_sockraw_ops = {
+static const struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] =
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
- .flags = INET_PROTOSW_PERMANENT,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
},
{
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1ba..aed537fa2c88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
#include <net/icmp.h>
+#include <net/protocol.h>
#include <asm/scatterlist.h>
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d7..37432088fe6d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a2..7b9bb28e2ee9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
#endif
#include <linux/kmod.h>
+#include <net/arp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b7..73bfcae8af9c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <net/icmp.h>
+#include <net/protocol.h>
#include <net/udp.h>
/* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d687..18f5e509281a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb169..e2890ec8159e 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc1518..0dd4d06e456d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e3..ef4724de7350 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
#include <linux/netlink.h>
#include <linux/init.h>
+#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 705e3ce86df9..e320b32373e5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -41,6 +41,13 @@
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ * David S. Miller, <davem@davemloft.net>
+ * Stephen Hemminger <shemminger@osdl.org>
+ * Paul E. McKenney <paulmck@us.ibm.com>
+ * Patrick McHardy <kaber@trash.net>
*/
#define VERSION "0.404"
@@ -59,6 +66,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d2..be5a519cd2f8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f01..34758118c10c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
+
+#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3fe021f1a566..ae20281d8deb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
*/
int sysctl_local_port_range[2] = { 1024, 4999 };
-static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
{
const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
struct sock *sk2;
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke
return node != NULL;
}
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
int inet_csk_get_port(struct inet_hashinfo *hashinfo,
- struct sock *sk, unsigned short snum)
+ struct sock *sk, unsigned short snum,
+ int (*bind_conflict)(const struct sock *sk,
+ const struct inet_bind_bucket *tb))
{
struct inet_bind_hashbucket *head;
struct hlist_node *node;
@@ -125,7 +130,7 @@ tb_found:
goto success;
} else {
ret = 1;
- if (inet_csk_bind_conflict(sk, tb))
+ if (bind_conflict(sk, tb))
goto fail_unlock;
}
}
@@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
EXPORT_SYMBOL_GPL(inet_csk_search_req);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- const unsigned timeout)
+ unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
@@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ const struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = inet->daddr;
+ sin->sin_port = inet->dport;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cfd..c49908192047 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
r->idiag_inode = 0;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
if (r->idiag_family == AF_INET6) {
- const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+ const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tcp6tw->tw_v6_rcv_saddr);
+ &tw6->tw_v6_rcv_saddr);
ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tcp6tw->tw_v6_daddr);
+ &tw6->tw_v6_daddr);
}
#endif
nlh->nlmsg_len = skb->tail - b;
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
if (r->idiag_family == AF_INET6) {
ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tcp6_rsk(req)->loc_addr);
+ &inet6_rsk(req)->loc_addr);
ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tcp6_rsk(req)->rmt_addr);
+ &inet6_rsk(req)->rmt_addr);
}
#endif
nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
entry.saddr =
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
(entry.family == AF_INET6) ?
- tcp6_rsk(req)->loc_addr.s6_addr32 :
+ inet6_rsk(req)->loc_addr.s6_addr32 :
#endif
&ireq->loc_addr;
entry.daddr =
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
(entry.family == AF_INET6) ?
- tcp6_rsk(req)->rmt_addr.s6_addr32 :
+ inet6_rsk(req)->rmt_addr.s6_addr32 :
#endif
&ireq->rmt_addr;
entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d2..33228115cda4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
#include <linux/config.h>
#include <linux/module.h>
+#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
+#include <net/ip.h>
/*
* Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+ u32 daddr = inet->rcv_saddr;
+ u32 saddr = inet->daddr;
+ int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
+ prefetch(head->chain.first);
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+ tw = inet_twsk(sk2);
+
+ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+ goto not_unique;
+ }
+ }
+ tw = NULL;
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+unique:
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity. */
+ inet->num = lport;
+ inet->sport = htons(lport);
+ sk->sk_hash = hash;
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp) {
+ *twp = tw;
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+ } else if (tw) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw, death_row);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ inet_twsk_put(tw);
+ }
+
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
+ inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int range = high - low;
+ int i;
+ int port;
+ static u32 hint;
+ u32 offset = hint + inet_sk_port_offset(sk);
+ struct hlist_node *node;
+ struct inet_timewait_sock *tw = NULL;
+
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+ head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (tb->port == port) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__inet_check_established(death_row,
+ sk, port,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->sport = htons(port);
+ __inet_hash(hinfo, sk, 0);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ inet_twsk_deschedule(tw, death_row);;
+ inet_twsk_put(tw);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ __inet_hash(hinfo, sk, 0);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = __inet_check_established(death_row, sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet_hash_connect);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a68811..417f126c749e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
{
- struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
- SLAB_ATOMIC);
+ struct inet_timewait_sock *tw =
+ kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ SLAB_ATOMIC);
if (tw != NULL) {
const struct inet_sock *inet = inet_sk(sk);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
return NULL;
n->v4daddr = daddr;
atomic_set(&n->refcnt, 1);
+ atomic_set(&n->rid, 0);
n->ip_id_count = secure_ip_id(daddr);
n->tcp_ts_stamp = 0;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
* Patrick McHardy : LRU queue of frag heads for evictor.
*/
+#include <linux/compiler.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -38,6 +39,7 @@
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
+#include <net/inetpeer.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
@@ -56,6 +58,8 @@
int sysctl_ipfrag_high_thresh = 256*1024;
int sysctl_ipfrag_low_thresh = 192*1024;
+int sysctl_ipfrag_max_dist = 64;
+
/* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
*/
@@ -89,8 +93,10 @@ struct ipq {
spinlock_t lock;
atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */
- int iif;
struct timeval stamp;
+ int iif;
+ unsigned int rid;
+ struct inet_peer *peer;
};
/* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
BUG_TRAP(qp->last_in&COMPLETE);
BUG_TRAP(del_timer(&qp->timer) == 0);
+ if (qp->peer)
+ inet_putpeer(qp->peer);
+
/* Release all fragment data. */
fp = qp->fragments;
while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
qp->meat = 0;
qp->fragments = NULL;
qp->iif = 0;
+ qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
/* Initialize a timer for this entry. */
init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
return ip_frag_create(hash, iph, user);
}
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+ struct inet_peer *peer = qp->peer;
+ unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int start, end;
+
+ int rc;
+
+ if (!peer || !max)
+ return 0;
+
+ start = qp->rid;
+ end = atomic_inc_return(&peer->rid);
+ qp->rid = end;
+
+ rc = qp->fragments && (end - start) > max;
+
+ if (rc) {
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ }
+
+ return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+ struct sk_buff *fp;
+
+ if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+ atomic_inc(&qp->refcnt);
+ return -ETIMEDOUT;
+ }
+
+ fp = qp->fragments;
+ do {
+ struct sk_buff *xp = fp->next;
+ frag_kfree_skb(fp, NULL);
+ fp = xp;
+ } while (fp);
+
+ qp->last_in = 0;
+ qp->len = 0;
+ qp->meat = 0;
+ qp->fragments = NULL;
+ qp->iif = 0;
+
+ return 0;
+}
+
/* Add new segment to existing queue. */
static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (qp->last_in & COMPLETE)
goto err;
+ if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+ ipq_kill(qp);
+ goto err;
+ }
+
offset = ntohs(skb->nh.iph->frag_off);
flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0d..e45846ae570b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b3..d3f6c468faf4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
+#include <net/route.h>
/*
* Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
hlen = iph->ihl * 4;
mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d87257309..6986e11d65cc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,12 +25,12 @@
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/tcp.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/netfilter.h>
@@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
err = ip_options_get_from_user(&opt, optval, optlen);
if (err)
break;
- if (sk->sk_type == SOCK_STREAM) {
- struct tcp_sock *tp = tcp_sk(sk);
+ if (inet->is_icsk) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (sk->sk_family == PF_INET ||
(!((1 << sk->sk_state) &
@@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
inet->daddr != LOOPBACK4_IPV6)) {
#endif
if (inet->opt)
- tp->ext_header_len -= inet->opt->optlen;
+ icsk->icsk_ext_hdr_len -= inet->opt->optlen;
if (opt)
- tp->ext_header_len += opt->optlen;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ icsk->icsk_ext_hdr_len += opt->optlen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
}
#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b40..d64e2ec8da7b 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
#include <net/xfrm.h>
#include <net/icmp.h>
#include <net/ipcomp.h>
+#include <net/protocol.h>
struct ipcomp_tfms {
struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d9..bb3613ec448c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
#include <linux/in.h>
#include <linux/if.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
@@ -58,6 +59,7 @@
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
+#include <net/route.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c9..caa3b7d2e48a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d7eb680101c2..9b176a942ac5 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
}
-#if 0000
-/*
- * Get reference to app by name (called from user context)
- */
-struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
-{
- struct ip_vs_app *app, *a = NULL;
-
- down(&__ip_vs_app_mutex);
-
- list_for_each_entry(ent, &ip_vs_app_list, a_list) {
- if (strcmp(app->name, appname))
- continue;
-
- /* softirq may call ip_vs_app_get too, so the caller
- must disable softirq on the current CPU */
- if (ip_vs_app_get(app))
- a = app;
- break;
- }
-
- up(&__ip_vs_app_mutex);
-
- return a;
-}
-#endif
-
-
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c655..81d90354c928 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,10 @@
*
*/
+#include <linux/in.h>
+#include <linux/net.h>
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/seq_file.h>
@@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get
if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
- IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
out:
ct_read_unlock(hash);
- IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
ct_read_unlock(hash);
- IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
cp->flags |= atomic_read(&dest->conn_flags);
cp->dest = dest;
- IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
if (!dest)
return;
- IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_conn_hash(cp);
expire_later:
- IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+ IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
atomic_read(&cp->refcnt)-1,
atomic_read(&cp->n_control));
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1a0843cd58a9..1aca94a9fd8b 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
return NULL;
IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
- "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+ "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760e..c935c5086d33 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
#include <linux/netfilter_ipv4.h>
#include <net/ip.h>
+#include <net/route.h>
#include <net/sock.h>
#include <asm/uaccess.h>
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
out:
read_unlock(&__ip_vs_svc_lock);
- IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+ IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
NIPQUAD(vaddr), ntohs(vport),
svc?"hit":"not hit");
@@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
*/
list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
- "refcnt=%d\n",
+ "dest->refcnt=%d\n",
dest->vfwmark,
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
@@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
dest = ip_vs_trash_get_dest(svc, daddr, dport);
if (dest != NULL) {
IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
- "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+ "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
NIPQUAD(daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
@@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
atomic_dec(&dest->svc->refcnt);
kfree(dest);
} else {
- IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+ IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
+ "dest->refcnt=%d\n",
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
list_add(&dest->n_list, &ip_vs_dest_trash);
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce93..9fee19c4c617 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
*
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa1..e7004741ac73 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
* Changes:
*
*/
+#include <linux/config.h>
#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 561cda326fa8..6e5cb92a5c83 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
* me to write this module.
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
/* for sysctl */
#include <linux/fs.h>
@@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
}
-#if 0000
-/*
- * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
- * returns bool success.
- */
-static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
- struct ip_vs_lblc_entry *en)
-{
- if (list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- /*
- * Remove it from the table
- */
- write_lock(&tbl->lock);
- list_del(&en->list);
- INIT_LIST_HEAD(&en->list);
- write_unlock(&tbl->lock);
-
- return 1;
-}
-#endif
-
-
/*
* Get ip_vs_lblc_entry associated with supplied parameters.
*/
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index ce456dbf09a5..32ba37ba72d8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
*
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
/* for sysctl */
#include <linux/fs.h>
@@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
}
-#if 0000
-/*
- * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
- * returns bool success.
- */
-static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
- struct ip_vs_lblcr_entry *en)
-{
- if (list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- /*
- * Remove it from the table
- */
- write_lock(&tbl->lock);
- list_del(&en->list);
- INIT_LIST_HEAD(&en->list);
- write_unlock(&tbl->lock);
-
- return 1;
-}
-#endif
-
-
/*
* Get ip_vs_lblcr_entry associated with supplied parameters.
*/
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd7..8b0505b09317 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
*
*/
+#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8e..c36ccf057a19 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
*
*/
+#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 0e878fd6215c..bc28b1160a3a 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_LAST] = 2*HZ,
};
-
-#if 0
-
-/* FIXME: This is going to die */
-
-static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
- [IP_VS_TCP_S_NONE] = 2*HZ,
- [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
- [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
- [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
- [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
- [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
- [IP_VS_TCP_S_CLOSE] = 10*HZ,
- [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
- [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
- [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
- [IP_VS_TCP_S_SYNACK] = 100*HZ,
- [IP_VS_TCP_S_LAST] = 2*HZ,
-};
-
-#endif
-
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = "NONE",
[IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
@@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
- "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+ "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
pp->name,
(state_off==TCP_DIR_OUTPUT)?"output ":"input ",
th->syn? 'S' : '.',
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aefa..89d9175d8f28 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
*
*/
+#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a39..7775e6cc68be 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
*
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d8062..1bca714bda3d 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h> /* for ip_mc_join_group */
+#include <linux/udp.h>
#include <net/ip.h>
#include <net/sock.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3c2e9639bba6..bba156304695 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -68,19 +68,14 @@ struct arpt_table_info {
unsigned int initial_entries;
unsigned int hook_entry[NF_ARP_NUMHOOKS];
unsigned int underflow[NF_ARP_NUMHOOKS];
- char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+ void *entries[NR_CPUS];
};
static LIST_HEAD(arpt_target);
static LIST_HEAD(arpt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
char *hdr_addr, int len)
{
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
outdev = out ? out->name : nulldevname;
read_lock_bh(&table->lock);
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private,
- smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
back = get_entry(table_base, table->private->underflow[hook]);
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp)
/* Figures out from what hook each rule can be called: returns 0 if
* there are loops. Puts hook bitmask in comefrom.
*/
-static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+static int mark_source_chains(struct arpt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct arpt_entry *e
- = (struct arpt_entry *)(newinfo->entries + pos);
+ = (struct arpt_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
goto next;
e = (struct arpt_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct arpt_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
newpos = pos + e->next_offset;
}
e = (struct arpt_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
static int translate_table(const char *name,
unsigned int valid_hooks,
struct arpt_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -710,11 +705,11 @@ static int translate_table(const char *name,
i = 0;
/* Walk through entries, checking offsets. */
- ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
@@ -743,29 +738,26 @@ static int translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks)) {
+ if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
duprintf("Looping hook\n");
return -ELOOP;
}
/* Finally, each sanity check must pass */
i = 0;
- ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ARPT_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
return 0;
}
+static inline int set_entry_to_counter(const struct arpt_entry *e,
+ struct arpt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void get_counters(const struct arpt_table_info *t,
struct arpt_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size,
struct arpt_entry *e;
struct arpt_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size,
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size,
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
struct arpt_entry_target *t;
- e = (struct arpt_entry *)(table->private->entries + off);
+ e = (struct arpt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct arpt_entry, counters),
&counters[num],
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries,
return ret;
}
+static void free_table_info(struct arpt_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct arpt_table_info *alloc_table_info(unsigned int size)
+{
+ struct arpt_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size,
+ cpu_to_node(cpu));
+
+ if (newinfo->entries[cpu] == NULL) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int do_replace(void __user *user, unsigned int len)
{
int ret;
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len)
struct arpt_table *t;
struct arpt_table_info *newinfo, *oldinfo;
struct arpt_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct arpt_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&arpt_mutex);
free_newinfo_counters_untrans:
- ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
+ ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len)
struct arpt_counters_info tmp, *paddc;
struct arpt_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- ARPT_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[smp_processor_id()];
+ ARPT_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table,
struct arpt_table_info *newinfo;
static struct arpt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo) {
ret = -ENOMEM;
return ret;
}
- memcpy(newinfo->entries, repl->entries, repl->size);
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
duprintf("arpt_register_table: translate table gives %d\n", ret);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&arpt_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table,
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void arpt_unregister_table(struct arpt_table *table)
{
+ void *loc_cpu_entry;
+
down(&arpt_mutex);
LIST_DELETE(&arpt_tables, table);
up(&arpt_mutex);
/* Decrease module usage counts and free resources */
- ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f5..0366eedb4d70 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
*
*/
+#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/ip.h>
#include <linux/moduleparam.h>
+#include <linux/udp.h>
#include <net/checksum.h>
#include <net/udp.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377a..57956dee60c8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/list.h>
+#include <linux/seq_file.h>
static DEFINE_RWLOCK(ip_ct_gre_lock);
#define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c7660..46becbe4fe58 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/seq_file.h>
#include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4b..a88bcc551244 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
#endif
#include <net/checksum.h>
#include <net/ip.h>
+#include <net/route.h>
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b47..4f95d477805c 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
*
*/
#include <linux/config.h>
+#include <linux/in.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -53,6 +54,7 @@
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/ip.h>
+#include <linux/udp.h>
#include <net/checksum.h>
#include <net/udp.h>
#include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8475e8..2a26d167e149 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
context stops packets coming through and allows user context to read
the counters or update the rules.
- To be cache friendly on SMP, we arrange them like so:
- [ n-entries ]
- ... cache-align padding ...
- [ n-entries ]
-
Hence the start of any table is given by get_table() below. */
/* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
unsigned int underflow[NF_IP_NUMHOOKS];
/* ipt_entry tables: one per CPU */
- char entries[0] ____cacheline_aligned;
+ void *entries[NR_CPUS];
};
static LIST_HEAD(ipt_target);
static LIST_HEAD(ipt_match);
static LIST_HEAD(ipt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
#if 0
#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private, smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ipt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct ipt_entry *e
- = (struct ipt_entry *)(newinfo->entries + pos);
+ = (struct ipt_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
goto next;
e = (struct ipt_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct ipt_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
newpos = pos + e->next_offset;
}
e = (struct ipt_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -855,6 +845,7 @@ static int
translate_table(const char *name,
unsigned int valid_hooks,
struct ipt_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
if (ret != 0)
return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks))
+ if (!mark_source_chains(newinfo, valid_hooks, entry0))
return -ELOOP;
/* Finally, each sanity check must pass */
i = 0;
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ IPT_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
#ifdef CONFIG_NETFILTER_DEBUG
{
- struct ipt_entry *table_base;
- unsigned int i;
+ int cpu;
- for_each_cpu(i) {
- table_base =
- (void *)newinfo->entries
- + TABLE_OFFSET(newinfo, i);
-
- table_base->comefrom = 0xdead57ac;
+ for_each_cpu(cpu) {
+ struct ipt_entry *table_base = newinfo->entries[cpu];
+ if (table_base)
+ table_base->comefrom = 0xdead57ac;
}
}
#endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
return 0;
}
+static inline int
+set_entry_to_counter(const struct ipt_entry *e,
+ struct ipt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void
get_counters(const struct ipt_table_info *t,
struct ipt_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ IPT_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
struct ipt_entry *e;
struct ipt_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct ipt_counters) * table->private->number;
- counters = vmalloc(countersize);
+ counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
struct ipt_entry_match *m;
struct ipt_entry_target *t;
- e = (struct ipt_entry *)(table->private->entries + off);
+ e = (struct ipt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
return ret;
}
+static void free_table_info(struct ipt_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+ struct ipt_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
+ if (newinfo->entries[cpu] == 0) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int
do_replace(void __user *user, unsigned int len)
{
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
struct ipt_table *t;
struct ipt_table_info *newinfo, *oldinfo;
struct ipt_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct ipt_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&ipt_mutex);
free_newinfo_counters_untrans:
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
struct ipt_counters_info tmp, *paddc;
struct ipt_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
return -EINVAL;
- paddc = vmalloc(len);
+ paddc = vmalloc_node(len, numa_node_id());
if (!paddc)
return -ENOMEM;
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- IPT_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
struct ipt_table_info *newinfo;
static struct ipt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- memcpy(newinfo->entries, repl->entries, repl->size);
+ /* choose the copy on our node/cpu
+ * but dont care of preemption
+ */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&ipt_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void ipt_unregister_table(struct ipt_table *table)
{
+ void *loc_cpu_entry;
+
down(&ipt_mutex);
LIST_DELETE(&ipt_tables, table);
up(&ipt_mutex);
/* Decrease module usage counts and free resources */
- IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe6..27860510ca6d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
#include <linux/config.h>
#include <linux/types.h>
+#include <linux/inetdevice.h>
#include <linux/ip.h>
#include <linux/timer.h>
#include <linux/module.h>
@@ -18,6 +19,7 @@
#include <net/protocol.h>
#include <net/ip.h>
#include <net/checksum.h>
+#include <net/route.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_nat_rule.h>
#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041fc..03f554857a4d 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
*/
#include <linux/module.h>
+#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4/ipt_physdev.h>
#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db46..39d49dc333a7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
+#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/sock.h>
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a1..e20be3331f67 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
- child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
if (child)
inet_csk_reqsk_queue_add(sk, req, child);
else
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..16984d4a8a06 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
#include <linux/sysctl.h>
#include <linux/config.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -22,6 +23,7 @@
extern int sysctl_ip_nonlocal_bind;
#ifdef CONFIG_SYSCTL
+static int zero;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -614,6 +616,15 @@ ctl_table ipv4_table[] = {
.strategy = &sysctl_jiffies
},
{
+ .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
+ .procname = "ipfrag_max_dist",
+ .data = &sysctl_ipfrag_max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero
+ },
+ {
.ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56d..00aa80e93243 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
int err = 0;
if (level != SOL_TCP)
- return tp->af_specific->setsockopt(sk, level, optname,
- optval, optlen);
+ return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+ optval, optlen);
/* This is a string value all the others are int's */
if (optname == TCP_CONGESTION) {
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
- info->tcpi_pmtu = tp->pmtu_cookie;
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int val, len;
if (level != SOL_TCP)
- return tp->af_specific->getsockopt(sk, level, optname,
- optval, optlen);
+ return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+ optval, optlen);
if (get_user(len, optlen))
return -EFAULT;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 1d0cd86621b1..035f2092d73a 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -30,8 +30,6 @@ static int fast_convergence = 1;
static int max_increment = 16;
static int low_window = 14;
static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
-static int low_utilization_threshold = 153;
-static int low_utilization_period = 2;
static int initial_ssthresh = 100;
static int smooth_part = 20;
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644);
MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
-module_param(low_utilization_threshold, int, 0644);
-MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
-module_param(low_utilization_period, int, 0644);
-MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(smooth_part, int, 0644);
@@ -60,11 +54,6 @@ struct bictcp {
u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
- u32 delay_min; /* min delay */
- u32 delay_max; /* max delay */
- u32 last_delay;
- u8 low_utilization;/* 0: high; 1: low */
- u32 low_utilization_start; /* starting time of low utilization detection*/
u32 epoch_start; /* beginning of an epoch */
#define ACK_RATIO_SHIFT 4
u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
- ca->delay_min = 0;
- ca->delay_max = 0;
- ca->last_delay = 0;
- ca->low_utilization = 0;
- ca->low_utilization_start = 0;
ca->epoch_start = 0;
ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
}
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
/* if in slow start or link utilization is very low */
- if ( ca->loss_cwnd == 0 ||
- (cwnd > ca->loss_cwnd && ca->low_utilization)) {
+ if (ca->loss_cwnd == 0) {
if (ca->cnt > 20) /* increase cwnd 5% per RTT */
ca->cnt = 20;
}
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-
-/* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct sock *sk, int flag)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- struct bictcp *ca = inet_csk_ca(sk);
- u32 dist, delay;
-
- /* No time stamp */
- if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
- /* Discard delay samples right after fast recovery */
- tcp_time_stamp < ca->epoch_start + HZ ||
- /* this delay samples may not be accurate */
- flag == 0) {
- ca->last_delay = 0;
- goto notlow;
- }
-
- delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
- ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- if (delay == 0) /* no previous delay sample */
- goto notlow;
-
- /* first time call or link delay decreases */
- if (ca->delay_min == 0 || ca->delay_min > delay) {
- ca->delay_min = ca->delay_max = delay;
- goto notlow;
- }
-
- if (ca->delay_max < delay)
- ca->delay_max = delay;
-
- /* utilization is low, if avg delay < dist*threshold
- for checking_period time */
- dist = ca->delay_max - ca->delay_min;
- if (dist <= ca->delay_min>>6 ||
- tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
- goto notlow;
-
- if (ca->low_utilization_start == 0) {
- ca->low_utilization = 0;
- ca->low_utilization_start = tcp_time_stamp;
- } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
- > low_utilization_period*HZ) {
- ca->low_utilization = 1;
- }
-
- return;
-
- notlow:
- ca->low_utilization = 0;
- ca->low_utilization_start = 0;
-
-}
-
static void bictcp_cong_avoid(struct sock *sk, u32 ack,
u32 seq_rtt, u32 in_flight, int data_acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- bictcp_low_utilization(sk, data_acked);
-
if (!tcp_is_cwnd_limited(sk, in_flight))
return;
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
- /* in case of wrong delay_max*/
- if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
- ca->delay_max = ca->delay_min
- + ((ca->delay_max - ca->delay_min)* 90) / 100;
-
/* Wmax and fast convergence */
if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
@@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state)
bictcp_reset(inet_csk_ca(sk));
}
-/* Track delayed acknowledgement ratio using sliding window
+/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
static void bictcp_acked(struct sock *sk, u32 cnt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c7cc62c8dc12..e688c687d62d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
+
+/*
+ * Linear increase during slow start
+ */
+void tcp_slow_start(struct tcp_sock *tp)
+{
+ if (sysctl_tcp_abc) {
+ /* RFC3465: Slow Start
+ * TCP sender SHOULD increase cwnd by the number of
+ * previously unacknowledged bytes ACKed by each incoming
+ * acknowledgment, provided the increase is not more than L
+ */
+ if (tp->bytes_acked < tp->mss_cache)
+ return;
+
+ /* We MAY increase by 2 if discovered delayed ack */
+ if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ }
+ }
+ tp->bytes_acked = 0;
+
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000000..31a4986dfbf7
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,411 @@
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
+ *
+ * This is from the implementation of CUBIC TCP in
+ * Injong Rhee, Lisong Xu.
+ * "CUBIC: A New TCP-Friendly High-Speed TCP Variant
+ * in PFLDnet 2005
+ * Available from:
+ * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <asm/div64.h>
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_B 4 /*
+ * In binary search,
+ * go to point (max+min)/N
+ */
+#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
+
+static int fast_convergence = 1;
+static int max_increment = 16;
+static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh = 100;
+static int bic_scale = 41;
+static int tcp_friendliness = 1;
+
+static u32 cube_rtt_scale;
+static u32 beta_scale;
+static u64 cube_factor;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(beta, int, 0444);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+
+#include <asm/div64.h>
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 bic_origin_point;/* origin point of bic function */
+ u32 bic_K; /* time to origin point from the beginning of the current epoch */
+ u32 delay_min; /* min delay */
+ u32 epoch_start; /* beginning of an epoch */
+ u32 ack_cnt; /* number of acks */
+ u32 tcp_cwnd; /* estimated tcp cwnd */
+#define ACK_RATIO_SHIFT 4
+ u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->loss_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->bic_origin_point = 0;
+ ca->bic_K = 0;
+ ca->delay_min = 0;
+ ca->epoch_start = 0;
+ ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+ ca->ack_cnt = 0;
+ ca->tcp_cwnd = 0;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ bictcp_reset(inet_csk_ca(sk));
+ if (initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+ u_int32_t d = divisor;
+
+ if (divisor > 0xffffffffULL) {
+ unsigned int shift = fls(divisor >> 32);
+
+ d = divisor >> shift;
+ dividend >>= shift;
+ }
+
+ /* avoid 64 bit division if possible */
+ if (dividend >> 32)
+ do_div(dividend, d);
+ else
+ dividend = (uint32_t) dividend / d;
+
+ return dividend;
+}
+
+/*
+ * calculate the cubic root of x using Newton-Raphson
+ */
+static u32 cubic_root(u64 a)
+{
+ u32 x, x1;
+
+ /* Initial estimate is based on:
+ * cbrt(x) = exp(log(x) / 3)
+ */
+ x = 1u << (fls64(a)/3);
+
+ /*
+ * Iteration based on:
+ * 2
+ * x = ( 2 * x + a / x ) / 3
+ * k+1 k k
+ */
+ do {
+ x1 = x;
+ x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
+ } while (abs(x1 - x) > 1);
+
+ return x;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+ u64 offs;
+ u32 delta, t, bic_target, min_cnt, max_cnt;
+
+ ca->ack_cnt++; /* count the number of ACKs */
+
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) {
+ ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
+ ca->ack_cnt = 1; /* start counting */
+ ca->tcp_cwnd = cwnd; /* syn with cubic */
+
+ if (ca->last_max_cwnd <= cwnd) {
+ ca->bic_K = 0;
+ ca->bic_origin_point = cwnd;
+ } else {
+ /* Compute new K based on
+ * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+ */
+ ca->bic_K = cubic_root(cube_factor
+ * (ca->last_max_cwnd - cwnd));
+ ca->bic_origin_point = ca->last_max_cwnd;
+ }
+ }
+
+ /* cubic function - calc*/
+ /* calculate c * time^3 / rtt,
+ * while considering overflow in calculation of time^3
+ * (so time^3 is done by using 64 bit)
+ * and without the support of division of 64bit numbers
+ * (so all divisions are done by using 32 bit)
+ * also NOTE the unit of those veriables
+ * time = (t - K) / 2^bictcp_HZ
+ * c = bic_scale >> 10
+ * rtt = (srtt >> 3) / HZ
+ * !!! The following code does not have overflow problems,
+ * if the cwnd < 1 million packets !!!
+ */
+
+ /* change the unit from HZ to bictcp_HZ */
+ t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
+ << BICTCP_HZ) / HZ;
+
+ if (t < ca->bic_K) /* t - K */
+ offs = ca->bic_K - t;
+ else
+ offs = t - ca->bic_K;
+
+ /* c/rtt * (t-K)^3 */
+ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+ if (t < ca->bic_K) /* below origin*/
+ bic_target = ca->bic_origin_point - delta;
+ else /* above origin*/
+ bic_target = ca->bic_origin_point + delta;
+
+ /* cubic function - calc bictcp_cnt*/
+ if (bic_target > cwnd) {
+ ca->cnt = cwnd / (bic_target - cwnd);
+ } else {
+ ca->cnt = 100 * cwnd; /* very small increment*/
+ }
+
+ if (ca->delay_min > 0) {
+ /* max increment = Smax * rtt / 0.1 */
+ min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
+ if (ca->cnt < min_cnt)
+ ca->cnt = min_cnt;
+ }
+
+ /* slow start and low utilization */
+ if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
+ ca->cnt = 50;
+
+ /* TCP Friendly */
+ if (tcp_friendliness) {
+ u32 scale = beta_scale;
+ delta = (cwnd * scale) >> 3;
+ while (ca->ack_cnt > delta) { /* update tcp cwnd */
+ ca->ack_cnt -= delta;
+ ca->tcp_cwnd++;
+ }
+
+ if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
+ delta = ca->tcp_cwnd - cwnd;
+ max_cnt = cwnd / delta;
+ if (ca->cnt > max_cnt)
+ ca->cnt = max_cnt;
+ }
+ }
+
+ ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+ if (ca->cnt == 0) /* cannot be zero */
+ ca->cnt = 1;
+}
+
+
+/* Keep track of minimum rtt */
+static inline void measure_delay(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 delay;
+
+ /* No time stamp */
+ if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
+ /* Discard delay samples right after fast recovery */
+ (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+ return;
+
+ delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ if (delay == 0)
+ delay = 1;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay)
+ ca->delay_min = delay;
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
+ u32 seq_rtt, u32 in_flight, int data_acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (data_acked)
+ measure_delay(sk);
+
+ if (!tcp_is_cwnd_limited(sk, in_flight))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
+ bictcp_update(ca, tp->snd_cwnd);
+
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+ */
+ if (tp->snd_cwnd_cnt >= ca->cnt) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+
+}
+
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
+}
+
+static u32 bictcp_min_cwnd(struct sock *sk)
+{
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss)
+ bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+ ca->delayed_ack += cnt;
+ }
+}
+
+
+static struct tcp_congestion_ops cubictcp = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .min_cwnd = bictcp_min_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "cubic",
+};
+
+static int __init cubictcp_register(void)
+{
+ BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+ /* Precompute a bunch of the scaling factors that are used per-packet
+ * based on SRTT of 100ms
+ */
+
+ beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+
+ cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */
+
+ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+ * so K = cubic_root( (wmax-cwnd)*rtt/c )
+ * the unit of K is bictcp_HZ=2^10, not HZ
+ *
+ * c = bic_scale >> 10
+ * rtt = 100ms
+ *
+ * the following code has been designed and tested for
+ * cwnd < 1 million packets
+ * RTT < 100 seconds
+ * HZ < 1,000,00 (corresponding to 10 nano-second)
+ */
+
+ /* 1/c * 2^2*bictcp_HZ * srtt */
+ cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+
+ /* divide by bic_scale and by constant Srtt (100ms) */
+ do_div(cube_factor, bic_scale * 10);
+
+ return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&cubictcp);
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bce..0a461232329f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1;
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
-static inline void tcp_measure_rcv_mss(struct sock *sk,
- const struct sk_buff *skb)
+static void tcp_measure_rcv_mss(struct sock *sk,
+ const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const unsigned int lss = icsk->icsk_ack.last_seg_size;
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
return 0;
}
-static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
{
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is an our guess about MSS used by the peer.
+ * We haven't any direct information about the MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACKing less frequently than needed.
+ * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+ hint = min(hint, tp->rcv_wnd/2);
+ hint = min(hint, TCP_MIN_RCVMSS);
+ hint = max(hint, TCP_MIN_MSS);
+
+ inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
+/* Set slow start threshold and cwnd not falling to slow start */
+void tcp_enter_cwr(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->prior_ssthresh = 0;
+ tp->bytes_acked = 0;
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ tp->undo_marker = 0;
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp) + 1U);
+ tp->snd_cwnd_cnt = 0;
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ TCP_ECN_queue_cwr(tp);
+
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ }
+}
+
/* Initialize metrics on socket. */
static void tcp_init_metrics(struct sock *sk)
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
-static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
- u32 in_flight, int good)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+ u32 in_flight, int good)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
* RFC2988 recommends to restart timer to now+rto.
*/
-static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return acked;
}
-static inline u32 tcp_usrtt(const struct sk_buff *skb)
+static u32 tcp_usrtt(const struct sk_buff *skb)
{
struct timeval tv, now;
@@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
if (nwin > tp->max_window) {
tp->max_window = nwin;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp)
+static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
+ struct tcp_sock *tp)
{
if (th->doff == sizeof(struct tcphdr)>>2) {
tp->rx_opt.saw_tstamp = 0;
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
}
}
-static __inline__ int
-tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
return 0;
}
-static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
{
if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
if (before(seq, tp->rcv_nxt))
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
}
}
-static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
{
if (!tp->rx_opt.dsack)
tcp_dsack_set(tp, seq, end_seq);
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
}
}
-static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
{
__u32 tmp;
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
+static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
{
/* If the user specified a specific send buffer setting, do
* not modify it.
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk)
sk->sk_write_space(sk);
}
-static inline void tcp_check_space(struct sock *sk)
+static void tcp_check_space(struct sock *sk)
{
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk)
}
}
-static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
+static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
{
tcp_push_pending_frames(sk, tp);
tcp_check_space(sk);
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
}
}
-static __inline__ void tcp_ack_snd_check(struct sock *sk)
+static inline void tcp_ack_snd_check(struct sock *sk)
{
if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
return result;
}
-static __inline__ int
-tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
{
return skb->ip_summed != CHECKSUM_UNNECESSARY &&
__tcp_checksum_complete_user(sk, skb);
@@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int saved_clamp = tp->rx_opt.mss_clamp;
tcp_parse_options(skb, &tp->rx_opt, 0);
if (th->ack) {
- struct inet_connection_sock *icsk;
/* rfc793:
* "If the state is SYN-SENT then
* first check the ACK bit
@@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
tp->rx_opt.sack_ok |= 2;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
/* Remember, tcp_poll() does not lock socket!
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_set_state(sk, TCP_ESTABLISHED);
/* Make sure socket is routed, for correct metrics. */
- tp->af_specific->rebuild_header(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
@@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
sk_wake_async(sk, 0, POLL_OUT);
}
- icsk = inet_csk(sk);
-
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
icsk->icsk_ack.pingpong) {
@@ -4173,7 +4210,7 @@ discard:
if (tp->ecn_flags&TCP_ECN_OK)
sock_set_flag(sk, SOCK_NO_LARGESEND);
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
if(th->syn) {
- if(tp->af_specific->conn_request(sk, skb) < 0)
+ if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
@@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Make sure socket is routed, for
* correct metrics.
*/
- tp->af_specific->rebuild_header(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
@@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc);
EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929b..e9f83e5b28ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <linux/inet.h>
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency;
/* Socket used for sending RSTs */
static struct socket *tcp_socket;
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
- struct sk_buff *skb);
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.lhash_lock = RW_LOCK_UNLOCKED,
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
- return inet_csk_get_port(&tcp_hashinfo, sk, snum);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+ inet_csk_bind_conflict);
}
static void tcp_v4_hash(struct sock *sk)
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
skb->h.th->source);
}
-/* called with local bh disabled */
-static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
- struct inet_timewait_sock **twp)
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
- struct inet_sock *inet = inet_sk(sk);
- u32 daddr = inet->rcv_saddr;
- u32 saddr = inet->daddr;
- int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
- struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* With PAWS, it is safe from the viewpoint
- of data integrity. Even without PAWS it
- is safe provided sequence spaces do not
- overlap i.e. at data rates <= 80Mbit/sec.
-
- Actually, the idea is close to VJ's one,
- only timestamp cache is held not per host,
- but per port pair and TW bucket is used
- as state holder.
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+ struct tcp_sock *tp = tcp_sk(sk);
- If TW bucket has been already destroyed we
- fall back to VJ's scheme and use initial
- timestamp retrieved from peer table.
- */
- if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
- xtime.tv_sec -
- tcptw->tw_ts_recent_stamp > 1))) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- sock_hold(sk2);
- goto unique;
- } else
- goto not_unique;
- }
- }
- tw = NULL;
+ /* With PAWS, it is safe from the viewpoint
+ of data integrity. Even without PAWS it is safe provided sequence
+ spaces do not overlap i.e. at data rates <= 80Mbit/sec.
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
- goto not_unique;
- }
+ Actually, the idea is close to VJ's one, only timestamp cache is
+ held not per host, but per port pair and TW bucket is used as state
+ holder.
-unique:
- /* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
- sk->sk_hash = hash;
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
-
- if (twp) {
- *twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw) {
- /* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, &tcp_death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
+ If TW bucket has been already destroyed we fall back to VJ's scheme
+ and use initial timestamp retrieved from peer table.
+ */
+ if (tcptw->tw_ts_recent_stamp &&
+ (twp == NULL || (sysctl_tcp_tw_reuse &&
+ xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ sock_hold(sktw);
+ return 1;
}
return 0;
-
-not_unique:
- write_unlock(&head->lock);
- return -EADDRNOTAVAIL;
}
-static inline u32 connect_port_offset(const struct sock *sk)
-{
- const struct inet_sock *inet = inet_sk(sk);
-
- return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
- inet->dport);
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static inline int tcp_v4_hash_connect(struct sock *sk)
-{
- const unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int range = high - low;
- int i;
- int port;
- static u32 hint;
- u32 offset = hint + connect_port_offset(sk);
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
- head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__tcp_v4_check_established(sk,
- port,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- }
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- hint += i;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __inet_hash(&tcp_hashinfo, sk, 0);
- }
- spin_unlock(&head->lock);
-
- if (tw) {
- inet_twsk_deschedule(tw, &tcp_death_row);;
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash(&tcp_hashinfo, sk, 0);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __tcp_v4_check_established(sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->dport = usin->sin_port;
inet->daddr = daddr;
- tp->ext_header_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
- tp->ext_header_len = inet->opt->optlen;
+ inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = 536;
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
- err = tcp_v4_hash_connect(sk);
+ err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure;
@@ -433,12 +270,10 @@ failure:
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
-static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
- u32 mtu)
+static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
* send out by Linux are always <576bytes so they should go through
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- tp->pmtu_cookie > mtu) {
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
/* Resend the TCP packet because it's
@@ -644,10 +479,10 @@ out:
}
/* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
- struct sk_buff *skb)
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
struct inet_sock *inet = inet_sk(sk);
+ struct tcphdr *th = skb->h.th;
if (skb->ip_summed == CHECKSUM_HW) {
th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-static inline void syn_flood_warning(struct sk_buff *skb)
+#ifdef CONFIG_SYN_COOKIES
+static void syn_flood_warning(struct sk_buff *skb)
{
static unsigned long warntime;
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb)
ntohs(skb->h.th->dest));
}
}
+#endif
/*
* Save and compile IPv4 options into the request_sock if needed.
*/
-static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
- struct sk_buff *skb)
+static struct ip_options *tcp_v4_save_options(struct sock *sk,
+ struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
struct ip_options *dopt = NULL;
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = {
.send_reset = tcp_v4_send_reset,
};
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+};
+
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = skb->nh.iph->ttl;
- newtp->ext_header_len = 0;
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newinet->opt)
- newtp->ext_header_len = newinet->opt->optlen;
+ inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
newinet->id = newtp->write_seq ^ jiffies;
tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1314,16 +1156,6 @@ do_time_wait:
goto discard_it;
}
-static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
- struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
-
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->daddr;
- sin->sin_port = inet->dport;
-}
-
/* VJ's idea. Save last timestamp seen from this destination
* and hold it at least for normal timewait interval to use for duplicate
* segment detection in subsequent connections, before they enter synchronized
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
return 0;
}
-struct tcp_func ipv4_specific = {
+struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = {
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
- .addr2sockaddr = v4_addr2sockaddr,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
};
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk)
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
- tp->af_specific = &ipv4_specific;
+ icsk->icsk_af_ops = &ipv4_specific;
+ icsk->icsk_sync_mss = tcp_sync_mss;
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
- .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
};
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac4321..2b9b7f6c7f7c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
int recycle_ok = 0;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
- recycle_ok = tp->af_specific->remember_stamp(sk);
+ recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
tw = inet_twsk_alloc(sk, state);
if (tw != NULL) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
- const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+ struct inet6_timewait_sock *tw6;
- ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+ tw6 = inet6_twsk((struct sock *)tw);
+ ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
struct request_sock **prev)
{
struct tcphdr *th = skb->h.th;
- struct tcp_sock *tp = tcp_sk(sk);
u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
struct tcp_options_received tmp_opt;
@@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
- child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
+ req, NULL);
if (child == NULL)
goto listen_overflow;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406a..a7623ead39a8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1;
*/
int sysctl_tcp_tso_win_divisor = 3;
-static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static void update_send_head(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
{
sk->sk_send_head = skb->next;
if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
tp->snd_cwnd_used = 0;
}
-static inline void tcp_event_data_sent(struct tcp_sock *tp,
- struct sk_buff *skb, struct sock *sk)
+static void tcp_event_data_sent(struct tcp_sock *tp,
+ struct sk_buff *skb, struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
icsk->icsk_ack.pingpong = 1;
}
-static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
-static __inline__ u16 tcp_select_window(struct sock *sk)
+static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 cur_win = tcp_receive_window(tp);
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
return new_win;
}
+static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
+ __u32 tstamp)
+{
+ if (tp->rx_opt.tstamp_ok) {
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ *ptr++ = htonl(tstamp);
+ *ptr++ = htonl(tp->rx_opt.ts_recent);
+ }
+ if (tp->rx_opt.eff_sacks) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
+ int this_sack;
+
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+ TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+ if (tp->rx_opt.dsack) {
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.eff_sacks--;
+ }
+ }
+}
+
+/* Construct a tcp options header for a SYN or SYN_ACK packet.
+ * If this is every changed make sure to change the definition of
+ * MAX_SYN_SIZE to match the new maximum number of options that you
+ * can generate.
+ */
+static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
+ int offer_wscale, int wscale, __u32 tstamp,
+ __u32 ts_recent)
+{
+ /* We always get an MSS option.
+ * The option bytes which will be seen in normal data
+ * packets should timestamps be used, must be in the MSS
+ * advertised. But we subtract them from tp->mss_cache so
+ * that calculations in tcp_sendmsg are simpler etc.
+ * So account for this fact here if necessary. If we
+ * don't do this correctly, as a receiver we won't
+ * recognize data packets as being full sized when we
+ * should, and thus we won't abide by the delayed ACK
+ * rules correctly.
+ * SACKs don't matter, we never delay an ACK when we
+ * have any of those going out.
+ */
+ *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
+ if (ts) {
+ if(sack)
+ *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ else
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ *ptr++ = htonl(tstamp); /* TSVAL */
+ *ptr++ = htonl(ts_recent); /* TSECR */
+ } else if(sack)
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
+ if (offer_wscale)
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
+}
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
@@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ECN_send(sk, tp, skb, tcp_header_size);
}
- tp->af_specific->send_check(sk, th, skb->len, skb);
+ icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (likely(tcb->flags & TCPCB_FLAG_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_INC_STATS(TCP_MIB_OUTSEGS);
- err = tp->af_specific->queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(skb, 0);
if (unlikely(err <= 0))
return err;
@@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
- tp->pmtu_cookie is last pmtu, seen by this function.
+ inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
tp->mss_cache is current effective sending mss, including
all tcp options except for SACKs. It is evaluated,
@@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
NOTE1. rfc1122 clearly states that advertised MSS
DOES NOT include either tcp or ip options.
- NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
- this function. --ANK (980731)
+ NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+ are READ ONLY outside this function. --ANK (980731)
*/
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mss_now;
-
+ struct inet_connection_sock *icsk = inet_csk(sk);
/* Calculate base mss without TCP options:
It is MMS_S - sizeof(tcphdr) of rfc1122
*/
- mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+ int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
+ sizeof(struct tcphdr));
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
/* Now subtract optional transport overhead */
- mss_now -= tp->ext_header_len;
+ mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
@@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
/* And store cached results */
- tp->pmtu_cookie = pmtu;
+ icsk->icsk_pmtu_cookie = pmtu;
tp->mss_cache = mss_now;
return mss_now;
@@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
if (dst) {
u32 mtu = dst_mtu(dst);
- if (mtu != tp->pmtu_cookie)
+ if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
@@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
xmit_size_goal = mss_now;
if (doing_tso) {
- xmit_size_goal = 65535 -
- tp->af_specific->net_header_len -
- tp->ext_header_len - tp->tcp_header_len;
+ xmit_size_goal = (65535 -
+ inet_csk(sk)->icsk_af_ops->net_header_len -
+ inet_csk(sk)->icsk_ext_hdr_len -
+ tp->tcp_header_len);
if (tp->max_window &&
(xmit_size_goal > (tp->max_window >> 1)))
@@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
/* Congestion window validation. (RFC2861) */
-static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
{
__u32 packets_out = tp->packets_out;
@@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
@@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
(sysctl_tcp_retrans_collapse != 0))
tcp_retrans_try_collapse(sk, skb, cur_mss);
- if(tp->af_specific->rebuild_header(sk))
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
/* Some Solaris stacks overoptimize and ignore the FIN on a
@@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13e7e6e8df16..3b7403495052 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
vegas->cntRTT = 0;
vegas->minRTT = 0x7fffffff;
}
+ /* Use normal slow start */
+ else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195d..223abaa72bc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
+#include <linux/igmp.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
@@ -846,20 +847,7 @@ out:
csum_copy_err:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
- /* Clear queue. */
- if (flags&MSG_PEEK) {
- int clear = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- clear = 1;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- if (clear)
- kfree_skb(skb);
- }
-
- skb_free_datagram(sk, skb);
+ skb_kill_datagram(sk, skb, flags);
if (noblock)
return -EAGAIN;
@@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
* Otherwise, csum completion requires chacksumming packet body,
* including udp header and folding it to skb->csum.
*/
-static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
unsigned short ulen, u32 saddr, u32 daddr)
{
if (uh->check == 0) {
@@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
/* Probably, we should checksum udp header (it should be in cache
* in any case) and data in tiny packets (< rx copybreak).
*/
- return 0;
}
/*
@@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb)
if (pskb_trim_rcsum(skb, ulen))
goto short_packet;
- if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
- goto csum_error;
+ udp_checksum_init(skb, uh, ulen, saddr, daddr);
if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6460eec834b7..9601fd7f9d66 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
- ip6_flowlabel.o ipv6_syms.o netfilter.o
+ ip6_flowlabel.o ipv6_syms.o netfilter.o \
+ inet6_connection_sock.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
xfrm6_output.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a60585fd85ad..704fb73e6c5f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
{
const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
- const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
+ const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
int sk_ipv6only = ipv6_only_sock(sk);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d9546380fa04..68afc53be662 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,6 +167,7 @@ lookup_protocol:
sk->sk_reuse = 1;
inet = inet_sk(sk);
+ inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
@@ -389,6 +390,8 @@ int inet6_destroy_sock(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+
/*
* This does both peername and sockname.
*/
@@ -431,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
- int err = -EINVAL;
switch(cmd)
{
@@ -450,16 +452,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCSIFDSTADDR:
return addrconf_set_dstaddr((void __user *) arg);
default:
- if (!sk->sk_prot->ioctl ||
- (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD)
- return(dev_ioctl(cmd,(void __user *) arg));
- return err;
+ if (!sk->sk_prot->ioctl)
+ return -ENOIOCTLCMD;
+ return sk->sk_prot->ioctl(sk, cmd, arg);
}
/*NOTREACHED*/
return(0);
}
-struct proto_ops inet6_stream_ops = {
+const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -480,7 +481,7 @@ struct proto_ops inet6_stream_ops = {
.sendpage = tcp_sendpage
};
-struct proto_ops inet6_dgram_ops = {
+const struct proto_ops inet6_dgram_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -508,7 +509,7 @@ static struct net_proto_family inet6_family_ops = {
};
/* Same as inet6_dgram_ops, sans udp_poll. */
-static struct proto_ops inet6_sockraw_ops = {
+static const struct proto_ops inet6_sockraw_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -609,6 +610,79 @@ inet6_unregister_protosw(struct inet_protosw *p)
}
}
+int inet6_sk_rebuild_header(struct sock *sk)
+{
+ int err;
+ struct dst_entry *dst;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.fl6_flowlabel = np->flow_label;
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet->dport;
+ fl.fl_ip_sport = inet->sport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ err = ip6_dst_lookup(sk, &dst, &fl);
+ if (err) {
+ sk->sk_route_caps = 0;
+ return err;
+ }
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_err_soft = -err;
+ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
+
+int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
+ if (np->rxopt.all) {
+ if ((opt->hop && (np->rxopt.bits.hopopts ||
+ np->rxopt.bits.ohopopts)) ||
+ ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) &&
+ np->rxopt.bits.rxflow) ||
+ (opt->srcrt && (np->rxopt.bits.srcrt ||
+ np->rxopt.bits.osrcrt)) ||
+ ((opt->dst1 || opt->dst0) &&
+ (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
+ return 1;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
+
int
snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
{
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f3629730eb15..13cc7f895583 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <net/icmp.h>
#include <net/ipv6.h>
+#include <net/protocol.h>
#include <net/xfrm.h>
#include <asm/scatterlist.h>
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8bfbe9970793..6de8ee1a5ad9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
#include <linux/random.h>
#include <net/icmp.h>
#include <net/ipv6.h>
+#include <net/protocol.h>
#include <linux/icmpv6.h>
static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index be6faf311387..113374dc342c 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
return opt;
}
+EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
+
/**********************************
Hop-by-hop options.
**********************************/
@@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
return opt2;
}
+EXPORT_SYMBOL_GPL(ipv6_dup_options);
+
static int ipv6_renew_option(void *ohdr,
struct ipv6_opt_hdr __user *newopt, int newoptlen,
int inherit,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 000000000000..792f90f0f9ec
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,199 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET6 connection oriented protocols.
+ *
+ * Authors: See the TCPv6 sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+
+#include <net/addrconf.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/inet_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/sock.h>
+
+int inet6_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
+{
+ const struct sock *sk2;
+ const struct hlist_node *node;
+
+ /* We must walk the whole port owner list in this case. -DaveM */
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+ (!sk->sk_reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ ipv6_rcv_saddr_equal(sk, sk2))
+ break;
+ }
+
+ return node != NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
+
+/*
+ * request_sock (formerly open request) hash tables.
+ */
+static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport,
+ const u32 rnd, const u16 synq_hsize)
+{
+ u32 a = raddr->s6_addr32[0];
+ u32 b = raddr->s6_addr32[1];
+ u32 c = raddr->s6_addr32[2];
+
+ a += JHASH_GOLDEN_RATIO;
+ b += JHASH_GOLDEN_RATIO;
+ c += rnd;
+ __jhash_mix(a, b, c);
+
+ a += raddr->s6_addr32[3];
+ b += (u32)rport;
+ __jhash_mix(a, b, c);
+
+ return c & (synq_hsize - 1);
+}
+
+struct request_sock *inet6_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __u16 rport,
+ const struct in6_addr *raddr,
+ const struct in6_addr *laddr,
+ const int iif)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req, **prev;
+
+ for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport,
+ lopt->hash_rnd,
+ lopt->nr_table_entries)];
+ (req = *prev) != NULL;
+ prev = &req->dl_next) {
+ const struct inet6_request_sock *treq = inet6_rsk(req);
+
+ if (inet_rsk(req)->rmt_port == rport &&
+ req->rsk_ops->family == AF_INET6 &&
+ ipv6_addr_equal(&treq->rmt_addr, raddr) &&
+ ipv6_addr_equal(&treq->loc_addr, laddr) &&
+ (!treq->iif || treq->iif == iif)) {
+ BUG_TRAP(req->sk == NULL);
+ *prevp = prev;
+ return req;
+ }
+ }
+
+ return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_search_req);
+
+void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+ struct request_sock *req,
+ const unsigned long timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
+ inet_rsk(req)->rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+
+void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
+
+ sin6->sin6_family = AF_INET6;
+ ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
+ sin6->sin6_port = inet_sk(sk)->dport;
+ /* We do not store received flowlabel for TCP */
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = 0;
+ if (sk->sk_bound_dev_if &&
+ ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ sin6->sin6_scope_id = sk->sk_bound_dev_if;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+
+int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct flowi fl;
+ struct dst_entry *dst;
+ struct in6_addr *final_p = NULL, final;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.fl6_flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_sport = inet->sport;
+ fl.fl_ip_dport = inet->dport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+ int err = ip6_dst_lookup(sk, &dst, &fl);
+
+ if (err) {
+ sk->sk_err_soft = -err;
+ return err;
+ }
+
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_route_caps = 0;
+ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+ }
+
+ skb->dst = dst_clone(dst);
+
+ /* Restore final destination back after routing done */
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+
+ return ip6_xmit(sk, skb, &fl, np->opt, 0);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 01d5f46d4e40..4154f3a8b6cf 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -5,7 +5,8 @@
*
* Generic INET6 transport hashtables
*
- * Authors: Lotsa people, from code originally in tcp
+ * Authors: Lotsa people, from code originally in tcp, generalised here
+ * by Arnaldo Carvalho de Melo <acme@mandriva.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,12 +15,13 @@
*/
#include <linux/config.h>
-
#include <linux/module.h>
+#include <linux/random.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
+#include <net/ip.h>
struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
const struct in6_addr *daddr,
@@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
}
EXPORT_SYMBOL_GPL(inet6_lookup);
+
+static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct in6_addr *daddr = &np->rcv_saddr;
+ const struct in6_addr *saddr = &np->daddr;
+ const int dif = sk->sk_bound_dev_if;
+ const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
+ inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
+ prefetch(head->chain.first);
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+ const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
+
+ tw = inet_twsk(sk2);
+
+ if(*((__u32 *)&(tw->tw_dport)) == ports &&
+ sk2->sk_family == PF_INET6 &&
+ ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+ goto not_unique;
+ }
+ }
+ tw = NULL;
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+unique:
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sk->sk_hash = hash;
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp != NULL) {
+ *twp = tw;
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+ } else if (tw != NULL) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw, death_row);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ inet_twsk_put(tw);
+ }
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet6_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
+ np->daddr.s6_addr32,
+ inet->dport);
+}
+
+int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ if (snum == 0) {
+ const int low = sysctl_local_port_range[0];
+ const int high = sysctl_local_port_range[1];
+ const int range = high - low;
+ int i, port;
+ static u32 hint;
+ const u32 offset = hint + inet6_sk_port_offset(sk);
+ struct hlist_node *node;
+ struct inet_timewait_sock *tw = NULL;
+
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+ head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (tb->port == port) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__inet6_check_established(death_row,
+ sk, port,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->sport = htons(port);
+ __inet6_hash(hinfo, sk);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ inet_twsk_deschedule(tw, death_row);
+ inet_twsk_put(tw);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+
+ if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
+ __inet6_hash(hinfo, sk);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = __inet6_check_established(death_row, sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1cf02765fb5c..89d12b4817a9 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label)
return NULL;
}
+EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+
void fl6_free_socklist(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8523c76ebf76..b4c4beba0ede 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -775,6 +775,8 @@ out_err_release:
return err;
}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
static inline int ip6_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 55917fb17094..626dd39685f2 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -47,6 +47,7 @@
#include <linux/rtnetlink.h>
#include <net/icmp.h>
#include <net/ipv6.h>
+#include <net/protocol.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3620718defe6..c63868dd2ca2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
sk_refcnt_debug_dec(sk);
if (sk->sk_protocol == IPPROTO_TCP) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
local_bh_disable();
sock_prot_dec_use(sk->sk_prot);
sock_prot_inc_use(&tcp_prot);
local_bh_enable();
sk->sk_prot = &tcp_prot;
- tp->af_specific = &ipv4_specific;
+ icsk->icsk_af_ops = &ipv4_specific;
sk->sk_socket->ops = &inet_stream_ops;
sk->sk_family = PF_INET;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
local_bh_disable();
sock_prot_dec_use(sk->sk_prot);
@@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
}
retv = 0;
- if (sk->sk_type == SOCK_STREAM) {
+ if (inet_sk(sk)->is_icsk) {
if (opt) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!((1 << sk->sk_state) &
(TCPF_LISTEN | TCPF_CLOSE))
&& inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
- tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ icsk->icsk_ext_hdr_len =
+ opt->opt_flen + opt->opt_nflen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
opt = xchg(&np->opt, opt);
@@ -380,14 +381,15 @@ sticky_done:
goto done;
update:
retv = 0;
- if (sk->sk_type == SOCK_STREAM) {
+ if (inet_sk(sk)->is_icsk) {
if (opt) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!((1 << sk->sk_state) &
(TCPF_LISTEN | TCPF_CLOSE))
&& inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
- tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ icsk->icsk_ext_hdr_len =
+ opt->opt_flen + opt->opt_nflen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
opt = xchg(&np->opt, opt);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f829a4ad3ccc..1cf305a9f8dd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
mc_lst->ifindex = dev->ifindex;
mc_lst->sfmode = MCAST_EXCLUDE;
- mc_lst->sflock = RW_LOCK_UNLOCKED;
+ rwlock_init(&mc_lst->sflock);
mc_lst->sflist = NULL;
/*
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 95d469271c4d..ea43ef1d94a7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -15,6 +15,7 @@
* - new extension header parser code
*/
#include <linux/config.h>
+#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/kmod.h>
#include <linux/vmalloc.h>
@@ -86,11 +87,6 @@ static DECLARE_MUTEX(ip6t_mutex);
context stops packets coming through and allows user context to read
the counters or update the rules.
- To be cache friendly on SMP, we arrange them like so:
- [ n-entries ]
- ... cache-align padding ...
- [ n-entries ]
-
Hence the start of any table is given by get_table() below. */
/* The table itself */
@@ -108,20 +104,15 @@ struct ip6t_table_info
unsigned int underflow[NF_IP6_NUMHOOKS];
/* ip6t_entry tables: one per CPU */
- char entries[0] ____cacheline_aligned;
+ void *entries[NR_CPUS];
};
static LIST_HEAD(ip6t_target);
static LIST_HEAD(ip6t_match);
static LIST_HEAD(ip6t_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
#if 0
#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -376,8 +367,7 @@ ip6t_do_table(struct sk_buff **pskb,
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private, smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -649,7 +639,8 @@ unconditional(const struct ip6t_ip6 *ipv6)
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ip6t_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -658,7 +649,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct ip6t_entry *e
- = (struct ip6t_entry *)(newinfo->entries + pos);
+ = (struct ip6t_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -708,13 +699,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
goto next;
e = (struct ip6t_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct ip6t_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -731,7 +722,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
newpos = pos + e->next_offset;
}
e = (struct ip6t_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -941,6 +932,7 @@ static int
translate_table(const char *name,
unsigned int valid_hooks,
struct ip6t_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -961,11 +953,11 @@ translate_table(const char *name,
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
if (ret != 0)
return ret;
@@ -993,27 +985,24 @@ translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks))
+ if (!mark_source_chains(newinfo, valid_hooks, entry0))
return -ELOOP;
/* Finally, each sanity check must pass */
i = 0;
- ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ IP6T_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -1029,15 +1018,12 @@ replace_table(struct ip6t_table *table,
#ifdef CONFIG_NETFILTER_DEBUG
{
- struct ip6t_entry *table_base;
- unsigned int i;
+ int cpu;
- for_each_cpu(i) {
- table_base =
- (void *)newinfo->entries
- + TABLE_OFFSET(newinfo, i);
-
- table_base->comefrom = 0xdead57ac;
+ for_each_cpu(cpu) {
+ struct ip6t_entry *table_base = newinfo->entries[cpu];
+ if (table_base)
+ table_base->comefrom = 0xdead57ac;
}
}
#endif
@@ -1072,16 +1058,44 @@ add_entry_to_counter(const struct ip6t_entry *e,
return 0;
}
+static inline int
+set_entry_to_counter(const struct ip6t_entry *e,
+ struct ip6t_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void
get_counters(const struct ip6t_table_info *t,
struct ip6t_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ IP6T_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ IP6T_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -1098,6 +1112,7 @@ copy_entries_to_user(unsigned int total_size,
struct ip6t_entry *e;
struct ip6t_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -1109,13 +1124,13 @@ copy_entries_to_user(unsigned int total_size,
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ /* choose the copy that is on ourc node/cpu */
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -1127,7 +1142,7 @@ copy_entries_to_user(unsigned int total_size,
struct ip6t_entry_match *m;
struct ip6t_entry_target *t;
- e = (struct ip6t_entry *)(table->private->entries + off);
+ e = (struct ip6t_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct ip6t_entry, counters),
&counters[num],
@@ -1196,6 +1211,46 @@ get_entries(const struct ip6t_get_entries *entries,
return ret;
}
+static void free_table_info(struct ip6t_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct ip6t_table_info *alloc_table_info(unsigned int size)
+{
+ struct ip6t_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size,
+ cpu_to_node(cpu));
+ if (newinfo->entries[cpu] == NULL) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int
do_replace(void __user *user, unsigned int len)
{
@@ -1204,6 +1259,7 @@ do_replace(void __user *user, unsigned int len)
struct ip6t_table *t;
struct ip6t_table_info *newinfo, *oldinfo;
struct ip6t_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1212,13 +1268,13 @@ do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct ip6t_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1229,10 +1285,9 @@ do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -1271,8 +1326,9 @@ do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1284,11 +1340,11 @@ do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&ip6t_mutex);
free_newinfo_counters_untrans:
- IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1321,6 +1377,7 @@ do_add_counters(void __user *user, unsigned int len)
struct ip6t_counters_info tmp, *paddc;
struct ip6t_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1350,7 +1407,9 @@ do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- IP6T_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[smp_processor_id()];
+ IP6T_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1543,28 +1602,29 @@ int ip6t_register_table(struct ip6t_table *table,
struct ip6t_table_info *newinfo;
static struct ip6t_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct ip6t_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- memcpy(newinfo->entries, repl->entries, repl->size);
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&ip6t_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1593,20 +1653,23 @@ int ip6t_register_table(struct ip6t_table *table,
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void ip6t_unregister_table(struct ip6t_table *table)
{
+ void *loc_cpu_entry;
+
down(&ip6t_mutex);
LIST_DELETE(&ip6t_tables, table);
up(&ip6t_mutex);
/* Decrease module usage counts and free resources */
- IP6T_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0cd1d1bd9033..ae4653bfd654 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/spinlock.h>
#include <linux/icmpv6.h>
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index dde37793d20b..268918d5deea 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 24bc0cde43a1..65937de1b58c 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c2c52af9e560..f3e5ffbd592f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -98,7 +98,7 @@ struct nf_ct_frag6_queue
#define FRAG6Q_HASHSZ 64
static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
-static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(nf_ct_frag6_lock);
static u32 nf_ct_frag6_hash_rnd;
static LIST_HEAD(nf_ct_frag6_lru_list);
int nf_ct_frag6_nqueues = 0;
@@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct
init_timer(&fq->timer);
fq->timer.function = nf_ct_frag6_expire;
fq->timer.data = (long) fq;
- fq->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&fq->lock);
atomic_set(&fq->refcnt, 1);
return nf_ct_frag6_intern(hash, fq);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a66900cda2af..66f1d12ea578 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -32,6 +32,7 @@
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/bug.h>
@@ -433,25 +434,14 @@ out:
return err;
csum_copy_err:
- /* Clear queue. */
- if (flags&MSG_PEEK) {
- int clear = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- clear = 1;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- if (clear)
- kfree_skb(skb);
- }
+ skb_kill_datagram(sk, skb, flags);
/* Error for blocking case is chosen to masquerade
as some normal condition.
*/
err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
/* FIXME: increment a raw6 drops counter here */
- goto out_free;
+ goto out;
}
static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8827389abaf7..2947bc56d8a0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -48,6 +48,7 @@
#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
+#include <net/inet6_connection_sock.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -59,6 +60,7 @@
#include <net/addrconf.h>
#include <net/snmp.h>
#include <net/dsfield.h>
+#include <net/timewait_sock.h>
#include <asm/uaccess.h>
@@ -67,224 +69,33 @@
static void tcp_v6_send_reset(struct sk_buff *skb);
static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
-static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
+static void tcp_v6_send_check(struct sock *sk, int len,
struct sk_buff *skb);
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
-static struct tcp_func ipv6_mapped;
-static struct tcp_func ipv6_specific;
+static struct inet_connection_sock_af_ops ipv6_mapped;
+static struct inet_connection_sock_af_ops ipv6_specific;
-static inline int tcp_v6_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
-{
- const struct sock *sk2;
- const struct hlist_node *node;
-
- /* We must walk the whole port owner list in this case. -DaveM */
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
- (!sk->sk_reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- ipv6_rcv_saddr_equal(sk, sk2))
- break;
- }
-
- return node != NULL;
-}
-
-/* Grrr, addr_type already calculated by caller, but I don't want
- * to add some silly "cookie" argument to this method just for that.
- * But it doesn't matter, the recalculation is in the rarest path
- * this function ever takes.
- */
static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
{
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- struct hlist_node *node;
- int ret;
-
- local_bh_disable();
- if (snum == 0) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover = net_random() % (high - low) + low;
-
- do {
- head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == rover)
- goto next;
- break;
- next:
- spin_unlock(&head->lock);
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (unlikely(remaining <= 0))
- goto fail;
-
- /* OK, here is the one we will use. */
- snum = rover;
- } else {
- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
-tb_found:
- if (tb && !hlist_empty(&tb->owners)) {
- if (tb->fastreuse > 0 && sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN) {
- goto success;
- } else {
- ret = 1;
- if (tcp_v6_bind_conflict(sk, tb))
- goto fail_unlock;
- }
- }
-tb_not_found:
- ret = 1;
- if (tb == NULL) {
- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
- if (tb == NULL)
- goto fail_unlock;
- }
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
-
-success:
- if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, snum);
- BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
- ret = 0;
-
-fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
- return ret;
-}
-
-static __inline__ void __tcp_v6_hash(struct sock *sk)
-{
- struct hlist_head *list;
- rwlock_t *lock;
-
- BUG_TRAP(sk_unhashed(sk));
-
- if (sk->sk_state == TCP_LISTEN) {
- list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &tcp_hashinfo.lhash_lock;
- inet_listen_wlock(&tcp_hashinfo);
- } else {
- unsigned int hash;
- sk->sk_hash = hash = inet6_sk_ehashfn(sk);
- hash &= (tcp_hashinfo.ehash_size - 1);
- list = &tcp_hashinfo.ehash[hash].chain;
- lock = &tcp_hashinfo.ehash[hash].lock;
- write_lock(lock);
- }
-
- __sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+ inet6_csk_bind_conflict);
}
-
static void tcp_v6_hash(struct sock *sk)
{
if (sk->sk_state != TCP_CLOSE) {
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tp->af_specific == &ipv6_mapped) {
+ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
tcp_prot.hash(sk);
return;
}
local_bh_disable();
- __tcp_v6_hash(sk);
+ __inet6_hash(&tcp_hashinfo, sk);
local_bh_enable();
}
}
-/*
- * Open request hash tables.
- */
-
-static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
-{
- u32 a, b, c;
-
- a = raddr->s6_addr32[0];
- b = raddr->s6_addr32[1];
- c = raddr->s6_addr32[2];
-
- a += JHASH_GOLDEN_RATIO;
- b += JHASH_GOLDEN_RATIO;
- c += rnd;
- __jhash_mix(a, b, c);
-
- a += raddr->s6_addr32[3];
- b += (u32) rport;
- __jhash_mix(a, b, c);
-
- return c & (TCP_SYNQ_HSIZE - 1);
-}
-
-static struct request_sock *tcp_v6_search_req(const struct sock *sk,
- struct request_sock ***prevp,
- __u16 rport,
- struct in6_addr *raddr,
- struct in6_addr *laddr,
- int iif)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req, **prev;
-
- for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- const struct tcp6_request_sock *treq = tcp6_rsk(req);
-
- if (inet_rsk(req)->rmt_port == rport &&
- req->rsk_ops->family == AF_INET6 &&
- ipv6_addr_equal(&treq->rmt_addr, raddr) &&
- ipv6_addr_equal(&treq->loc_addr, laddr) &&
- (!treq->iif || treq->iif == iif)) {
- BUG_TRAP(req->sk == NULL);
- *prevp = prev;
- return req;
- }
- }
-
- return NULL;
-}
-
static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len,
struct in6_addr *saddr,
struct in6_addr *daddr,
@@ -308,195 +119,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
}
}
-static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
-{
- struct inet_sock *inet = inet_sk(sk);
- const struct ipv6_pinfo *np = inet6_sk(sk);
- const struct in6_addr *daddr = &np->rcv_saddr;
- const struct in6_addr *saddr = &np->daddr;
- const int dif = sk->sk_bound_dev_if;
- const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
- struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
- const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
-
- tw = inet_twsk(sk2);
-
- if(*((__u32 *)&(tw->tw_dport)) == ports &&
- sk2->sk_family == PF_INET6 &&
- ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
- sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tcptw->tw_ts_recent_stamp &&
- (!twp ||
- (sysctl_tcp_tw_reuse &&
- xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
- /* See comment in tcp_ipv4.c */
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (!tp->write_seq)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- sock_hold(sk2);
- goto unique;
- } else
- goto not_unique;
- }
- }
- tw = NULL;
-
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
- goto not_unique;
- }
-
-unique:
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sk->sk_hash = hash;
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
-
- if (twp) {
- *twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw) {
- /* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, &tcp_death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
- }
- return 0;
-
-not_unique:
- write_unlock(&head->lock);
- return -EADDRNOTAVAIL;
-}
-
-static inline u32 tcpv6_port_offset(const struct sock *sk)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const struct ipv6_pinfo *np = inet6_sk(sk);
-
- return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->dport);
-}
-
-static int tcp_v6_hash_connect(struct sock *sk)
-{
- unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int range = high - low;
- int i;
- int port;
- static u32 hint;
- u32 offset = hint + tcpv6_port_offset(sk);
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
- head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__tcp_v6_check_established(sk,
- port,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- }
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- hint += i;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __tcp_v6_hash(sk);
- }
- spin_unlock(&head->lock);
-
- if (tw) {
- inet_twsk_deschedule(tw, &tcp_death_row);
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
-
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __tcp_v6_hash(sk);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __tcp_v6_check_established(sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
-
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p = NULL, final;
@@ -571,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
*/
if (addr_type == IPV6_ADDR_MAPPED) {
- u32 exthdrlen = tp->ext_header_len;
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -583,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
- tp->af_specific = &ipv6_mapped;
+ icsk->icsk_af_ops = &ipv6_mapped;
sk->sk_backlog_rcv = tcp_v4_do_rcv;
err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
if (err) {
- tp->ext_header_len = exthdrlen;
- tp->af_specific = &ipv6_specific;
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &ipv6_specific;
sk->sk_backlog_rcv = tcp_v6_do_rcv;
goto failure;
} else {
@@ -643,16 +271,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sk->sk_route_caps = dst->dev->features &
~(NETIF_F_IP_CSUM | NETIF_F_TSO);
- tp->ext_header_len = 0;
+ icsk->icsk_ext_hdr_len = 0;
if (np->opt)
- tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
inet->dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
- err = tcp_v6_hash_connect(sk);
+ err = inet6_hash_connect(&tcp_death_row, sk);
if (err)
goto late_failure;
@@ -758,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
} else
dst_hold(dst);
- if (tp->pmtu_cookie > dst_mtu(dst)) {
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
tcp_sync_mss(sk, dst_mtu(dst));
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
@@ -775,8 +404,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
- &hdr->saddr, inet6_iif(skb));
+ req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
+ &hdr->saddr, inet6_iif(skb));
if (!req)
goto out;
@@ -822,7 +451,7 @@ out:
static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
struct dst_entry *dst)
{
- struct tcp6_request_sock *treq = tcp6_rsk(req);
+ struct inet6_request_sock *treq = inet6_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff * skb;
struct ipv6_txoptions *opt = NULL;
@@ -888,8 +517,8 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
- if (tcp6_rsk(req)->pktopts)
- kfree_skb(tcp6_rsk(req)->pktopts);
+ if (inet6_rsk(req)->pktopts)
+ kfree_skb(inet6_rsk(req)->pktopts);
}
static struct request_sock_ops tcp6_request_sock_ops = {
@@ -901,26 +530,15 @@ static struct request_sock_ops tcp6_request_sock_ops = {
.send_reset = tcp_v6_send_reset
};
-static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_skb_parm *opt = IP6CB(skb);
-
- if (np->rxopt.all) {
- if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
- ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) ||
- (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) ||
- ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
- return 1;
- }
- return 0;
-}
-
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+};
-static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
- struct sk_buff *skb)
+static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcphdr *th = skb->h.th;
if (skb->ip_summed == CHECKSUM_HW) {
th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0);
@@ -1091,8 +709,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
struct sock *nsk;
/* Find possible connection requests. */
- req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, inet6_iif(skb));
+ req = inet6_csk_search_req(sk, &prev, th->source,
+ &skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr, inet6_iif(skb));
if (req)
return tcp_check_req(sk, skb, req, prev);
@@ -1116,23 +735,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
return sk;
}
-static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
- inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
-}
-
-
/* FIXME: this is substantially similar to the ipv4 code.
* Can some kind of merge be done? -- erics
*/
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp6_request_sock *treq;
+ struct inet6_request_sock *treq;
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
@@ -1157,7 +765,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = reqsk_alloc(&tcp6_request_sock_ops);
+ req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
if (req == NULL)
goto drop;
@@ -1170,7 +778,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
- treq = tcp6_rsk(req);
+ treq = inet6_rsk(req);
ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
TCP_ECN_create_request(req, skb->h.th);
@@ -1196,8 +804,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (tcp_v6_send_synack(sk, req, NULL))
goto drop;
- tcp_v6_synq_add(sk, req);
-
+ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
return 0;
drop:
@@ -1212,7 +819,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
- struct tcp6_request_sock *treq = tcp6_rsk(req);
+ struct inet6_request_sock *treq = inet6_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct tcp6_sock *newtcp6sk;
struct inet_sock *newinet;
@@ -1247,7 +854,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
- newtp->af_specific = &ipv6_mapped;
+ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
@@ -1261,10 +868,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
*/
/* It is tricky place. Until this moment IPv4 tcp
- worked with IPv6 af_tcp.af_specific.
+ worked with IPv6 icsk.icsk_af_ops.
Sync it now.
*/
- tcp_sync_mss(newsk, newtp->pmtu_cookie);
+ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
return newsk;
}
@@ -1371,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
sock_kfree_s(sk, opt, opt->tot_len);
}
- newtp->ext_header_len = 0;
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newnp->opt)
- newtp->ext_header_len = newnp->opt->opt_nflen +
- newnp->opt->opt_flen;
+ inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+ newnp->opt->opt_flen);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
@@ -1382,7 +989,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
- __tcp_v6_hash(newsk);
+ __inet6_hash(&tcp_hashinfo, newsk);
inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1679,139 +1286,16 @@ do_time_wait:
goto discard_it;
}
-static int tcp_v6_rebuild_header(struct sock *sk)
-{
- int err;
- struct dst_entry *dst;
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
- sk->sk_route_caps = 0;
- return err;
- }
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
- return err;
- }
-
- ip6_dst_store(sk, dst, NULL);
- sk->sk_route_caps = dst->dev->features &
- ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
- }
-
- return 0;
-}
-
-static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
-{
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi fl;
- struct dst_entry *dst;
- struct in6_addr *final_p = NULL, final;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_sport = inet->sport;
- fl.fl_ip_dport = inet->dport;
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst == NULL) {
- int err = ip6_dst_lookup(sk, &dst, &fl);
-
- if (err) {
- sk->sk_err_soft = -err;
- return err;
- }
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_route_caps = 0;
- return err;
- }
-
- ip6_dst_store(sk, dst, NULL);
- sk->sk_route_caps = dst->dev->features &
- ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
- }
-
- skb->dst = dst_clone(dst);
-
- /* Restore final destination back after routing done */
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-
- return ip6_xmit(sk, skb, &fl, np->opt, 0);
-}
-
-static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
-
- sin6->sin6_family = AF_INET6;
- ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
- sin6->sin6_port = inet_sk(sk)->dport;
- /* We do not store received flowlabel for TCP */
- sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (sk->sk_bound_dev_if &&
- ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = sk->sk_bound_dev_if;
-}
-
static int tcp_v6_remember_stamp(struct sock *sk)
{
/* Alas, not yet... */
return 0;
}
-static struct tcp_func ipv6_specific = {
- .queue_xmit = tcp_v6_xmit,
+static struct inet_connection_sock_af_ops ipv6_specific = {
+ .queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
- .rebuild_header = tcp_v6_rebuild_header,
+ .rebuild_header = inet6_sk_rebuild_header,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.remember_stamp = tcp_v6_remember_stamp,
@@ -1819,7 +1303,7 @@ static struct tcp_func ipv6_specific = {
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = v6_addr2sockaddr,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6)
};
@@ -1827,7 +1311,7 @@ static struct tcp_func ipv6_specific = {
* TCP over IPv4 via INET6 API
*/
-static struct tcp_func ipv6_mapped = {
+static struct inet_connection_sock_af_ops ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -1838,7 +1322,7 @@ static struct tcp_func ipv6_mapped = {
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = v6_addr2sockaddr,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6)
};
@@ -1877,8 +1361,9 @@ static int tcp_v6_init_sock(struct sock *sk)
sk->sk_state = TCP_CLOSE;
- tp->af_specific = &ipv6_specific;
+ icsk->icsk_af_ops = &ipv6_specific;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ icsk->icsk_sync_mss = tcp_sync_mss;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
@@ -1900,14 +1385,13 @@ static int tcp_v6_destroy_sock(struct sock *sk)
static void get_openreq6(struct seq_file *seq,
struct sock *sk, struct request_sock *req, int i, int uid)
{
- struct in6_addr *dest, *src;
int ttd = req->expires - jiffies;
+ struct in6_addr *src = &inet6_rsk(req)->loc_addr;
+ struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
if (ttd < 0)
ttd = 0;
- src = &tcp6_rsk(req)->loc_addr;
- dest = &tcp6_rsk(req)->rmt_addr;
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -1988,14 +1472,14 @@ static void get_timewait6_sock(struct seq_file *seq,
{
struct in6_addr *dest, *src;
__u16 destp, srcp;
- struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+ struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
int ttd = tw->tw_ttd - jiffies;
if (ttd < 0)
ttd = 0;
- dest = &tcp6tw->tw_v6_daddr;
- src = &tcp6tw->tw_v6_rcv_saddr;
+ dest = &tw6->tw_v6_daddr;
+ src = &tw6->tw_v6_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
@@ -2093,7 +1577,7 @@ struct proto tcpv6_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
- .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
};
@@ -2110,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = {
.ops = &inet6_stream_ops,
.capability = -1,
.no_check = 0,
- .flags = INET_PROTOSW_PERMANENT,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
};
void __init tcpv6_init(void)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5cc8731eb55b..d8538dcea813 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -36,6 +36,7 @@
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
+#include <linux/skbuff.h>
#include <asm/uaccess.h>
#include <net/sock.h>
@@ -300,20 +301,7 @@ out:
return err;
csum_copy_err:
- /* Clear queue. */
- if (flags&MSG_PEEK) {
- int clear = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- clear = 1;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- if (clear)
- kfree_skb(skb);
- }
-
- skb_free_datagram(sk, skb);
+ skb_kill_datagram(sk, skb, flags);
if (flags & MSG_DONTWAIT) {
UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 34b3bb868409..0dc519b40404 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink;
static struct datalink_proto *p8023_datalink;
static struct datalink_proto *pSNAP_datalink;
-static struct proto_ops ipx_dgram_ops;
+static const struct proto_ops ipx_dgram_ops;
LIST_HEAD(ipx_interfaces);
DEFINE_SPINLOCK(ipx_interfaces_lock);
@@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
rc = -EINVAL;
break;
default:
- rc = dev_ioctl(cmd, argp);
+ rc = -ENOIOCTLCMD;
break;
}
@@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
.family = PF_IPX,
.owner = THIS_MODULE,
.release = ipx_release,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 6f92f9c62990..fbfa96754417 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -62,12 +62,12 @@
static int irda_create(struct socket *sock, int protocol);
-static struct proto_ops irda_stream_ops;
-static struct proto_ops irda_seqpacket_ops;
-static struct proto_ops irda_dgram_ops;
+static const struct proto_ops irda_stream_ops;
+static const struct proto_ops irda_seqpacket_ops;
+static const struct proto_ops irda_dgram_ops;
#ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops irda_ultra_ops;
+static const struct proto_ops irda_ultra_ops;
#define ULTRA_MAX_DATA 382
#endif /* CONFIG_IRDA_ULTRA */
@@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
/*
* POSIX 1003.1g mandates this order.
*/
- if (sk->sk_err)
- ret = sock_error(sk);
+ ret = sock_error(sk);
+ if (ret)
+ break;
else if (sk->sk_shutdown & RCV_SHUTDOWN)
;
else if (noblock)
@@ -1821,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return -EINVAL;
default:
IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__);
- return dev_ioctl(cmd, (void __user *) arg);
+ return -ENOIOCTLCMD;
}
/*NOTREACHED*/
@@ -2463,7 +2464,7 @@ static struct net_proto_family irda_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
@@ -2484,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
@@ -2505,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
@@ -2527,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
};
#ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 39031684b65c..52efd04cbedb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void)
}
-static struct proto_ops pfkey_ops;
+static const struct proto_ops pfkey_ops;
static void pfkey_insert(struct sock *sk)
{
@@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = {
[SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
[SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
[SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address),
+ [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx),
};
/* Verify sadb_address_{len,prefixlen} against sa_family. */
@@ -383,6 +384,55 @@ static int verify_address_len(void *p)
return 0;
}
+static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
+{
+ int len = 0;
+
+ len += sizeof(struct sadb_x_sec_ctx);
+ len += sec_ctx->sadb_x_ctx_len;
+ len += sizeof(uint64_t) - 1;
+ len /= sizeof(uint64_t);
+
+ return len;
+}
+
+static inline int verify_sec_ctx_len(void *p)
+{
+ struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
+ int len;
+
+ if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
+ return -EINVAL;
+
+ len = pfkey_sec_ctx_len(sec_ctx);
+
+ if (sec_ctx->sadb_x_sec_len != len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
+{
+ struct xfrm_user_sec_ctx *uctx = NULL;
+ int ctx_size = sec_ctx->sadb_x_ctx_len;
+
+ uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL);
+
+ if (!uctx)
+ return NULL;
+
+ uctx->len = pfkey_sec_ctx_len(sec_ctx);
+ uctx->exttype = sec_ctx->sadb_x_sec_exttype;
+ uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
+ uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
+ uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
+ memcpy(uctx + 1, sec_ctx + 1,
+ uctx->ctx_len);
+
+ return uctx;
+}
+
static int present_and_same_family(struct sadb_address *src,
struct sadb_address *dst)
{
@@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
if (verify_address_len(p))
return -EINVAL;
}
+ if (ext_type == SADB_X_EXT_SEC_CTX) {
+ if (verify_sec_ctx_len(p))
+ return -EINVAL;
+ }
ext_hdrs[ext_type-1] = p;
}
p += ext_len;
@@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
struct sadb_key *key;
struct sadb_x_sa2 *sa2;
struct sockaddr_in *sin;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
+ int ctx_size = 0;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct sockaddr_in6 *sin6;
#endif
@@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
sizeof(struct sadb_address)*2 +
sockaddr_size*2 +
sizeof(struct sadb_x_sa2);
+
+ if ((xfrm_ctx = x->security)) {
+ ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+ size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+ }
+
/* identity & sensitivity */
if ((x->props.family == AF_INET &&
@@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
n_port->sadb_x_nat_t_port_reserved = 0;
}
+ /* security context */
+ if (xfrm_ctx) {
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+ sizeof(struct sadb_x_sec_ctx) + ctx_size);
+ sec_ctx->sadb_x_sec_len =
+ (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
return skb;
}
@@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
struct sadb_lifetime *lifetime;
struct sadb_sa *sa;
struct sadb_key *key;
+ struct sadb_x_sec_ctx *sec_ctx;
uint16_t proto;
int err;
@@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
}
+
+ sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+ if (!uctx)
+ goto out;
+
+ err = security_xfrm_state_alloc(x, uctx);
+ kfree(uctx);
+
+ if (err)
+ goto out;
+ }
+
key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
if (sa->sadb_sa_auth) {
int keysize = 0;
@@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
return 0;
}
+static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
+{
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+
+ if (xfrm_ctx) {
+ int len = sizeof(struct sadb_x_sec_ctx);
+ len += xfrm_ctx->ctx_len;
+ return PFKEY_ALIGN8(len);
+ }
+ return 0;
+}
+
static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
{
int sockaddr_size = pfkey_sockaddr_size(xp->family);
@@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
(sockaddr_size * 2) +
sizeof(struct sadb_x_policy) +
(xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
- (socklen * 2)));
+ (socklen * 2))) +
+ pfkey_xfrm_policy2sec_ctx_size(xp);
}
static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
@@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
struct sadb_lifetime *lifetime;
struct sadb_x_policy *pol;
struct sockaddr_in *sin;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct sockaddr_in6 *sin6;
#endif
@@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
}
}
}
+
+ /* security context */
+ if ((xfrm_ctx = xp->security)) {
+ int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
+
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
+ sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
hdr->sadb_msg_len = size / sizeof(uint64_t);
hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
}
@@ -1976,12 +2099,13 @@ out:
static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
- int err;
+ int err = 0;
struct sadb_lifetime *lifetime;
struct sadb_address *sa;
struct sadb_x_policy *pol;
struct xfrm_policy *xp;
struct km_event c;
+ struct sadb_x_sec_ctx *sec_ctx;
if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
if (xp->selector.dport)
xp->selector.dport_mask = ~0;
+ sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+ if (!uctx) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ err = security_xfrm_policy_alloc(xp, uctx);
+ kfree(uctx);
+
+ if (err)
+ goto out;
+ }
+
xp->lft.soft_byte_limit = XFRM_INF;
xp->lft.hard_byte_limit = XFRM_INF;
xp->lft.soft_packet_limit = XFRM_INF;
@@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
hdr->sadb_msg_type != SADB_X_SPDUPDATE);
- if (err) {
- kfree(xp);
- return err;
- }
+
+ if (err)
+ goto out;
if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
c.event = XFRM_MSG_UPDPOLICY;
@@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
return 0;
out:
+ security_xfrm_policy_free(xp);
kfree(xp);
return err;
}
@@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
int err;
struct sadb_address *sa;
struct sadb_x_policy *pol;
- struct xfrm_policy *xp;
+ struct xfrm_policy *xp, tmp;
struct xfrm_selector sel;
struct km_event c;
+ struct sadb_x_sec_ctx *sec_ctx;
if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
if (sel.dport)
sel.dport_mask = ~0;
- xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1);
+ sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ memset(&tmp, 0, sizeof(struct xfrm_policy));
+
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+ if (!uctx)
+ return -ENOMEM;
+
+ err = security_xfrm_policy_alloc(&tmp, uctx);
+ kfree(uctx);
+
+ if (err)
+ return err;
+ }
+
+ xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
+ security_xfrm_policy_free(&tmp);
if (xp == NULL)
return -ENOENT;
@@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
{
struct xfrm_policy *xp;
struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
+ struct sadb_x_sec_ctx *sec_ctx;
switch (family) {
case AF_INET:
@@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
(*dir = parse_ipsecrequests(xp, pol)) < 0)
goto out;
+ /* security context too */
+ if (len >= (pol->sadb_x_policy_len*8 +
+ sizeof(struct sadb_x_sec_ctx))) {
+ char *p = (char *)pol;
+ struct xfrm_user_sec_ctx *uctx;
+
+ p += pol->sadb_x_policy_len*8;
+ sec_ctx = (struct sadb_x_sec_ctx *)p;
+ if (len < pol->sadb_x_policy_len*8 +
+ sec_ctx->sadb_x_sec_len)
+ goto out;
+ if ((*dir = verify_sec_ctx_len(p)))
+ goto out;
+ uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+ *dir = security_xfrm_policy_alloc(xp, uctx);
+ kfree(uctx);
+
+ if (*dir)
+ goto out;
+ }
+
*dir = pol->sadb_x_policy_dir-1;
return xp;
out:
+ security_xfrm_policy_free(xp);
kfree(xp);
return NULL;
}
@@ -2946,7 +3127,7 @@ out:
return err;
}
-static struct proto_ops pfkey_ops = {
+static const struct proto_ops pfkey_ops = {
.family = PF_KEY,
.owner = THIS_MODULE,
/* Operations that make no sense on pfkey sockets. */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c3f0b0783453..8171c53bc0ed 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -36,7 +36,7 @@
static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
static u16 llc_ui_sap_link_no_max[256];
static struct sockaddr_llc llc_ui_addrnull;
-static struct proto_ops llc_ui_ops;
+static const struct proto_ops llc_ui_ops;
static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
@@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo)
/*
* POSIX 1003.1g mandates this order.
*/
- if (sk->sk_err) {
- rc = sock_error(sk);
+ rc = sock_error(sk);
+ if (rc)
break;
- }
rc = 0;
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
@@ -960,7 +959,7 @@ out:
static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
- return dev_ioctl(cmd, (void __user *)arg);
+ return -ENOIOCTLCMD;
}
/**
@@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops llc_ui_ops = {
+static const struct proto_ops llc_ui_ops = {
.family = PF_LLC,
.owner = THIS_MODULE,
.release = llc_ui_release,
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cba63729313d..e10512e229b6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid)
goto out_unlock;
INIT_HLIST_NODE(&inst->hlist);
- inst->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&inst->lock);
/* needs to be two, since we _put() after creation */
atomic_set(&inst->use, 2);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f28460b61e47..55afdda3d940 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid)
atomic_set(&inst->id_sequence, 0);
/* needs to be two, since we _put() after creation */
atomic_set(&inst->use, 2);
- inst->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&inst->lock);
INIT_LIST_HEAD(&inst->queue_list);
if (!try_module_get(THIS_MODULE))
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 96020d7087e8..7849cac14d3a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
return 0;
}
-static struct proto_ops netlink_ops;
+static const struct proto_ops netlink_ops;
static int netlink_insert(struct sock *sk, u32 pid)
{
@@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb)
return notifier_chain_unregister(&netlink_chain, nb);
}
-static struct proto_ops netlink_ops = {
+static const struct proto_ops netlink_ops = {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 287cfcc56951..3b1378498d50 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -441,7 +441,7 @@ errout:
}
static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
- int seq, int cmd)
+ int seq, u8 cmd)
{
struct sk_buff *skb;
int err;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index e5d82d711cae..63b0e4afeb33 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -63,7 +63,7 @@ static unsigned short circuit = 0x101;
static HLIST_HEAD(nr_list);
static DEFINE_SPINLOCK(nr_list_lock);
-static struct proto_ops nr_proto_ops;
+static const struct proto_ops nr_proto_ops;
/*
* Socket removal during an interrupt is now safe.
@@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
int ret;
- lock_sock(sk);
switch (cmd) {
case TIOCOUTQ: {
long amount;
+
+ lock_sock(sk);
amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
if (amount < 0)
amount = 0;
@@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case TIOCINQ: {
struct sk_buff *skb;
long amount = 0L;
+
+ lock_sock(sk);
/* These two are safe on a single CPU system as only user tasks fiddle here */
if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
amount = skb->len;
@@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
case SIOCGSTAMP:
+ lock_sock(sk);
ret = sock_get_timestamp(sk, argp);
release_sock(sk);
return ret;
@@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCSIFNETMASK:
case SIOCGIFMETRIC:
case SIOCSIFMETRIC:
- release_sock(sk);
return -EINVAL;
case SIOCADDRT:
case SIOCDELRT:
case SIOCNRDECOBS:
- release_sock(sk);
if (!capable(CAP_NET_ADMIN)) return -EPERM;
return nr_rt_ioctl(cmd, argp);
default:
- release_sock(sk);
- return dev_ioctl(cmd, argp);
+ return -ENOIOCTLCMD;
}
- release_sock(sk);
return 0;
}
@@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops nr_proto_ops = {
+static const struct proto_ops nr_proto_ops = {
.family = PF_NETROM,
.owner = THIS_MODULE,
.release = nr_release,
diff --git a/net/nonet.c b/net/nonet.c
index e5241dceaa57..1230f0ae832e 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -14,11 +14,6 @@
#include <linux/init.h>
#include <linux/kernel.h>
-void __init sock_init(void)
-{
- printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n");
-}
-
static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
{
return -ENXIO;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3e2462760413..f69e5ed9bd06 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk)
}
-static struct proto_ops packet_ops;
+static const struct proto_ops packet_ops;
#ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt;
+static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
@@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
#endif
default:
- return dev_ioctl(cmd, (void __user *)arg);
+ return -ENOIOCTLCMD;
}
return 0;
}
@@ -1784,7 +1784,7 @@ out:
#ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt = {
+static const struct proto_ops packet_ops_spkt = {
.family = PF_PACKET,
.owner = THIS_MODULE,
.release = packet_release,
@@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = {
};
#endif
-static struct proto_ops packet_ops = {
+static const struct proto_ops packet_ops = {
.family = PF_PACKET,
.owner = THIS_MODULE,
.release = packet_release,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 829fdbc4400b..63090be2315a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return 0;
default:
- return dev_ioctl(cmd, argp);
+ return -ENOIOCTLCMD;
}
return 0;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82fb07aa06a5..ba5283204837 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,7 +25,7 @@
#include <net/pkt_sched.h>
-#define VERSION "1.1"
+#define VERSION "1.2"
/* Network Emulation Queuing algorithm.
====================================
@@ -65,11 +65,12 @@ struct netem_sched_data {
u32 jitter;
u32 duplicate;
u32 reorder;
+ u32 corrupt;
struct crndstate {
unsigned long last;
unsigned long rho;
- } delay_cor, loss_cor, dup_cor, reorder_cor;
+ } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
struct disttable {
u32 size;
@@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->duplicate = dupsave;
}
+ /*
+ * Randomized packet corruption.
+ * Make copy if needed since we are modifying
+ * If packet is going to be hardware checksummed, then
+ * do it now in software before we mangle it.
+ */
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (!(skb = skb_unshare(skb, GFP_ATOMIC))
+ || (skb->ip_summed == CHECKSUM_HW
+ && skb_checksum_help(skb, 0))) {
+ sch->qstats.drops++;
+ return NET_XMIT_DROP;
+ }
+
+ skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+ }
+
if (q->gap == 0 /* not doing reordering */
|| q->counter < q->gap /* inside last reordering gap */
|| q->reorder < get_crandom(&q->reorder_cor)) {
@@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
return 0;
}
+static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ const struct tc_netem_corrupt *r = RTA_DATA(attr);
+
+ if (RTA_PAYLOAD(attr) != sizeof(*r))
+ return -EINVAL;
+
+ q->corrupt = r->probability;
+ init_crandom(&q->corrupt_cor, r->correlation);
+ return 0;
+}
+
+/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
if (ret)
return ret;
}
+
if (tb[TCA_NETEM_REORDER-1]) {
ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
if (ret)
return ret;
}
- }
+ if (tb[TCA_NETEM_CORRUPT-1]) {
+ ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
@@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
+ struct tc_netem_corrupt corrupt;
qopt.latency = q->latency;
qopt.jitter = q->jitter;
@@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
reorder.correlation = q->reorder_cor.rho;
RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+ corrupt.probability = q->corrupt;
+ corrupt.correlation = q->corrupt_cor.rho;
+ RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+
rta->rta_len = skb->tail - b;
return skb->len;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6cf0342706b5..c4a2a8c4c339 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -22,6 +22,7 @@
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
+#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index dec68a604773..9d05e13e92f6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000;
asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000)
* 1000;
- asoc->pmtu = 0;
asoc->frag_point = 0;
/* Set the association max_retrans and RTO values from the
@@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
asoc->overall_error_count = 0;
+ /* Initialize the association's heartbeat interval based on the
+ * sock configured value.
+ */
+ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
+
+ /* Initialize path max retrans value. */
+ asoc->pathmaxrxt = sp->pathmaxrxt;
+
+ /* Initialize default path MTU. */
+ asoc->pathmtu = sp->pathmtu;
+
+ /* Set association default SACK delay */
+ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
+
+ /* Set the association default flags controlling
+ * Heartbeat, SACK delay, and Path MTU Discovery.
+ */
+ asoc->param_flags = sp->param_flags;
+
/* Initialize the maximum mumber of new data packets that can be sent
* in a burst.
*/
@@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
= 5 * asoc->rto_max;
asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
- asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
- SCTP_DEFAULT_TIMEOUT_SACK;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
sp->autoclose * HZ;
@@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
sctp_transport_set_owner(peer, asoc);
+ /* Initialize the peer's heartbeat interval based on the
+ * association configured value.
+ */
+ peer->hbinterval = asoc->hbinterval;
+
+ /* Set the path max_retrans. */
+ peer->pathmaxrxt = asoc->pathmaxrxt;
+
+ /* Initialize the peer's SACK delay timeout based on the
+ * association configured value.
+ */
+ peer->sackdelay = asoc->sackdelay;
+
+ /* Enable/disable heartbeat, SACK delay, and path MTU discovery
+ * based on association setting.
+ */
+ peer->param_flags = asoc->param_flags;
+
/* Initialize the pmtu of the transport. */
- sctp_transport_pmtu(peer);
+ if (peer->param_flags & SPP_PMTUD_ENABLE)
+ sctp_transport_pmtu(peer);
+ else if (asoc->pathmtu)
+ peer->pathmtu = asoc->pathmtu;
+ else
+ peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
/* If this is the first transport addr on this association,
* initialize the association PMTU to the peer's PMTU.
* If not and the current association PMTU is higher than the new
* peer's PMTU, reset the association PMTU to the new peer's PMTU.
*/
- if (asoc->pmtu)
- asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu);
+ if (asoc->pathmtu)
+ asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
else
- asoc->pmtu = peer->pmtu;
+ asoc->pathmtu = peer->pathmtu;
SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to "
- "%d\n", asoc, asoc->pmtu);
+ "%d\n", asoc, asoc->pathmtu);
- asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
+ asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
/* The asoc->peer.port might not be meaningful yet, but
* initialize the packet structure anyway.
@@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
* (for example, implementations MAY use the size of the
* receiver advertised window).
*/
- peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380));
+ peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
/* At this point, we may not have the receiver's advertised window,
* so initialize ssthresh to the default value and it will be set
@@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
peer->partial_bytes_acked = 0;
peer->flight_size = 0;
- /* By default, enable heartbeat for peer address. */
- peer->hb_allowed = 1;
-
- /* Initialize the peer's heartbeat interval based on the
- * sock configured value.
- */
- peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval);
-
- /* Set the path max_retrans. */
- peer->max_retrans = sp->paddrparam.spp_pathmaxrxt;
-
/* Set the transport's RTO.initial value */
peer->rto = asoc->rto_initial;
@@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
/* Get the lowest pmtu of all the transports. */
list_for_each(pos, &asoc->peer.transport_addr_list) {
t = list_entry(pos, struct sctp_transport, transports);
- if (!pmtu || (t->pmtu < pmtu))
- pmtu = t->pmtu;
+ if (!pmtu || (t->pathmtu < pmtu))
+ pmtu = t->pathmtu;
}
if (pmtu) {
struct sctp_sock *sp = sctp_sk(asoc->base.sk);
- asoc->pmtu = pmtu;
+ asoc->pathmtu = pmtu;
asoc->frag_point = sctp_frag_point(sp, pmtu);
}
SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n",
- __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point);
+ __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point);
}
/* Should we send a SACK to update our peer? */
@@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc)
case SCTP_STATE_SHUTDOWN_SENT:
if ((asoc->rwnd > asoc->a_rwnd) &&
((asoc->rwnd - asoc->a_rwnd) >=
- min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu)))
+ min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu)))
return 1;
break;
default:
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b24ff2c1aef5..238f1bffa684 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
struct sctp_transport *t, __u32 pmtu)
{
- if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
- printk(KERN_WARNING "%s: Reported pmtu %d too low, "
- "using default minimum of %d\n", __FUNCTION__, pmtu,
- SCTP_DEFAULT_MINSEGMENT);
- pmtu = SCTP_DEFAULT_MINSEGMENT;
- }
+ if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu))
+ return;
- if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) {
- t->pmtu = pmtu;
+ if (t->param_flags & SPP_PMTUD_ENABLE) {
+ if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
+ printk(KERN_WARNING "%s: Reported pmtu %d too low, "
+ "using default minimum of %d\n",
+ __FUNCTION__, pmtu,
+ SCTP_DEFAULT_MINSEGMENT);
+ /* Use default minimum segment size and disable
+ * pmtu discovery on this transport.
+ */
+ t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
+ t->param_flags = (t->param_flags & ~SPP_HB) |
+ SPP_PMTUD_DISABLE;
+ } else {
+ t->pathmtu = pmtu;
+ }
+
+ /* Update association pmtu. */
sctp_assoc_sync_pmtu(asoc);
- sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
}
+
+ /* Retransmit with the new pmtu setting.
+ * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
+ * Needed will never be sent, but if a message was sent before
+ * PMTU discovery was disabled that was larger than the PMTU, it
+ * would not be fragmented, so it must be re-transmitted fragmented.
+ */
+ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
}
/*
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b8fb5f..15c05165c905 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
return 2;
}
-static struct proto_ops inet6_seqpacket_ops = {
+static const struct proto_ops inet6_seqpacket_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 931371633464..a40991ef72c9 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
goto finish;
pmtu = ((packet->transport->asoc) ?
- (packet->transport->asoc->pmtu) :
- (packet->transport->pmtu));
+ (packet->transport->asoc->pathmtu) :
+ (packet->transport->pathmtu));
too_big = (psize + chunk_len > pmtu);
@@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
if (!dst || (dst->obsolete > 1)) {
dst_release(dst);
sctp_transport_route(tp, NULL, sctp_sk(sk));
- sctp_assoc_sync_pmtu(asoc);
+ if (asoc->param_flags & SPP_PMTUD_ENABLE) {
+ sctp_assoc_sync_pmtu(asoc);
+ }
}
nskb->dst = dst_clone(tp->dst);
@@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet)
SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
nskb->len);
- (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
+ if (tp->param_flags & SPP_PMTUD_ENABLE)
+ (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
+ else
+ (*tp->af_specific->sctp_xmit)(nskb, tp, 1);
out:
packet->size = packet->overhead;
@@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
* if ((flightsize + Max.Burst * MTU) < cwnd)
* cwnd = flightsize + Max.Burst * MTU
*/
- max_burst_bytes = asoc->max_burst * asoc->pmtu;
+ max_burst_bytes = asoc->max_burst * asoc->pathmtu;
if ((transport->flight_size + max_burst_bytes) < transport->cwnd) {
transport->cwnd = transport->flight_size + max_burst_bytes;
SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: "
@@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
* data will fit or delay in hopes of bundling a full
* sized packet.
*/
- if (len < asoc->pmtu - packet->overhead) {
+ if (len < asoc->pathmtu - packet->overhead) {
retval = SCTP_XMIT_NAGLE_DELAY;
goto finish;
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f775d78aa59d..de693b43c8ea 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -54,6 +54,7 @@
#include <net/protocol.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/route.h>
#include <net/sctp/sctp.h>
#include <net/addrconf.h>
#include <net/inet_common.h>
@@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = {
};
/* Socket operations. */
-static struct proto_ops inet_seqpacket_ops = {
+static const struct proto_ops inet_seqpacket_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release, /* Needs to be wrapped... */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 823947170a33..2d7d8a5db2ac 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
{
__u32 ctsn, max_tsn_seen;
struct sctp_chunk *sack;
+ struct sctp_transport *trans = asoc->peer.last_data_from;
int error = 0;
- if (force)
+ if (force ||
+ (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
+ (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
asoc->peer.sack_needed = 1;
ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
if (!asoc->peer.sack_needed) {
/* We will need a SACK for the next packet. */
asoc->peer.sack_needed = 1;
- goto out;
+
+ /* Set the SACK delay timeout based on the
+ * SACK delay for the last transport
+ * data was received from, or the default
+ * for the association.
+ */
+ if (trans)
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ trans->sackdelay;
+ else
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ asoc->sackdelay;
+
+ /* Restart the SACK timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
} else {
if (asoc->a_rwnd > asoc->rwnd)
asoc->a_rwnd = asoc->rwnd;
@@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
}
-out:
+
return error;
nomem:
error = -ENOMEM;
@@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
asoc->overall_error_count++;
if (transport->state != SCTP_INACTIVE &&
- (transport->error_count++ >= transport->max_retrans)) {
+ (transport->error_count++ >= transport->pathmaxrxt)) {
SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
" transport IP: port:%d failed.\n",
asoc,
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 475bfb4972d9..557a7d90b92a 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
* HEARTBEAT is sent (see Section 8.3).
*/
- if (transport->hb_allowed) {
+ if (transport->param_flags & SPP_HB_ENABLE) {
if (SCTP_DISPOSITION_NOMEM ==
sctp_sf_heartbeat(ep, asoc, type, arg,
commands))
@@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
return SCTP_DISPOSITION_DISCARD;
}
- max_interval = link->hb_interval + link->rto;
+ max_interval = link->hbinterval + link->rto;
/* Check if the timestamp looks valid. */
if (time_after(hbinfo->sent_at, jiffies) ||
@@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
* document allow. However, an SCTP transmitter MUST NOT be
* more aggressive than the following algorithms allow.
*/
- if (chunk->end_of_packet) {
+ if (chunk->end_of_packet)
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
- /* Start the SACK timer. */
- sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
- SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
- }
-
return SCTP_DISPOSITION_CONSUME;
discard_force:
@@ -2721,13 +2716,9 @@ discard_force:
return SCTP_DISPOSITION_DISCARD;
discard_noforce:
- if (chunk->end_of_packet) {
+ if (chunk->end_of_packet)
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
- /* Start the SACK timer. */
- sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
- SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
- }
return SCTP_DISPOSITION_DISCARD;
consume:
return SCTP_DISPOSITION_CONSUME;
@@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep,
* send another.
*/
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
- /* Start the SACK timer. */
- sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
- SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
return SCTP_DISPOSITION_CONSUME;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9df888e932c5..fc04d185fa33 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1941,107 +1941,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* address's parameters:
*
* struct sctp_paddrparams {
- * sctp_assoc_t spp_assoc_id;
- * struct sockaddr_storage spp_address;
- * uint32_t spp_hbinterval;
- * uint16_t spp_pathmaxrxt;
- * };
- *
- * spp_assoc_id - (UDP style socket) This is filled in the application,
- * and identifies the association for this query.
+ * sctp_assoc_t spp_assoc_id;
+ * struct sockaddr_storage spp_address;
+ * uint32_t spp_hbinterval;
+ * uint16_t spp_pathmaxrxt;
+ * uint32_t spp_pathmtu;
+ * uint32_t spp_sackdelay;
+ * uint32_t spp_flags;
+ * };
+ *
+ * spp_assoc_id - (one-to-many style socket) This is filled in the
+ * application, and identifies the association for
+ * this query.
* spp_address - This specifies which address is of interest.
* spp_hbinterval - This contains the value of the heartbeat interval,
- * in milliseconds. A value of 0, when modifying the
- * parameter, specifies that the heartbeat on this
- * address should be disabled. A value of UINT32_MAX
- * (4294967295), when modifying the parameter,
- * specifies that a heartbeat should be sent
- * immediately to the peer address, and the current
- * interval should remain unchanged.
+ * in milliseconds. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
* spp_pathmaxrxt - This contains the maximum number of
* retransmissions before this address shall be
- * considered unreachable.
+ * considered unreachable. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmtu - When Path MTU discovery is disabled the value
+ * specified here will be the "fixed" path mtu.
+ * Note that if the spp_address field is empty
+ * then all associations on this address will
+ * have this fixed path mtu set upon them.
+ *
+ * spp_sackdelay - When delayed sack is enabled, this value specifies
+ * the number of milliseconds that sacks will be delayed
+ * for. This value will apply to all addresses of an
+ * association if the spp_address field is empty. Note
+ * also, that if delayed sack is enabled and this
+ * value is set to 0, no change is made to the last
+ * recorded delayed sack timer value.
+ *
+ * spp_flags - These flags are used to control various features
+ * on an association. The flag field may contain
+ * zero or more of the following options.
+ *
+ * SPP_HB_ENABLE - Enable heartbeats on the
+ * specified address. Note that if the address
+ * field is empty all addresses for the association
+ * have heartbeats enabled upon them.
+ *
+ * SPP_HB_DISABLE - Disable heartbeats on the
+ * speicifed address. Note that if the address
+ * field is empty all addresses for the association
+ * will have their heartbeats disabled. Note also
+ * that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ * mutually exclusive, only one of these two should
+ * be specified. Enabling both fields will have
+ * undetermined results.
+ *
+ * SPP_HB_DEMAND - Request a user initiated heartbeat
+ * to be made immediately.
+ *
+ * SPP_PMTUD_ENABLE - This field will enable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected.
+ *
+ * SPP_PMTUD_DISABLE - This field will disable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected. Not also that
+ * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ * exclusive. Enabling both will have undetermined
+ * results.
+ *
+ * SPP_SACKDELAY_ENABLE - Setting this flag turns
+ * on delayed sack. The time specified in spp_sackdelay
+ * is used to specify the sack delay for this address. Note
+ * that if spp_address is empty then all addresses will
+ * enable delayed sack and take on the sack delay
+ * value specified in spp_sackdelay.
+ * SPP_SACKDELAY_DISABLE - Setting this flag turns
+ * off delayed sack. If the spp_address field is blank then
+ * delayed sack is disabled for the entire association. Note
+ * also that this field is mutually exclusive to
+ * SPP_SACKDELAY_ENABLE, setting both will have undefined
+ * results.
*/
+int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
+ struct sctp_transport *trans,
+ struct sctp_association *asoc,
+ struct sctp_sock *sp,
+ int hb_change,
+ int pmtud_change,
+ int sackdelay_change)
+{
+ int error;
+
+ if (params->spp_flags & SPP_HB_DEMAND && trans) {
+ error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
+ if (error)
+ return error;
+ }
+
+ if (params->spp_hbinterval) {
+ if (trans) {
+ trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
+ } else if (asoc) {
+ asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
+ } else {
+ sp->hbinterval = params->spp_hbinterval;
+ }
+ }
+
+ if (hb_change) {
+ if (trans) {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_HB) | hb_change;
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_HB) | hb_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_HB) | hb_change;
+ }
+ }
+
+ if (params->spp_pathmtu) {
+ if (trans) {
+ trans->pathmtu = params->spp_pathmtu;
+ sctp_assoc_sync_pmtu(asoc);
+ } else if (asoc) {
+ asoc->pathmtu = params->spp_pathmtu;
+ sctp_frag_point(sp, params->spp_pathmtu);
+ } else {
+ sp->pathmtu = params->spp_pathmtu;
+ }
+ }
+
+ if (pmtud_change) {
+ if (trans) {
+ int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
+ (params->spp_flags & SPP_PMTUD_ENABLE);
+ trans->param_flags =
+ (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
+ if (update) {
+ sctp_transport_pmtu(trans);
+ sctp_assoc_sync_pmtu(asoc);
+ }
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_PMTUD) | pmtud_change;
+ }
+ }
+
+ if (params->spp_sackdelay) {
+ if (trans) {
+ trans->sackdelay =
+ msecs_to_jiffies(params->spp_sackdelay);
+ } else if (asoc) {
+ asoc->sackdelay =
+ msecs_to_jiffies(params->spp_sackdelay);
+ } else {
+ sp->sackdelay = params->spp_sackdelay;
+ }
+ }
+
+ if (sackdelay_change) {
+ if (trans) {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ }
+ }
+
+ if (params->spp_pathmaxrxt) {
+ if (trans) {
+ trans->pathmaxrxt = params->spp_pathmaxrxt;
+ } else if (asoc) {
+ asoc->pathmaxrxt = params->spp_pathmaxrxt;
+ } else {
+ sp->pathmaxrxt = params->spp_pathmaxrxt;
+ }
+ }
+
+ return 0;
+}
+
static int sctp_setsockopt_peer_addr_params(struct sock *sk,
char __user *optval, int optlen)
{
- struct sctp_paddrparams params;
- struct sctp_transport *trans;
+ struct sctp_paddrparams params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
int error;
+ int hb_change, pmtud_change, sackdelay_change;
if (optlen != sizeof(struct sctp_paddrparams))
- return -EINVAL;
+ return - EINVAL;
+
if (copy_from_user(&params, optval, optlen))
return -EFAULT;
- /*
- * API 7. Socket Options (setting the default value for the endpoint)
- * All options that support specific settings on an association by
- * filling in either an association id variable or a sockaddr_storage
- * SHOULD also support setting of the same value for the entire endpoint
- * (i.e. future associations). To accomplish this the following logic is
- * used when setting one of these options:
-
- * c) If neither the sockaddr_storage or association identification is
- * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and
- * the association identification is 0, the settings are a default
- * and to be applied to the endpoint (all future associations).
- */
+ /* Validate flags and value parameters. */
+ hb_change = params.spp_flags & SPP_HB;
+ pmtud_change = params.spp_flags & SPP_PMTUD;
+ sackdelay_change = params.spp_flags & SPP_SACKDELAY;
+
+ if (hb_change == SPP_HB ||
+ pmtud_change == SPP_PMTUD ||
+ sackdelay_change == SPP_SACKDELAY ||
+ params.spp_sackdelay > 500 ||
+ (params.spp_pathmtu
+ && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT))
+ return -EINVAL;
- /* update default value for endpoint (all future associations) */
- if (!params.spp_assoc_id &&
- sctp_is_any(( union sctp_addr *)&params.spp_address)) {
- /* Manual heartbeat on an endpoint is invalid. */
- if (0xffffffff == params.spp_hbinterval)
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
+ */
+ if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+ trans = sctp_addr_id2transport(sk, &params.spp_address,
+ params.spp_assoc_id);
+ if (!trans)
return -EINVAL;
- else if (params.spp_hbinterval)
- sctp_sk(sk)->paddrparam.spp_hbinterval =
- params.spp_hbinterval;
- if (params.spp_pathmaxrxt)
- sctp_sk(sk)->paddrparam.spp_pathmaxrxt =
- params.spp_pathmaxrxt;
- return 0;
}
- trans = sctp_addr_id2transport(sk, &params.spp_address,
- params.spp_assoc_id);
- if (!trans)
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+ if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
return -EINVAL;
- /* Applications can enable or disable heartbeats for any peer address
- * of an association, modify an address's heartbeat interval, force a
- * heartbeat to be sent immediately, and adjust the address's maximum
- * number of retransmissions sent before an address is considered
- * unreachable.
- *
- * The value of the heartbeat interval, in milliseconds. A value of
- * UINT32_MAX (4294967295), when modifying the parameter, specifies
- * that a heartbeat should be sent immediately to the peer address,
- * and the current interval should remain unchanged.
+ /* Heartbeat demand can only be sent on a transport or
+ * association, but not a socket.
*/
- if (0xffffffff == params.spp_hbinterval) {
- error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
- if (error)
- return error;
- } else {
- /* The value of the heartbeat interval, in milliseconds. A value of 0,
- * when modifying the parameter, specifies that the heartbeat on this
- * address should be disabled.
+ if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc)
+ return -EINVAL;
+
+ /* Process parameters. */
+ error = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+ hb_change, pmtud_change,
+ sackdelay_change);
+
+ if (error)
+ return error;
+
+ /* If changes are for association, also apply parameters to each
+ * transport.
*/
- if (params.spp_hbinterval) {
- trans->hb_allowed = 1;
- trans->hb_interval =
- msecs_to_jiffies(params.spp_hbinterval);
- } else
- trans->hb_allowed = 0;
+ if (!trans && asoc) {
+ struct list_head *pos;
+
+ list_for_each(pos, &asoc->peer.transport_addr_list) {
+ trans = list_entry(pos, struct sctp_transport,
+ transports);
+ sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+ hb_change, pmtud_change,
+ sackdelay_change);
+ }
}
- /* spp_pathmaxrxt contains the maximum number of retransmissions
- * before this address shall be considered unreachable.
- */
- if (params.spp_pathmaxrxt)
- trans->max_retrans = params.spp_pathmaxrxt;
+ return 0;
+}
+
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ * This options will get or set the delayed ack timer. The time is set
+ * in milliseconds. If the assoc_id is 0, then this sets or gets the
+ * endpoints default delayed ack timer value. If the assoc_id field is
+ * non-zero, then the set or get effects the specified association.
+ *
+ * struct sctp_assoc_value {
+ * sctp_assoc_t assoc_id;
+ * uint32_t assoc_value;
+ * };
+ *
+ * assoc_id - This parameter, indicates which association the
+ * user is preforming an action upon. Note that if
+ * this field's value is zero then the endpoints
+ * default value is changed (effecting future
+ * associations only).
+ *
+ * assoc_value - This parameter contains the number of milliseconds
+ * that the user is requesting the delayed ACK timer
+ * be set to. Note that this value is defined in
+ * the standard to be between 200 and 500 milliseconds.
+ *
+ * Note: a value of zero will leave the value alone,
+ * but disable SACK delay. A non-zero value will also
+ * enable SACK delay.
+ */
+static int sctp_setsockopt_delayed_ack_time(struct sock *sk,
+ char __user *optval, int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen != sizeof(struct sctp_assoc_value))
+ return - EINVAL;
+
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+
+ /* Validate value parameter. */
+ if (params.assoc_value > 500)
+ return -EINVAL;
+
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (params.assoc_value) {
+ if (asoc) {
+ asoc->sackdelay =
+ msecs_to_jiffies(params.assoc_value);
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_ENABLE;
+ } else {
+ sp->sackdelay = params.assoc_value;
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_ENABLE;
+ }
+ } else {
+ if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_DISABLE;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_DISABLE;
+ }
+ }
+
+ /* If change is for association, also apply to each transport. */
+ if (asoc) {
+ struct list_head *pos;
+
+ list_for_each(pos, &asoc->peer.transport_addr_list) {
+ trans = list_entry(pos, struct sctp_transport,
+ transports);
+ if (params.assoc_value) {
+ trans->sackdelay =
+ msecs_to_jiffies(params.assoc_value);
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_ENABLE;
+ } else {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_DISABLE;
+ }
+ }
+ }
+
return 0;
}
@@ -2334,7 +2606,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl
/* Update the frag_point of the existing associations. */
list_for_each(pos, &(sp->ep->asocs)) {
asoc = list_entry(pos, struct sctp_association, asocs);
- asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
+ asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
}
return 0;
@@ -2491,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
break;
+ case SCTP_DELAYED_ACK_TIME:
+ retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen);
+ break;
+
case SCTP_INITMSG:
retval = sctp_setsockopt_initmsg(sk, optval, optlen);
break;
@@ -2715,8 +2991,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
/* Default Peer Address Parameters. These defaults can
* be modified via SCTP_PEER_ADDR_PARAMS
*/
- sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval);
- sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path;
+ sp->hbinterval = jiffies_to_msecs(sctp_hb_interval);
+ sp->pathmaxrxt = sctp_max_retrans_path;
+ sp->pathmtu = 0; // allow default discovery
+ sp->sackdelay = sctp_sack_timeout;
+ sp->param_flags = SPP_HB_ENABLE |
+ SPP_PMTUD_ENABLE |
+ SPP_SACKDELAY_ENABLE;
/* If enabled no SCTP message fragmentation will be performed.
* Configure through SCTP_DISABLE_FRAGMENTS socket option.
@@ -2865,7 +3146,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
status.sstat_primary.spinfo_cwnd = transport->cwnd;
status.sstat_primary.spinfo_srtt = transport->srtt;
status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
- status.sstat_primary.spinfo_mtu = transport->pmtu;
+ status.sstat_primary.spinfo_mtu = transport->pathmtu;
if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
status.sstat_primary.spinfo_state = SCTP_ACTIVE;
@@ -2924,7 +3205,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
pinfo.spinfo_cwnd = transport->cwnd;
pinfo.spinfo_srtt = transport->srtt;
pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
- pinfo.spinfo_mtu = transport->pmtu;
+ pinfo.spinfo_mtu = transport->pathmtu;
if (pinfo.spinfo_state == SCTP_UNKNOWN)
pinfo.spinfo_state = SCTP_ACTIVE;
@@ -3086,69 +3367,227 @@ out:
* address's parameters:
*
* struct sctp_paddrparams {
- * sctp_assoc_t spp_assoc_id;
- * struct sockaddr_storage spp_address;
- * uint32_t spp_hbinterval;
- * uint16_t spp_pathmaxrxt;
- * };
- *
- * spp_assoc_id - (UDP style socket) This is filled in the application,
- * and identifies the association for this query.
+ * sctp_assoc_t spp_assoc_id;
+ * struct sockaddr_storage spp_address;
+ * uint32_t spp_hbinterval;
+ * uint16_t spp_pathmaxrxt;
+ * uint32_t spp_pathmtu;
+ * uint32_t spp_sackdelay;
+ * uint32_t spp_flags;
+ * };
+ *
+ * spp_assoc_id - (one-to-many style socket) This is filled in the
+ * application, and identifies the association for
+ * this query.
* spp_address - This specifies which address is of interest.
* spp_hbinterval - This contains the value of the heartbeat interval,
- * in milliseconds. A value of 0, when modifying the
- * parameter, specifies that the heartbeat on this
- * address should be disabled. A value of UINT32_MAX
- * (4294967295), when modifying the parameter,
- * specifies that a heartbeat should be sent
- * immediately to the peer address, and the current
- * interval should remain unchanged.
+ * in milliseconds. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
* spp_pathmaxrxt - This contains the maximum number of
* retransmissions before this address shall be
- * considered unreachable.
+ * considered unreachable. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmtu - When Path MTU discovery is disabled the value
+ * specified here will be the "fixed" path mtu.
+ * Note that if the spp_address field is empty
+ * then all associations on this address will
+ * have this fixed path mtu set upon them.
+ *
+ * spp_sackdelay - When delayed sack is enabled, this value specifies
+ * the number of milliseconds that sacks will be delayed
+ * for. This value will apply to all addresses of an
+ * association if the spp_address field is empty. Note
+ * also, that if delayed sack is enabled and this
+ * value is set to 0, no change is made to the last
+ * recorded delayed sack timer value.
+ *
+ * spp_flags - These flags are used to control various features
+ * on an association. The flag field may contain
+ * zero or more of the following options.
+ *
+ * SPP_HB_ENABLE - Enable heartbeats on the
+ * specified address. Note that if the address
+ * field is empty all addresses for the association
+ * have heartbeats enabled upon them.
+ *
+ * SPP_HB_DISABLE - Disable heartbeats on the
+ * speicifed address. Note that if the address
+ * field is empty all addresses for the association
+ * will have their heartbeats disabled. Note also
+ * that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ * mutually exclusive, only one of these two should
+ * be specified. Enabling both fields will have
+ * undetermined results.
+ *
+ * SPP_HB_DEMAND - Request a user initiated heartbeat
+ * to be made immediately.
+ *
+ * SPP_PMTUD_ENABLE - This field will enable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected.
+ *
+ * SPP_PMTUD_DISABLE - This field will disable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected. Not also that
+ * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ * exclusive. Enabling both will have undetermined
+ * results.
+ *
+ * SPP_SACKDELAY_ENABLE - Setting this flag turns
+ * on delayed sack. The time specified in spp_sackdelay
+ * is used to specify the sack delay for this address. Note
+ * that if spp_address is empty then all addresses will
+ * enable delayed sack and take on the sack delay
+ * value specified in spp_sackdelay.
+ * SPP_SACKDELAY_DISABLE - Setting this flag turns
+ * off delayed sack. If the spp_address field is blank then
+ * delayed sack is disabled for the entire association. Note
+ * also that this field is mutually exclusive to
+ * SPP_SACKDELAY_ENABLE, setting both will have undefined
+ * results.
*/
static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
+ char __user *optval, int __user *optlen)
{
- struct sctp_paddrparams params;
- struct sctp_transport *trans;
+ struct sctp_paddrparams params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
if (len != sizeof(struct sctp_paddrparams))
return -EINVAL;
+
if (copy_from_user(&params, optval, len))
return -EFAULT;
- /* If no association id is specified retrieve the default value
- * for the endpoint that will be used for all future associations
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
*/
- if (!params.spp_assoc_id &&
- sctp_is_any(( union sctp_addr *)&params.spp_address)) {
- params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval;
- params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt;
-
- goto done;
+ if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+ trans = sctp_addr_id2transport(sk, &params.spp_address,
+ params.spp_assoc_id);
+ if (!trans) {
+ SCTP_DEBUG_PRINTK("Failed no transport\n");
+ return -EINVAL;
+ }
}
- trans = sctp_addr_id2transport(sk, &params.spp_address,
- params.spp_assoc_id);
- if (!trans)
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+ if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
+ SCTP_DEBUG_PRINTK("Failed no association\n");
return -EINVAL;
+ }
- /* The value of the heartbeat interval, in milliseconds. A value of 0,
- * when modifying the parameter, specifies that the heartbeat on this
- * address should be disabled.
- */
- if (!trans->hb_allowed)
- params.spp_hbinterval = 0;
- else
- params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval);
+ if (trans) {
+ /* Fetch transport values. */
+ params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
+ params.spp_pathmtu = trans->pathmtu;
+ params.spp_pathmaxrxt = trans->pathmaxrxt;
+ params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay);
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = trans->param_flags;
+ } else if (asoc) {
+ /* Fetch association values. */
+ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
+ params.spp_pathmtu = asoc->pathmtu;
+ params.spp_pathmaxrxt = asoc->pathmaxrxt;
+ params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay);
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = asoc->param_flags;
+ } else {
+ /* Fetch socket values. */
+ params.spp_hbinterval = sp->hbinterval;
+ params.spp_pathmtu = sp->pathmtu;
+ params.spp_sackdelay = sp->sackdelay;
+ params.spp_pathmaxrxt = sp->pathmaxrxt;
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = sp->param_flags;
+ }
- /* spp_pathmaxrxt contains the maximum number of retransmissions
- * before this address shall be considered unreachable.
- */
- params.spp_pathmaxrxt = trans->max_retrans;
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ * This options will get or set the delayed ack timer. The time is set
+ * in milliseconds. If the assoc_id is 0, then this sets or gets the
+ * endpoints default delayed ack timer value. If the assoc_id field is
+ * non-zero, then the set or get effects the specified association.
+ *
+ * struct sctp_assoc_value {
+ * sctp_assoc_t assoc_id;
+ * uint32_t assoc_value;
+ * };
+ *
+ * assoc_id - This parameter, indicates which association the
+ * user is preforming an action upon. Note that if
+ * this field's value is zero then the endpoints
+ * default value is changed (effecting future
+ * associations only).
+ *
+ * assoc_value - This parameter contains the number of milliseconds
+ * that the user is requesting the delayed ACK timer
+ * be set to. Note that this value is defined in
+ * the standard to be between 200 and 500 milliseconds.
+ *
+ * Note: a value of zero will leave the value alone,
+ * but disable SACK delay. A non-zero value will also
+ * enable SACK delay.
+ */
+static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (len != sizeof(struct sctp_assoc_value))
+ return - EINVAL;
+
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc) {
+ /* Fetch association values. */
+ if (asoc->param_flags & SPP_SACKDELAY_ENABLE)
+ params.assoc_value = jiffies_to_msecs(
+ asoc->sackdelay);
+ else
+ params.assoc_value = 0;
+ } else {
+ /* Fetch socket values. */
+ if (sp->param_flags & SPP_SACKDELAY_ENABLE)
+ params.assoc_value = sp->sackdelay;
+ else
+ params.assoc_value = 0;
+ }
-done:
if (copy_to_user(optval, &params, len))
return -EFAULT;
@@ -4015,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
optlen);
break;
+ case SCTP_DELAYED_ACK_TIME:
+ retval = sctp_getsockopt_delayed_ack_time(sk, len, optval,
+ optlen);
+ break;
case SCTP_INITMSG:
retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
break;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 268ddaf2dc0f..68d73e2dd155 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
peer->init_sent_count = 0;
peer->state = SCTP_ACTIVE;
- peer->hb_allowed = 0;
+ peer->param_flags = SPP_HB_DISABLE |
+ SPP_PMTUD_ENABLE |
+ SPP_SACKDELAY_ENABLE;
+ peer->hbinterval = 0;
/* Initialize the default path max_retrans. */
- peer->max_retrans = sctp_max_retrans_path;
+ peer->pathmaxrxt = sctp_max_retrans_path;
peer->error_count = 0;
INIT_LIST_HEAD(&peer->transmitted);
@@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport)
dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL);
if (dst) {
- transport->pmtu = dst_mtu(dst);
+ transport->pathmtu = dst_mtu(dst);
dst_release(dst);
} else
- transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
+ transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
/* Caches the dst entry and source address for a transport's destination
@@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport,
af->get_saddr(asoc, dst, daddr, &transport->saddr);
transport->dst = dst;
+ if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
+ return;
+ }
if (dst) {
- transport->pmtu = dst_mtu(dst);
+ transport->pathmtu = dst_mtu(dst);
/* Initialize sk->sk_rcv_saddr, if the transport is the
* association's active path for getsockname().
@@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport,
opt->pf->af->to_sk_saddr(&transport->saddr,
asoc->base.sk);
} else
- transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
+ transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
/* Hold a reference to a transport. */
@@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
ssthresh = transport->ssthresh;
pba = transport->partial_bytes_acked;
- pmtu = transport->asoc->pmtu;
+ pmtu = transport->asoc->pathmtu;
if (cwnd <= ssthresh) {
/* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
@@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* partial_bytes_acked = 0
*/
transport->ssthresh = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
- transport->cwnd = transport->asoc->pmtu;
+ 4*transport->asoc->pathmtu);
+ transport->cwnd = transport->asoc->pathmtu;
break;
case SCTP_LOWER_CWND_FAST_RTX:
@@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* partial_bytes_acked = 0
*/
transport->ssthresh = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
+ 4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
break;
@@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
if ((jiffies - transport->last_time_ecne_reduced) >
transport->rtt) {
transport->ssthresh = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
+ 4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
transport->last_time_ecne_reduced = jiffies;
}
@@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
*/
if ((jiffies - transport->last_time_used) > transport->rto)
transport->cwnd = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
+ 4*transport->asoc->pathmtu);
break;
};
@@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
unsigned long sctp_transport_timeout(struct sctp_transport *t)
{
unsigned long timeout;
- timeout = t->hb_interval + t->rto + sctp_jitter(t->rto);
+ timeout = t->hbinterval + t->rto + sctp_jitter(t->rto);
timeout += jiffies;
return timeout;
}
diff --git a/net/socket.c b/net/socket.c
index 3145103cdf54..06fa217f58a9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb)
kfree(iocb->private);
}
-/*
- * Read data from a socket. ubuf is a user mode pointer. We make sure the user
- * area ubuf...ubuf+size-1 is writable before asking the protocol.
- */
-
-static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
- size_t size, loff_t pos)
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+ int offset, size_t size, loff_t *ppos, int more)
{
- struct sock_iocb *x, siocb;
struct socket *sock;
int flags;
- if (pos != 0)
- return -ESPIPE;
- if (size==0) /* Match SYS5 behaviour */
- return 0;
+ sock = file->private_data;
- if (is_sync_kiocb(iocb))
- x = &siocb;
- else {
- x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
- if (!x)
- return -ENOMEM;
+ flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+ if (more)
+ flags |= MSG_MORE;
+
+ return sock->ops->sendpage(sock, page, offset, size, flags);
+}
+
+static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
+ char __user *ubuf, size_t size, struct sock_iocb *siocb)
+{
+ if (!is_sync_kiocb(iocb)) {
+ siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
+ if (!siocb)
+ return NULL;
iocb->ki_dtor = sock_aio_dtor;
}
- iocb->private = x;
- x->kiocb = iocb;
- sock = iocb->ki_filp->private_data;
- x->async_msg.msg_name = NULL;
- x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
- x->async_msg.msg_control = NULL;
- x->async_msg.msg_controllen = 0;
- x->async_iov.iov_base = ubuf;
- x->async_iov.iov_len = size;
- flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+ siocb->kiocb = iocb;
+ siocb->async_iov.iov_base = ubuf;
+ siocb->async_iov.iov_len = size;
- return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags);
+ iocb->private = siocb;
+ return siocb;
}
+static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
+ struct file *file, struct iovec *iov, unsigned long nr_segs)
+{
+ struct socket *sock = file->private_data;
+ size_t size = 0;
+ int i;
-/*
- * Write data to a socket. We verify that the user area ubuf..ubuf+size-1
- * is readable by the user process.
- */
+ for (i = 0 ; i < nr_segs ; i++)
+ size += iov[i].iov_len;
-static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
- size_t size, loff_t pos)
+ msg->msg_name = NULL;
+ msg->msg_namelen = 0;
+ msg->msg_control = NULL;
+ msg->msg_controllen = 0;
+ msg->msg_iov = (struct iovec *) iov;
+ msg->msg_iovlen = nr_segs;
+ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+
+ return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
+}
+
+static ssize_t sock_readv(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
- struct sock_iocb *x, siocb;
- struct socket *sock;
-
+ struct kiocb iocb;
+ struct sock_iocb siocb;
+ struct msghdr msg;
+ int ret;
+
+ init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
+
+ ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&iocb);
+ return ret;
+}
+
+static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
+ size_t count, loff_t pos)
+{
+ struct sock_iocb siocb, *x;
+
if (pos != 0)
return -ESPIPE;
- if(size==0) /* Match SYS5 behaviour */
+ if (count == 0) /* Match SYS5 behaviour */
return 0;
- if (is_sync_kiocb(iocb))
- x = &siocb;
- else {
- x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
- if (!x)
- return -ENOMEM;
- iocb->ki_dtor = sock_aio_dtor;
- }
- iocb->private = x;
- x->kiocb = iocb;
- sock = iocb->ki_filp->private_data;
-
- x->async_msg.msg_name = NULL;
- x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
- x->async_msg.msg_control = NULL;
- x->async_msg.msg_controllen = 0;
- x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
- if (sock->type == SOCK_SEQPACKET)
- x->async_msg.msg_flags |= MSG_EOR;
- x->async_iov.iov_base = (void __user *)ubuf;
- x->async_iov.iov_len = size;
-
- return __sock_sendmsg(iocb, sock, &x->async_msg, size);
+ x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
+ if (!x)
+ return -ENOMEM;
+ return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
+ &x->async_iov, 1);
}
-static ssize_t sock_sendpage(struct file *file, struct page *page,
- int offset, size_t size, loff_t *ppos, int more)
+static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
+ struct file *file, struct iovec *iov, unsigned long nr_segs)
{
- struct socket *sock;
- int flags;
+ struct socket *sock = file->private_data;
+ size_t size = 0;
+ int i;
- sock = file->private_data;
+ for (i = 0 ; i < nr_segs ; i++)
+ size += iov[i].iov_len;
- flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
- if (more)
- flags |= MSG_MORE;
+ msg->msg_name = NULL;
+ msg->msg_namelen = 0;
+ msg->msg_control = NULL;
+ msg->msg_controllen = 0;
+ msg->msg_iov = (struct iovec *) iov;
+ msg->msg_iovlen = nr_segs;
+ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+ if (sock->type == SOCK_SEQPACKET)
+ msg->msg_flags |= MSG_EOR;
- return sock->ops->sendpage(sock, page, offset, size, flags);
+ return __sock_sendmsg(iocb, sock, msg, size);
}
-static int sock_readv_writev(int type,
- struct file * file, const struct iovec * iov,
- long count, size_t size)
+static ssize_t sock_writev(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
struct msghdr msg;
- struct socket *sock;
+ struct kiocb iocb;
+ struct sock_iocb siocb;
+ int ret;
- sock = file->private_data;
+ init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_iov = (struct iovec *) iov;
- msg.msg_iovlen = count;
- msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+ ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&iocb);
+ return ret;
+}
- /* read() does a VERIFY_WRITE */
- if (type == VERIFY_WRITE)
- return sock_recvmsg(sock, &msg, size, msg.msg_flags);
+static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
+ size_t count, loff_t pos)
+{
+ struct sock_iocb siocb, *x;
- if (sock->type == SOCK_SEQPACKET)
- msg.msg_flags |= MSG_EOR;
+ if (pos != 0)
+ return -ESPIPE;
+ if (count == 0) /* Match SYS5 behaviour */
+ return 0;
- return sock_sendmsg(sock, &msg, size);
-}
+ x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
+ if (!x)
+ return -ENOMEM;
-static ssize_t sock_readv(struct file *file, const struct iovec *vector,
- unsigned long count, loff_t *ppos)
-{
- size_t tot_len = 0;
- int i;
- for (i = 0 ; i < count ; i++)
- tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_WRITE,
- file, vector, count, tot_len);
-}
-
-static ssize_t sock_writev(struct file *file, const struct iovec *vector,
- unsigned long count, loff_t *ppos)
-{
- size_t tot_len = 0;
- int i;
- for (i = 0 ; i < count ; i++)
- tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_READ,
- file, vector, count, tot_len);
+ return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
+ &x->async_iov, 1);
}
@@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
default:
err = sock->ops->ioctl(sock, cmd, arg);
+
+ /*
+ * If this ioctl is unknown try to hand it down
+ * to the NIC driver.
+ */
+ if (err == -ENOIOCTLCMD)
+ err = dev_ioctl(cmd, argp);
break;
}
return err;
@@ -2036,7 +2039,7 @@ int sock_unregister(int family)
return 0;
}
-void __init sock_init(void)
+static int __init sock_init(void)
{
/*
* Initialize sock SLAB cache.
@@ -2044,12 +2047,10 @@ void __init sock_init(void)
sk_init();
-#ifdef SLAB_SKB
/*
* Initialize skbuff SLAB cache
*/
skb_init();
-#endif
/*
* Initialize the protocols module.
@@ -2058,15 +2059,19 @@ void __init sock_init(void)
init_inodecache();
register_filesystem(&sock_fs_type);
sock_mnt = kern_mount(&sock_fs_type);
- /* The real protocol initialization is performed when
- * do_initcalls is run.
+
+ /* The real protocol initialization is performed in later initcalls.
*/
#ifdef CONFIG_NETFILTER
netfilter_init();
#endif
+
+ return 0;
}
+core_initcall(sock_init); /* early initcall */
+
#ifdef CONFIG_PROC_FS
void socket_seq_show(struct seq_file *seq)
{
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c6a51911e71e..d68eba481291 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk)
struct svc_serv *serv = svsk->sk_server;
struct socket *sock = svsk->sk_sock;
struct socket *newsock;
- struct proto_ops *ops;
+ const struct proto_ops *ops;
struct svc_sock *newsvsk;
int err, slen;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index acc73ba8bade..5f6ae79b8b16 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -121,7 +121,7 @@
int sysctl_unix_max_dgram_qlen = 10;
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-DEFINE_RWLOCK(unix_table_lock);
+DEFINE_SPINLOCK(unix_table_lock);
static atomic_t unix_nr_socks = ATOMIC_INIT(0);
#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
@@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
/*
* SMP locking strategy:
- * hash table is protected with rwlock unix_table_lock
+ * hash table is protected with spinlock unix_table_lock
* each socket state is protected by separate rwlock.
*/
@@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
static inline void unix_remove_socket(struct sock *sk)
{
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
__unix_remove_socket(sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
__unix_insert_socket(list, sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
@@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
{
struct sock *s;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
s = __unix_find_socket_byname(sunname, len, type, hash);
if (s)
sock_hold(s);
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
return s;
}
@@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
struct sock *s;
struct hlist_node *node;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
sk_for_each(s, node,
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->dentry;
@@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
}
s = NULL;
found:
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
return s;
}
@@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *,
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
struct msghdr *, size_t);
-static struct proto_ops unix_stream_ops = {
+static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
@@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops unix_dgram_ops = {
+static const struct proto_ops unix_dgram_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
@@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops unix_seqpacket_ops = {
+static const struct proto_ops unix_seqpacket_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
@@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock)
u = unix_sk(sk);
u->dentry = NULL;
u->mnt = NULL;
- rwlock_init(&u->lock);
+ spin_lock_init(&u->lock);
atomic_set(&u->inflight, sock ? 0 : -1);
init_MUTEX(&u->readsem); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
@@ -642,12 +642,12 @@ retry:
addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
ordernum = (ordernum+1)&0xFFFFF;
if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
addr->hash)) {
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
/* Sanity yield. It is unusual case, but yet... */
if (!(ordernum&0xFF))
yield();
@@ -658,7 +658,7 @@ retry:
__unix_remove_socket(sk);
u->addr = addr;
__unix_insert_socket(&unix_socket_table[addr->hash], sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
err = 0;
out: up(&u->readsem);
@@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
addr->hash = UNIX_HASH_SIZE;
}
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
if (!sunaddr->sun_path[0]) {
err = -EADDRINUSE;
@@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
__unix_insert_socket(list, sk);
out_unlock:
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
out_up:
up(&u->readsem);
out:
@@ -1063,10 +1063,12 @@ restart:
/* Set credentials */
sk->sk_peercred = other->sk_peercred;
- sock_hold(newsk);
- unix_peer(sk) = newsk;
sock->state = SS_CONNECTED;
sk->sk_state = TCP_ESTABLISHED;
+ sock_hold(newsk);
+
+ smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */
+ unix_peer(sk) = newsk;
unix_state_wunlock(sk);
@@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
} else {
sunaddr = NULL;
err = -ENOTCONN;
- other = unix_peer_get(sk);
+ other = unix_peer(sk);
if (!other)
goto out_err;
}
@@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
other->sk_data_ready(other, size);
sent+=size;
}
- sock_put(other);
scm_destroy(siocb->scm);
siocb->scm = NULL;
@@ -1491,8 +1492,6 @@ pipe_err:
send_sig(SIGPIPE,current,0);
err = -EPIPE;
out_err:
- if (other)
- sock_put(other);
scm_destroy(siocb->scm);
siocb->scm = NULL;
return sent ? : err;
@@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
default:
- err = dev_ioctl(cmd, (void __user *)arg);
+ err = -ENOIOCTLCMD;
break;
}
return err;
@@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos)
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
}
@@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void unix_seq_stop(struct seq_file *seq, void *v)
{
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6ffc64e1712d..411802bd4d37 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -182,7 +182,7 @@ void unix_gc(void)
if (down_trylock(&unix_gc_sem))
return;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
forall_unix_sockets(i, s)
{
@@ -301,7 +301,7 @@ void unix_gc(void)
}
u->gc_tree = GC_ORPHAN;
}
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
/*
* Here we are. Hitlist is filled. Die.
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 59fec59b2132..7a43ae4721ed 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -181,7 +181,7 @@ struct wanpipe_opt
#endif
static int sk_count;
-extern struct proto_ops wanpipe_ops;
+extern const struct proto_ops wanpipe_ops;
static unsigned long find_free_critical;
static void wanpipe_unlink_driver(struct sock *sk);
@@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar
#endif
default:
- return dev_ioctl(cmd,(void __user *) arg);
+ return -ENOIOCTLCMD;
}
/*NOTREACHED*/
}
@@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr
return 0;
}
-struct proto_ops wanpipe_ops = {
+const struct proto_ops wanpipe_ops = {
.family = PF_WANPIPE,
.owner = THIS_MODULE,
.release = wanpipe_release,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 020d73cc8414..16459c7f54b2 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2;
HLIST_HEAD(x25_list);
DEFINE_RWLOCK(x25_list_lock);
-static struct proto_ops x25_proto_ops;
+static const struct proto_ops x25_proto_ops;
static struct x25_address null_x25_address = {" "};
@@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
default:
- rc = dev_ioctl(cmd, argp);
+ rc = -ENOIOCTLCMD;
break;
}
@@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
.family = AF_X25,
.owner = THIS_MODULE,
.release = x25_release,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d19e274b9c4a..64a447375fdb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -10,7 +10,7 @@
* YOSHIFUJI Hideaki
* Split up af-specific portion
* Derek Atkins <derek@ihtfp.com> Add the post_input processor
- *
+ *
*/
#include <asm/bug.h>
@@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy)
if (del_timer(&policy->timer))
BUG();
+ security_xfrm_policy_free(policy);
kfree(policy);
}
EXPORT_SYMBOL(__xfrm_policy_destroy);
@@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
write_lock_bh(&xfrm_policy_lock);
for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
- if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
+ if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
+ xfrm_sec_ctx_match(pol->security, policy->security)) {
if (excl) {
write_unlock_bh(&xfrm_policy_lock);
return -EEXIST;
@@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
- int delete)
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx, int delete)
{
struct xfrm_policy *pol, **p;
write_lock_bh(&xfrm_policy_lock);
for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
- if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
+ if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
+ (xfrm_sec_ctx_match(ctx, pol->security))) {
xfrm_pol_hold(pol);
if (delete)
*p = pol->next;
@@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
}
return pol;
}
-EXPORT_SYMBOL(xfrm_policy_bysel);
+EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
{
@@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
/* Find policy to apply to this flow. */
-static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
+static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
void **objp, atomic_t **obj_refp)
{
struct xfrm_policy *pol;
@@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
continue;
match = xfrm_selector_match(sel, fl, family);
+
if (match) {
- xfrm_pol_hold(pol);
- break;
+ if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
+ xfrm_pol_hold(pol);
+ break;
+ }
}
}
read_unlock_bh(&xfrm_policy_lock);
@@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
*obj_refp = &pol->refcnt;
}
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
+static inline int policy_to_flow_dir(int dir)
+{
+ if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ return dir;
+ switch (dir) {
+ default:
+ case XFRM_POLICY_IN:
+ return FLOW_DIR_IN;
+ case XFRM_POLICY_OUT:
+ return FLOW_DIR_OUT;
+ case XFRM_POLICY_FWD:
+ return FLOW_DIR_FWD;
+ };
+}
+
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
{
struct xfrm_policy *pol;
read_lock_bh(&xfrm_policy_lock);
if ((pol = sk->sk_policy[dir]) != NULL) {
- int match = xfrm_selector_match(&pol->selector, fl,
+ int match = xfrm_selector_match(&pol->selector, fl,
sk->sk_family);
+ int err = 0;
+
if (match)
+ err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
+
+ if (match && !err)
xfrm_pol_hold(pol);
else
pol = NULL;
@@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
if (newp) {
newp->selector = old->selector;
+ if (security_xfrm_policy_clone(old, newp)) {
+ kfree(newp);
+ return NULL; /* ENOMEM */
+ }
newp->lft = old->lft;
newp->curlft = old->curlft;
newp->action = old->action;
@@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
return err;
}
-static inline int policy_to_flow_dir(int dir)
-{
- if (XFRM_POLICY_IN == FLOW_DIR_IN &&
- XFRM_POLICY_OUT == FLOW_DIR_OUT &&
- XFRM_POLICY_FWD == FLOW_DIR_FWD)
- return dir;
- switch (dir) {
- default:
- case XFRM_POLICY_IN:
- return FLOW_DIR_IN;
- case XFRM_POLICY_OUT:
- return FLOW_DIR_OUT;
- case XFRM_POLICY_FWD:
- return FLOW_DIR_FWD;
- };
-}
static int stale_bundle(struct dst_entry *dst);
@@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
int err;
u32 genid;
u16 family = dst_orig->ops->family;
+ u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+ u32 sk_sid = security_sk_sid(sk, fl, dir);
restart:
genid = atomic_read(&flow_cache_genid);
policy = NULL;
if (sk && sk->sk_policy[1])
- policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+ policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
if (!policy) {
/* To accelerate a bit... */
if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
return 0;
- policy = flow_cache_lookup(fl, family,
- policy_to_flow_dir(XFRM_POLICY_OUT),
+ policy = flow_cache_lookup(fl, sk_sid, family, dir,
xfrm_policy_lookup);
}
@@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
{
struct xfrm_policy *pol;
struct flowi fl;
+ u8 fl_dir = policy_to_flow_dir(dir);
+ u32 sk_sid;
if (_decode_session(skb, &fl, family) < 0)
return 0;
+ sk_sid = security_sk_sid(sk, &fl, fl_dir);
+
/* First, check used SA against their selectors. */
if (skb->sp) {
int i;
for (i=skb->sp->len-1; i>=0; i--) {
- struct sec_decap_state *xvec = &(skb->sp->x[i]);
+ struct sec_decap_state *xvec = &(skb->sp->x[i]);
if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
return 0;
@@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
pol = NULL;
if (sk && sk->sk_policy[dir])
- pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+ pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
if (!pol)
- pol = flow_cache_lookup(&fl, family,
- policy_to_flow_dir(dir),
+ pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
xfrm_policy_lookup);
if (!pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 479effc97666..e12d0be5f976 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -10,7 +10,7 @@
* Split up af-specific functions
* Derek Atkins <derek@ihtfp.com>
* Add UDP Encapsulation
- *
+ *
*/
#include <linux/workqueue.h>
@@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
x->type->destructor(x);
xfrm_put_type(x->type);
}
+ security_xfrm_state_free(x);
kfree(x);
}
@@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
selector.
*/
if (x->km.state == XFRM_STATE_VALID) {
- if (!xfrm_selector_match(&x->sel, fl, family))
+ if (!xfrm_selector_match(&x->sel, fl, family) ||
+ !xfrm_sec_ctx_match(pol->security, x->security))
continue;
if (!best ||
best->km.dying > x->km.dying ||
@@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
acquire_in_progress = 1;
} else if (x->km.state == XFRM_STATE_ERROR ||
x->km.state == XFRM_STATE_EXPIRED) {
- if (xfrm_selector_match(&x->sel, fl, family))
+ if (xfrm_selector_match(&x->sel, fl, family) &&
+ xfrm_sec_ctx_match(pol->security, x->security))
error = -ESRCH;
}
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0cdd9a07e043..92e2b804c606 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -7,7 +7,7 @@
* Kazunori MIYAZAWA @USAGI
* Kunihiro Ishiguro <kunihiro@ipinfusion.com>
* IPv6 support
- *
+ *
*/
#include <linux/module.h>
@@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
return 0;
}
+
+static inline int verify_sec_ctx_len(struct rtattr **xfrma)
+{
+ struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1];
+ struct xfrm_user_sec_ctx *uctx;
+ int len = 0;
+
+ if (!rt)
+ return 0;
+
+ if (rt->rta_len < sizeof(*uctx))
+ return -EINVAL;
+
+ uctx = RTA_DATA(rt);
+
+ if (uctx->ctx_len > PAGE_SIZE)
+ return -EINVAL;
+
+ len += sizeof(struct xfrm_user_sec_ctx);
+ len += uctx->ctx_len;
+
+ if (uctx->len != len)
+ return -EINVAL;
+
+ return 0;
+}
+
+
static int verify_newsa_info(struct xfrm_usersa_info *p,
struct rtattr **xfrma)
{
@@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
goto out;
if ((err = verify_encap_tmpl(xfrma)))
goto out;
+ if ((err = verify_sec_ctx_len(xfrma)))
+ goto out;
err = -EINVAL;
switch (p->mode) {
@@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a
return 0;
}
+
+static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp)
+{
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+ int len = 0;
+
+ if (xfrm_ctx) {
+ len += sizeof(struct xfrm_user_sec_ctx);
+ len += xfrm_ctx->ctx_len;
+ }
+ return len;
+}
+
+static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
+{
+ struct xfrm_user_sec_ctx *uctx;
+
+ if (!u_arg)
+ return 0;
+
+ uctx = RTA_DATA(u_arg);
+ return security_xfrm_state_alloc(x, uctx);
+}
+
static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
{
memcpy(&x->id, &p->id, sizeof(x->id));
@@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
if (err)
goto error;
+ if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1])))
+ goto error;
+
x->km.seq = p->seq;
return x;
@@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
int err;
struct km_event c;
- err = verify_newsa_info(p, (struct rtattr **) xfrma);
+ err = verify_newsa_info(p, (struct rtattr **)xfrma);
if (err)
return err;
- x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err);
+ x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err);
if (!x)
return err;
@@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
if (x->encap)
RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+ if (x->security) {
+ int ctx_size = sizeof(struct xfrm_sec_ctx) +
+ x->security->ctx_len;
+ struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+ struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+ uctx->exttype = XFRMA_SEC_CTX;
+ uctx->len = ctx_size;
+ uctx->ctx_doi = x->security->ctx_doi;
+ uctx->ctx_alg = x->security->ctx_alg;
+ uctx->ctx_len = x->security->ctx_len;
+ memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
+ }
nlh->nlmsg_len = skb->tail - b;
out:
sp->this_idx++;
@@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
return verify_policy_dir(p->dir);
}
+static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma)
+{
+ struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1];
+ struct xfrm_user_sec_ctx *uctx;
+
+ if (!rt)
+ return 0;
+
+ uctx = RTA_DATA(rt);
+ return security_xfrm_policy_alloc(pol, uctx);
+}
+
static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
int nr)
{
@@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
}
copy_from_user_policy(xp, p);
- err = copy_from_user_tmpl(xp, xfrma);
+
+ if (!(err = copy_from_user_tmpl(xp, xfrma)))
+ err = copy_from_user_sec_ctx(xp, xfrma);
+
if (err) {
*errp = err;
kfree(xp);
@@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
err = verify_newpolicy_info(p);
if (err)
return err;
+ err = verify_sec_ctx_len((struct rtattr **)xfrma);
+ if (err)
+ return err;
- xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err);
+ xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err);
if (!xp)
return err;
@@ -761,6 +849,27 @@ rtattr_failure:
return -1;
}
+static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+ if (xp->security) {
+ int ctx_size = sizeof(struct xfrm_sec_ctx) +
+ xp->security->ctx_len;
+ struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+ struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+ uctx->exttype = XFRMA_SEC_CTX;
+ uctx->len = ctx_size;
+ uctx->ctx_doi = xp->security->ctx_doi;
+ uctx->ctx_alg = xp->security->ctx_alg;
+ uctx->ctx_len = xp->security->ctx_len;
+ memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len);
+ }
+ return 0;
+
+ rtattr_failure:
+ return -1;
+}
+
static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
{
struct xfrm_dump_info *sp = ptr;
@@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
copy_to_user_policy(xp, p, dir);
if (copy_to_user_tmpl(xp, skb) < 0)
goto nlmsg_failure;
+ if (copy_to_user_sec_ctx(xp, skb))
+ goto nlmsg_failure;
nlh->nlmsg_len = skb->tail - b;
out:
@@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
if (p->index)
xp = xfrm_policy_byid(p->dir, p->index, delete);
- else
- xp = xfrm_policy_bysel(p->dir, &p->sel, delete);
+ else {
+ struct rtattr **rtattrs = (struct rtattr **)xfrma;
+ struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
+ struct xfrm_policy tmp;
+
+ err = verify_sec_ctx_len(rtattrs);
+ if (err)
+ return err;
+
+ memset(&tmp, 0, sizeof(struct xfrm_policy));
+ if (rt) {
+ struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+ if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
+ return err;
+ }
+ xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
+ security_xfrm_policy_free(&tmp);
+ }
if (xp == NULL)
return -ENOENT;
@@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
if (copy_to_user_tmpl(xp, skb) < 0)
goto nlmsg_failure;
+ if (copy_to_user_sec_ctx(xp, skb))
+ goto nlmsg_failure;
nlh->nlmsg_len = skb->tail - b;
return skb->len;
@@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
+ len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
skb = alloc_skb(len, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
copy_to_user_policy(xp, &upe->pol, dir);
if (copy_to_user_tmpl(xp, skb) < 0)
goto nlmsg_failure;
+ if (copy_to_user_sec_ctx(xp, skb))
+ goto nlmsg_failure;
upe->hard = !!hard;
nlh->nlmsg_len = skb->tail - b;
@@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
+ len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
skb = alloc_skb(len, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
diff --git a/security/Kconfig b/security/Kconfig
index 64d3f1e9ca85..34f593410d57 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,19 @@ config SECURITY_NETWORK
implement socket and networking access controls.
If you are unsure how to answer this question, answer N.
+config SECURITY_NETWORK_XFRM
+ bool "XFRM (IPSec) Networking Security Hooks"
+ depends on XFRM && SECURITY_NETWORK
+ help
+ This enables the XFRM (IPSec) networking security hooks.
+ If enabled, a security module can use these hooks to
+ implement per-packet access controls based on labels
+ derived from IPSec policy. Non-IPSec communications are
+ designated as unlabelled, and only sockets authorized
+ to communicate unlabelled data can send without using
+ IPSec.
+ If you are unsure how to answer this question, answer N.
+
config SECURITY_CAPABILITIES
tristate "Default Linux Capabilities"
depends on SECURITY
diff --git a/security/dummy.c b/security/dummy.c
index 3ca5f2b828a0..a15c54709fde 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -776,8 +776,42 @@ static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t pr
static inline void dummy_sk_free_security (struct sock *sk)
{
}
+
+static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+ return 0;
+}
#endif /* CONFIG_SECURITY_NETWORK */
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+ return 0;
+}
+
+static inline int dummy_xfrm_policy_clone_security(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+ return 0;
+}
+
+static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp)
+{
+}
+
+static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+ return 0;
+}
+
+static void dummy_xfrm_state_free_security(struct xfrm_state *x)
+{
+}
+
+static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+ return 0;
+}
+#endif /* CONFIG_SECURITY_NETWORK_XFRM */
static int dummy_register_security (const char *name, struct security_operations *ops)
{
return -EINVAL;
@@ -970,7 +1004,16 @@ void security_fixup_ops (struct security_operations *ops)
set_to_dummy_if_null(ops, socket_getpeersec);
set_to_dummy_if_null(ops, sk_alloc_security);
set_to_dummy_if_null(ops, sk_free_security);
-#endif /* CONFIG_SECURITY_NETWORK */
+ set_to_dummy_if_null(ops, sk_getsid);
+ #endif /* CONFIG_SECURITY_NETWORK */
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+ set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
+ set_to_dummy_if_null(ops, xfrm_policy_clone_security);
+ set_to_dummy_if_null(ops, xfrm_policy_free_security);
+ set_to_dummy_if_null(ops, xfrm_state_alloc_security);
+ set_to_dummy_if_null(ops, xfrm_state_free_security);
+ set_to_dummy_if_null(ops, xfrm_policy_lookup);
+#endif /* CONFIG_SECURITY_NETWORK_XFRM */
#ifdef CONFIG_KEYS
set_to_dummy_if_null(ops, key_alloc);
set_to_dummy_if_null(ops, key_free);
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index b038cd0fae2e..06d54d9d20a5 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o
selinux-$(CONFIG_SECURITY_NETWORK) += netif.o
+selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
+
EXTRA_CFLAGS += -Isecurity/selinux/include
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fc774436a264..3d496eae1b47 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -73,6 +73,7 @@
#include "avc.h"
#include "objsec.h"
#include "netif.h"
+#include "xfrm.h"
#define XATTR_SELINUX_SUFFIX "selinux"
#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
@@ -3349,6 +3350,10 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
err = avc_has_perm(sock_sid, port_sid,
sock_class, recv_perm, &ad);
}
+
+ if (!err)
+ err = selinux_xfrm_sock_rcv_skb(sock_sid, skb);
+
out:
return err;
}
@@ -3401,6 +3406,24 @@ static void selinux_sk_free_security(struct sock *sk)
sk_free_security(sk);
}
+static unsigned int selinux_sk_getsid_security(struct sock *sk, struct flowi *fl, u8 dir)
+{
+ struct inode_security_struct *isec;
+ u32 sock_sid = SECINITSID_ANY_SOCKET;
+
+ if (!sk)
+ return selinux_no_sk_sid(fl);
+
+ read_lock_bh(&sk->sk_callback_lock);
+ isec = get_sock_isec(sk);
+
+ if (isec)
+ sock_sid = isec->sid;
+
+ read_unlock_bh(&sk->sk_callback_lock);
+ return sock_sid;
+}
+
static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb)
{
int err = 0;
@@ -3536,6 +3559,11 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
send_perm, &ad) ? NF_DROP : NF_ACCEPT;
}
+ if (err != NF_ACCEPT)
+ goto out;
+
+ err = selinux_xfrm_postroute_last(isec->sid, skb);
+
out:
return err;
}
@@ -4380,6 +4408,16 @@ static struct security_operations selinux_ops = {
.socket_getpeersec = selinux_socket_getpeersec,
.sk_alloc_security = selinux_sk_alloc_security,
.sk_free_security = selinux_sk_free_security,
+ .sk_getsid = selinux_sk_getsid_security,
+#endif
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+ .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc,
+ .xfrm_policy_clone_security = selinux_xfrm_policy_clone,
+ .xfrm_policy_free_security = selinux_xfrm_policy_free,
+ .xfrm_state_alloc_security = selinux_xfrm_state_alloc,
+ .xfrm_state_free_security = selinux_xfrm_state_free,
+ .xfrm_policy_lookup = selinux_xfrm_policy_lookup,
#endif
};
@@ -4491,6 +4529,7 @@ static int __init selinux_nf_ip_init(void)
panic("SELinux: nf_register_hook for IPv6: error %d\n", err);
#endif /* IPV6 */
+
out:
return err;
}
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 1deb59e1b762..71aeb12f07c8 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -238,3 +238,5 @@
S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost")
S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto")
S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom")
+ S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELFROM, "relabelfrom")
+ S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELTO, "relabelto")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index a78b5d59c9fc..d1d0996049e3 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -908,6 +908,8 @@
#define ASSOCIATION__SENDTO 0x00000001UL
#define ASSOCIATION__RECVFROM 0x00000002UL
+#define ASSOCIATION__RELABELFROM 0x00000004UL
+#define ASSOCIATION__RELABELTO 0x00000008UL
#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL
#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
new file mode 100644
index 000000000000..8e87996c6dd5
--- /dev/null
+++ b/security/selinux/include/xfrm.h
@@ -0,0 +1,54 @@
+/*
+ * SELinux support for the XFRM LSM hooks
+ *
+ * Author : Trent Jaeger, <jaegert@us.ibm.com>
+ */
+#ifndef _SELINUX_XFRM_H_
+#define _SELINUX_XFRM_H_
+
+int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
+void selinux_xfrm_policy_free(struct xfrm_policy *xp);
+int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+void selinux_xfrm_state_free(struct xfrm_state *x);
+int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+
+/*
+ * Extract the security blob from the sock (it's actually on the socket)
+ */
+static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
+{
+ if (!sk->sk_socket)
+ return NULL;
+
+ return SOCK_INODE(sk->sk_socket)->i_security;
+}
+
+
+static inline u32 selinux_no_sk_sid(struct flowi *fl)
+{
+ /* NOTE: no sock occurs on ICMP reply, forwards, ... */
+ /* icmp_reply: authorize as kernel packet */
+ if (fl && fl->proto == IPPROTO_ICMP) {
+ return SECINITSID_KERNEL;
+ }
+
+ return SECINITSID_ANY_SOCKET;
+}
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb);
+int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb);
+#else
+static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
+{
+ return NF_ACCEPT;
+}
+#endif
+
+#endif /* _SELINUX_XFRM_H_ */
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
new file mode 100644
index 000000000000..c4d87d4dca7b
--- /dev/null
+++ b/security/selinux/xfrm.c
@@ -0,0 +1,311 @@
+/*
+ * NSA Security-Enhanced Linux (SELinux) security module
+ *
+ * This file contains the SELinux XFRM hook function implementations.
+ *
+ * Authors: Serge Hallyn <sergeh@us.ibm.com>
+ * Trent Jaeger <jaegert@us.ibm.com>
+ *
+ * Copyright (C) 2005 International Business Machines Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+
+/*
+ * USAGE:
+ * NOTES:
+ * 1. Make sure to enable the following options in your kernel config:
+ * CONFIG_SECURITY=y
+ * CONFIG_SECURITY_NETWORK=y
+ * CONFIG_SECURITY_NETWORK_XFRM=y
+ * CONFIG_SECURITY_SELINUX=m/y
+ * ISSUES:
+ * 1. Caching packets, so they are not dropped during negotiation
+ * 2. Emulating a reasonable SO_PEERSEC across machines
+ * 3. Testing addition of sk_policy's with security context via setsockopt
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/xfrm.h>
+#include <net/xfrm.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <asm/semaphore.h>
+
+#include "avc.h"
+#include "objsec.h"
+#include "xfrm.h"
+
+
+/*
+ * Returns true if an LSM/SELinux context
+ */
+static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx)
+{
+ return (ctx &&
+ (ctx->ctx_doi == XFRM_SC_DOI_LSM) &&
+ (ctx->ctx_alg == XFRM_SC_ALG_SELINUX));
+}
+
+/*
+ * Returns true if the xfrm contains a security blob for SELinux
+ */
+static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
+{
+ return selinux_authorizable_ctx(x->security);
+}
+
+/*
+ * LSM hook implementation that authorizes that a socket can be used
+ * with the corresponding xfrm_sec_ctx and direction.
+ */
+int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+ int rc = 0;
+ u32 sel_sid = SECINITSID_UNLABELED;
+ struct xfrm_sec_ctx *ctx;
+
+ /* Context sid is either set to label or ANY_ASSOC */
+ if ((ctx = xp->security)) {
+ if (!selinux_authorizable_ctx(ctx))
+ return -EINVAL;
+
+ sel_sid = ctx->ctx_sid;
+ }
+
+ rc = avc_has_perm(sk_sid, sel_sid, SECCLASS_ASSOCIATION,
+ ((dir == FLOW_DIR_IN) ? ASSOCIATION__RECVFROM :
+ ((dir == FLOW_DIR_OUT) ? ASSOCIATION__SENDTO :
+ (ASSOCIATION__SENDTO | ASSOCIATION__RECVFROM))),
+ NULL);
+
+ return rc;
+}
+
+/*
+ * Security blob allocation for xfrm_policy and xfrm_state
+ * CTX does not have a meaningful value on input
+ */
+static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx)
+{
+ int rc = 0;
+ struct task_security_struct *tsec = current->security;
+ struct xfrm_sec_ctx *ctx;
+
+ BUG_ON(!uctx);
+ BUG_ON(uctx->ctx_doi != XFRM_SC_ALG_SELINUX);
+
+ if (uctx->ctx_len >= PAGE_SIZE)
+ return -ENOMEM;
+
+ *ctxp = ctx = kmalloc(sizeof(*ctx) +
+ uctx->ctx_len,
+ GFP_KERNEL);
+
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ctx_doi = uctx->ctx_doi;
+ ctx->ctx_len = uctx->ctx_len;
+ ctx->ctx_alg = uctx->ctx_alg;
+
+ memcpy(ctx->ctx_str,
+ uctx+1,
+ ctx->ctx_len);
+ rc = security_context_to_sid(ctx->ctx_str,
+ ctx->ctx_len,
+ &ctx->ctx_sid);
+
+ if (rc)
+ goto out;
+
+ /*
+ * Does the subject have permission to set security or permission to
+ * do the relabel?
+ * Must be permitted to relabel from default socket type (process type)
+ * to specified context
+ */
+ rc = avc_has_perm(tsec->sid, tsec->sid,
+ SECCLASS_ASSOCIATION,
+ ASSOCIATION__RELABELFROM, NULL);
+ if (rc)
+ goto out;
+
+ rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+ SECCLASS_ASSOCIATION,
+ ASSOCIATION__RELABELTO, NULL);
+ if (rc)
+ goto out;
+
+ return rc;
+
+out:
+ *ctxp = 0;
+ kfree(ctx);
+ return rc;
+}
+
+/*
+ * LSM hook implementation that allocs and transfers uctx spec to
+ * xfrm_policy.
+ */
+int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *uctx)
+{
+ int err;
+
+ BUG_ON(!xp);
+
+ err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx);
+ return err;
+}
+
+
+/*
+ * LSM hook implementation that copies security data structure from old to
+ * new for policy cloning.
+ */
+int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+ struct xfrm_sec_ctx *old_ctx, *new_ctx;
+
+ old_ctx = old->security;
+
+ if (old_ctx) {
+ new_ctx = new->security = kmalloc(sizeof(*new_ctx) +
+ old_ctx->ctx_len,
+ GFP_KERNEL);
+
+ if (!new_ctx)
+ return -ENOMEM;
+
+ memcpy(new_ctx, old_ctx, sizeof(*new_ctx));
+ memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len);
+ }
+ return 0;
+}
+
+/*
+ * LSM hook implementation that frees xfrm_policy security information.
+ */
+void selinux_xfrm_policy_free(struct xfrm_policy *xp)
+{
+ struct xfrm_sec_ctx *ctx = xp->security;
+ if (ctx)
+ kfree(ctx);
+}
+
+/*
+ * LSM hook implementation that allocs and transfers sec_ctx spec to
+ * xfrm_state.
+ */
+int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx)
+{
+ int err;
+
+ BUG_ON(!x);
+
+ err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx);
+ return err;
+}
+
+/*
+ * LSM hook implementation that frees xfrm_state security information.
+ */
+void selinux_xfrm_state_free(struct xfrm_state *x)
+{
+ struct xfrm_sec_ctx *ctx = x->security;
+ if (ctx)
+ kfree(ctx);
+}
+
+/*
+ * LSM hook that controls access to unlabelled packets. If
+ * a xfrm_state is authorizable (defined by macro) then it was
+ * already authorized by the IPSec process. If not, then
+ * we need to check for unlabelled access since this may not have
+ * gone thru the IPSec process.
+ */
+int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
+{
+ int i, rc = 0;
+ struct sec_path *sp;
+
+ sp = skb->sp;
+
+ if (sp) {
+ /*
+ * __xfrm_policy_check does not approve unless xfrm_policy_ok
+ * says that spi's match for policy and the socket.
+ *
+ * Only need to verify the existence of an authorizable sp.
+ */
+ for (i = 0; i < sp->len; i++) {
+ struct xfrm_state *x = sp->x[i].xvec;
+
+ if (x && selinux_authorizable_xfrm(x))
+ goto accept;
+ }
+ }
+
+ /* check SELinux sock for unlabelled access */
+ rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
+ ASSOCIATION__RECVFROM, NULL);
+ if (rc)
+ goto drop;
+
+accept:
+ return 0;
+
+drop:
+ return rc;
+}
+
+/*
+ * POSTROUTE_LAST hook's XFRM processing:
+ * If we have no security association, then we need to determine
+ * whether the socket is allowed to send to an unlabelled destination.
+ * If we do have a authorizable security association, then it has already been
+ * checked in xfrm_policy_lookup hook.
+ */
+int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
+{
+ struct dst_entry *dst;
+ int rc = 0;
+
+ dst = skb->dst;
+
+ if (dst) {
+ struct dst_entry *dst_test;
+
+ for (dst_test = dst; dst_test != 0;
+ dst_test = dst_test->child) {
+ struct xfrm_state *x = dst_test->xfrm;
+
+ if (x && selinux_authorizable_xfrm(x))
+ goto accept;
+ }
+ }
+
+ rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
+ ASSOCIATION__SENDTO, NULL);
+ if (rc)
+ goto drop;
+
+accept:
+ return NF_ACCEPT;
+
+drop:
+ return NF_DROP;
+}