aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/appletalk/ddp.c23
-rw-r--r--net/atm/pvc.c2
-rw-r--r--net/atm/svc.c2
-rw-r--r--net/ax25/af_ax25.c6
-rw-r--r--net/bluetooth/af_bluetooth.c5
-rw-r--r--net/bluetooth/bnep/sock.c2
-rw-r--r--net/bluetooth/cmtp/sock.c2
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hidp/sock.c2
-rw-r--r--net/bluetooth/l2cap.c9
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c9
-rw-r--r--net/bridge/br.c1
-rw-r--r--net/bridge/br_device.c91
-rw-r--r--net/bridge/br_if.c55
-rw-r--r--net/bridge/br_input.c11
-rw-r--r--net/bridge/br_netfilter.c4
-rw-r--r--net/bridge/br_notify.c14
-rw-r--r--net/bridge/br_private.h7
-rw-r--r--net/bridge/br_stp_if.c5
-rw-r--r--net/bridge/netfilter/Kconfig6
-rw-r--r--net/bridge/netfilter/ebt_log.c73
-rw-r--r--net/bridge/netfilter/ebt_ulog.c53
-rw-r--r--net/core/datagram.c36
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/filter.c112
-rw-r--r--net/core/flow.c8
-rw-r--r--net/core/netpoll.c1
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/skbuff.c27
-rw-r--r--net/core/sock.c21
-rw-r--r--net/core/stream.c10
-rw-r--r--net/dccp/Makefile4
-rw-r--r--net/dccp/ackvec.c33
-rw-r--r--net/dccp/ackvec.h12
-rw-r--r--net/dccp/ccid.h2
-rw-r--r--net/dccp/dccp.h24
-rw-r--r--net/dccp/diag.c2
-rw-r--r--net/dccp/input.c79
-rw-r--r--net/dccp/ipv4.c305
-rw-r--r--net/dccp/ipv6.c1261
-rw-r--r--net/dccp/ipv6.h37
-rw-r--r--net/dccp/minisocks.c23
-rw-r--r--net/dccp/output.c47
-rw-r--r--net/dccp/proto.c56
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/decnet/dn_neigh.c13
-rw-r--r--net/decnet/dn_nsp_in.c17
-rw-r--r--net/econet/af_econet.c9
-rw-r--r--net/ieee80211/ieee80211_rx.c5
-rw-r--r--net/ipv4/Kconfig8
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c19
-rw-r--r--net/ipv4/ah4.c1
-rw-r--r--net/ipv4/arp.c1
-rw-r--r--net/ipv4/devinet.c1
-rw-r--r--net/ipv4/esp4.c1
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_hash.c1
-rw-r--r--net/ipv4/fib_rules.c1
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fib_trie.c8
-rw-r--r--net/ipv4/icmp.c1
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c25
-rw-r--r--net/ipv4/inet_diag.c14
-rw-r--r--net/ipv4/inet_hashtables.c178
-rw-r--r--net/ipv4/inet_timewait_sock.c5
-rw-r--r--net/ipv4/inetpeer.c1
-rw-r--r--net/ipv4/ip_fragment.c68
-rw-r--r--net/ipv4/ip_input.c1
-rw-r--r--net/ipv4/ip_options.c1
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv4/ip_sockglue.c14
-rw-r--r--net/ipv4/ipcomp.c1
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipmr.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c28
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c21
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c10
-rw-r--r--net/ipv4/ipvs/ip_vs_dh.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c29
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c29
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c24
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c2
-rw-r--r--net/ipv4/netfilter/arp_tables.c175
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c1
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c199
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/netfilter/ipt_physdev.c1
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c10
-rw-r--r--net/ipv4/tcp_bic.c85
-rw-r--r--net/ipv4/tcp_cong.c28
-rw-r--r--net/ipv4/tcp_cubic.c411
-rw-r--r--net/ipv4/tcp_input.c99
-rw-r--r--net/ipv4/tcp_ipv4.c269
-rw-r--r--net/ipv4/tcp_minisocks.c16
-rw-r--r--net/ipv4/tcp_output.c118
-rw-r--r--net/ipv4/tcp_vegas.c4
-rw-r--r--net/ipv4/udp.c22
-rw-r--r--net/ipv6/Makefile3
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/af_inet6.c90
-rw-r--r--net/ipv6/ah6.c1
-rw-r--r--net/ipv6/esp6.c1
-rw-r--r--net/ipv6/exthdrs.c4
-rw-r--r--net/ipv6/inet6_connection_sock.c199
-rw-r--r--net/ipv6/inet6_hashtables.c183
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ipcomp6.c1
-rw-r--r--net/ipv6/ipv6_sockglue.c24
-rw-r--r--net/ipv6/mcast.c2
-rw-r--r--net/ipv6/netfilter/ip6_tables.c191
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c1
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c1
-rw-r--r--net/ipv6/netfilter/ip6t_esp.c1
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c4
-rw-r--r--net/ipv6/raw.c16
-rw-r--r--net/ipv6/tcp_ipv6.c639
-rw-r--r--net/ipv6/udp.c16
-rw-r--r--net/ipx/af_ipx.c6
-rw-r--r--net/irda/af_irda.c23
-rw-r--r--net/key/af_key.c201
-rw-r--r--net/llc/af_llc.c11
-rw-r--r--net/netfilter/nfnetlink_log.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netlink/af_netlink.c4
-rw-r--r--net/netlink/genetlink.c2
-rw-r--r--net/netrom/af_netrom.c16
-rw-r--r--net/nonet.c5
-rw-r--r--net/packet/af_packet.c10
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/sched/sch_netem.c49
-rw-r--r--net/sched/sch_teql.c1
-rw-r--r--net/sctp/associola.c81
-rw-r--r--net/sctp/input.c36
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/output.c17
-rw-r--r--net/sctp/protocol.c3
-rw-r--r--net/sctp/sm_sideeffect.c26
-rw-r--r--net/sctp/sm_statefuns.c20
-rw-r--r--net/sctp/socket.c691
-rw-r--r--net/sctp/transport.c32
-rw-r--r--net/socket.c243
-rw-r--r--net/sunrpc/svcsock.c2
-rw-r--r--net/unix/af_unix.c55
-rw-r--r--net/unix/garbage.c4
-rw-r--r--net/wanrouter/af_wanpipe.c6
-rw-r--r--net/x25/af_x25.c6
-rw-r--r--net/xfrm/xfrm_policy.c88
-rw-r--r--net/xfrm/xfrm_state.c9
-rw-r--r--net/xfrm/xfrm_user.c148
166 files changed, 5404 insertions, 2308 deletions
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7982656b9c83..a5144e43aae1 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -63,7 +63,7 @@
#include <linux/atalk.h>
struct datalink_proto *ddp_dl, *aarp_dl;
-static struct proto_ops atalk_dgram_ops;
+static const struct proto_ops atalk_dgram_ops;
/**************************************************************************\
* *
@@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
*/
static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- int rc = -EINVAL;
+ int rc = -ENOIOCTLCMD;
struct sock *sk = sock->sk;
void __user *argp = (void __user *)arg;
@@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
rc = atif_ioctl(cmd, argp);
rtnl_unlock();
break;
- /* Physical layer ioctl calls */
- case SIOCSIFLINK:
- case SIOCGIFHWADDR:
- case SIOCSIFHWADDR:
- case SIOCGIFFLAGS:
- case SIOCSIFFLAGS:
- case SIOCGIFTXQLEN:
- case SIOCSIFTXQLEN:
- case SIOCGIFMTU:
- case SIOCGIFCONF:
- case SIOCADDMULTI:
- case SIOCDELMULTI:
- case SIOCGIFCOUNT:
- case SIOCGIFINDEX:
- case SIOCGIFNAME:
- rc = dev_ioctl(cmd, argp);
- break;
}
return rc;
@@ -1841,7 +1824,7 @@ static struct net_proto_family atalk_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
.family = PF_APPLETALK,
.owner = THIS_MODULE,
.release = atalk_release,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2684a92da22b..f2c541774dcd 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
}
-static struct proto_ops pvc_proto_ops = {
+static const struct proto_ops pvc_proto_ops = {
.family = PF_ATMPVC,
.owner = THIS_MODULE,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d7b266136bf6..3a180cfd7b48 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return error;
}
-static struct proto_ops svc_proto_ops = {
+static const struct proto_ops svc_proto_ops = {
.family = PF_ATMSVC,
.owner = THIS_MODULE,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 1b683f302657..e8753c7fcad1 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -54,7 +54,7 @@
HLIST_HEAD(ax25_list);
DEFINE_SPINLOCK(ax25_list_lock);
-static struct proto_ops ax25_proto_ops;
+static const struct proto_ops ax25_proto_ops;
static void ax25_free_sock(struct sock *sk)
{
@@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
default:
- res = dev_ioctl(cmd, argp);
+ res = -ENOIOCTLCMD;
break;
}
release_sock(sk);
@@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops ax25_proto_ops = {
+static const struct proto_ops ax25_proto_ops = {
.family = PF_AX25,
.owner = THIS_MODULE,
.release = ax25_release,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ea616e3fc98e..fb031fe9be9e 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
timeo = schedule_timeout(timeo);
lock_sock(sk);
- if (sk->sk_err) {
- err = sock_error(sk);
+ err = sock_error(sk);
+ if (err)
break;
- }
}
set_current_state(TASK_RUNNING);
remove_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 9778c6acd53b..ccbaf69afc5b 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return 0;
}
-static struct proto_ops bnep_sock_ops = {
+static const struct proto_ops bnep_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = bnep_sock_release,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index beb045bf5714..5e22343b6090 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
-static struct proto_ops cmtp_sock_ops = {
+static const struct proto_ops cmtp_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = cmtp_sock_release,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1d6d0a15c099..84e6c93a044a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
return 0;
}
-static struct proto_ops hci_sock_ops = {
+static const struct proto_ops hci_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = hci_sock_release,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index f8986f881431..8f8dd931b294 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
-static struct proto_ops hidp_sock_ops = {
+static const struct proto_ops hidp_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = hidp_sock_release,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index e3bb11ca4235..7f0781e4326f 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -57,7 +57,7 @@
#define VERSION "2.8"
-static struct proto_ops l2cap_sock_ops;
+static const struct proto_ops l2cap_sock_ops;
static struct bt_sock_list l2cap_sk_list = {
.lock = RW_LOCK_UNLOCKED
@@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
BT_DBG("sock %p, sk %p", sock, sk);
- if (sk->sk_err)
- return sock_error(sk);
+ err = sock_error(sk);
+ if (err)
+ return err;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -2160,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
-static struct proto_ops l2cap_sock_ops = {
+static const struct proto_ops l2cap_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = l2cap_sock_release,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 6c34261b232e..757d2dd3b02f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -58,7 +58,7 @@
#define BT_DBG(D...)
#endif
-static struct proto_ops rfcomm_sock_ops;
+static const struct proto_ops rfcomm_sock_ops;
static struct bt_sock_list rfcomm_sk_list = {
.lock = RW_LOCK_UNLOCKED
@@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
-static struct proto_ops rfcomm_sock_ops = {
+static const struct proto_ops rfcomm_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = rfcomm_sock_release,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 9cb00dc6c08c..6b61323ce23c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -56,7 +56,7 @@
#define VERSION "0.5"
-static struct proto_ops sco_sock_ops;
+static const struct proto_ops sco_sock_ops;
static struct bt_sock_list sco_sk_list = {
.lock = RW_LOCK_UNLOCKED
@@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
BT_DBG("sock %p, sk %p", sock, sk);
- if (sk->sk_err)
- return sock_error(sk);
+ err = sock_error(sk);
+ if (err)
+ return err;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf)
static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
-static struct proto_ops sco_sock_ops = {
+static const struct proto_ops sco_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
.release = sco_sock_release,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index f8f184942aaf..188cc1ac49eb 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook);
module_init(br_init)
module_exit(br_deinit)
MODULE_LICENSE("GPL");
+MODULE_VERSION(BR_VERSION);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f564ee99782d..0b33a7b3a00c 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -15,7 +15,9 @@
#include <linux/kernel.h>
#include <linux/netdevice.h>
-#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+
#include <asm/uaccess.h>
#include "br_private.h"
@@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+/* Allow setting mac address of pseudo-bridge to be same as
+ * any of the bound interfaces
+ */
+static int br_set_mac_address(struct net_device *dev, void *p)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct sockaddr *addr = p;
+ struct net_bridge_port *port;
+ int err = -EADDRNOTAVAIL;
+
+ spin_lock_bh(&br->lock);
+ list_for_each_entry(port, &br->port_list, list) {
+ if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) {
+ br_stp_change_bridge_id(br, addr->sa_data);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock_bh(&br->lock);
+
+ return err;
+}
+
+static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+ strcpy(info->driver, "bridge");
+ strcpy(info->version, BR_VERSION);
+ strcpy(info->fw_version, "N/A");
+ strcpy(info->bus_info, "N/A");
+}
+
+static int br_set_sg(struct net_device *dev, u32 data)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (data)
+ br->feature_mask |= NETIF_F_SG;
+ else
+ br->feature_mask &= ~NETIF_F_SG;
+
+ br_features_recompute(br);
+ return 0;
+}
+
+static int br_set_tso(struct net_device *dev, u32 data)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (data)
+ br->feature_mask |= NETIF_F_TSO;
+ else
+ br->feature_mask &= ~NETIF_F_TSO;
+
+ br_features_recompute(br);
+ return 0;
+}
+
+static int br_set_tx_csum(struct net_device *dev, u32 data)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (data)
+ br->feature_mask |= NETIF_F_IP_CSUM;
+ else
+ br->feature_mask &= ~NETIF_F_IP_CSUM;
+
+ br_features_recompute(br);
+ return 0;
+}
+
+static struct ethtool_ops br_ethtool_ops = {
+ .get_drvinfo = br_getinfo,
+ .get_link = ethtool_op_get_link,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = br_set_sg,
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = br_set_tx_csum,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = br_set_tso,
+};
+
void br_dev_setup(struct net_device *dev)
{
memset(dev->dev_addr, 0, ETH_ALEN);
@@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev)
dev->change_mtu = br_change_mtu;
dev->destructor = free_netdev;
SET_MODULE_OWNER(dev);
+ SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
dev->stop = br_dev_stop;
dev->tx_queue_len = 0;
- dev->set_mac_address = NULL;
+ dev->set_mac_address = br_set_mac_address;
dev->priv_flags = IFF_EBRIDGE;
+
+ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
+ | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 975abe254b7a..11321197338e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -32,9 +32,8 @@
* ethtool, use ethtool_ops. Also, since driver might sleep need to
* not be holding any locks.
*/
-static int br_initial_port_cost(struct net_device *dev)
+static int port_cost(struct net_device *dev)
{
-
struct ethtool_cmd ecmd = { ETHTOOL_GSET };
struct ifreq ifr;
mm_segment_t old_fs;
@@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev)
return 2;
case SPEED_10:
return 100;
- default:
- pr_info("bridge: can't decode speed from %s: %d\n",
- dev->name, ecmd.speed);
- return 100;
}
}
@@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev)
return 100; /* assume old 10Mbps */
}
+
+/*
+ * Check for port carrier transistions.
+ * Called from work queue to allow for calling functions that
+ * might sleep (such as speed check), and to debounce.
+ */
+static void port_carrier_check(void *arg)
+{
+ struct net_bridge_port *p = arg;
+
+ rtnl_lock();
+ if (netif_carrier_ok(p->dev)) {
+ u32 cost = port_cost(p->dev);
+
+ spin_lock_bh(&p->br->lock);
+ if (p->state == BR_STATE_DISABLED) {
+ p->path_cost = cost;
+ br_stp_enable_port(p);
+ }
+ spin_unlock_bh(&p->br->lock);
+ } else {
+ spin_lock_bh(&p->br->lock);
+ if (p->state != BR_STATE_DISABLED)
+ br_stp_disable_port(p);
+ spin_unlock_bh(&p->br->lock);
+ }
+ rtnl_unlock();
+}
+
static void destroy_nbp(struct net_bridge_port *p)
{
struct net_device *dev = p->dev;
@@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p)
dev->br_port = NULL;
dev_set_promiscuity(dev, -1);
+ cancel_delayed_work(&p->carrier_check);
+ flush_scheduled_work();
+
spin_lock_bh(&br->lock);
br_stp_disable_port(p);
spin_unlock_bh(&br->lock);
@@ -155,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name)
br->bridge_id.prio[1] = 0x00;
memset(br->bridge_id.addr, 0, ETH_ALEN);
+ br->feature_mask = dev->features;
br->stp_enabled = 0;
br->designated_root = br->bridge_id;
br->root_path_cost = 0;
@@ -195,10 +223,9 @@ static int find_portno(struct net_bridge *br)
return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}
-/* called with RTNL */
+/* called with RTNL but without bridge lock */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
- struct net_device *dev,
- unsigned long cost)
+ struct net_device *dev)
{
int index;
struct net_bridge_port *p;
@@ -215,12 +242,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
p->br = br;
dev_hold(dev);
p->dev = dev;
- p->path_cost = cost;
+ p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
dev->br_port = p;
p->port_no = index;
br_init_port(p);
p->state = BR_STATE_DISABLED;
+ INIT_WORK(&p->carrier_check, port_carrier_check, p);
kobject_init(&p->kobj);
return p;
@@ -322,9 +350,8 @@ void br_features_recompute(struct net_bridge *br)
struct net_bridge_port *p;
unsigned long features, checksum;
- features = NETIF_F_SG | NETIF_F_FRAGLIST
- | NETIF_F_HIGHDMA | NETIF_F_TSO;
- checksum = NETIF_F_IP_CSUM; /* least commmon subset */
+ features = br->feature_mask &~ NETIF_F_IP_CSUM;
+ checksum = br->feature_mask & NETIF_F_IP_CSUM;
list_for_each_entry(p, &br->port_list, list) {
if (!(p->dev->features
@@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (dev->br_port != NULL)
return -EBUSY;
- if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev))))
+ if (IS_ERR(p = new_nbp(br, dev)))
return PTR_ERR(p);
if ((err = br_fdb_insert(br, p, dev->dev_addr)))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index b88220a64cd8..c387852f753a 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb)
/* insert into forwarding database after filtering to avoid spoofing */
br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+ if (p->state == BR_STATE_LEARNING) {
+ kfree_skb(skb);
+ goto out;
+ }
+
if (br->dev->flags & IFF_PROMISC) {
struct sk_buff *skb2;
@@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto err;
- if (p->state == BR_STATE_LEARNING)
- br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
-
if (p->br->stp_enabled &&
!memcmp(dest, bridge_ula, 5) &&
!(dest[5] & 0xF0)) {
@@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
NULL, br_stp_handle_bpdu);
return 1;
}
+ goto err;
}
- else if (p->state == BR_STATE_FORWARDING) {
+ if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
if (br_should_route_hook) {
if (br_should_route_hook(pskb))
return 0;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 23422bd53a5e..223f8270daee 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -26,6 +26,7 @@
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/netfilter_bridge.h>
@@ -33,8 +34,11 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
+
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/route.h>
+
#include <asm/uaccess.h>
#include <asm/checksum.h>
#include "br_private.h"
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 917311c6828b..a43a9c1d50d7 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
br_stp_recalculate_bridge_id(br);
break;
- case NETDEV_CHANGE: /* device is up but carrier changed */
- if (!(br->dev->flags & IFF_UP))
- break;
-
- if (netif_carrier_ok(dev)) {
- if (p->state == BR_STATE_DISABLED)
- br_stp_enable_port(p);
- } else {
- if (p->state != BR_STATE_DISABLED)
- br_stp_disable_port(p);
- }
+ case NETDEV_CHANGE:
+ if (br->dev->flags & IFF_UP)
+ schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
break;
case NETDEV_FEAT_CHANGE:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index bdf95a74d8cd..c5bd631ffcd5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -27,6 +27,10 @@
#define BR_PORT_BITS 10
#define BR_MAX_PORTS (1<<BR_PORT_BITS)
+#define BR_PORT_DEBOUNCE (HZ/10)
+
+#define BR_VERSION "2.1"
+
typedef struct bridge_id bridge_id;
typedef struct mac_addr mac_addr;
typedef __u16 port_id;
@@ -78,6 +82,7 @@ struct net_bridge_port
struct timer_list hold_timer;
struct timer_list message_age_timer;
struct kobject kobj;
+ struct work_struct carrier_check;
struct rcu_head rcu;
};
@@ -90,6 +95,7 @@ struct net_bridge
spinlock_t hash_lock;
struct hlist_head hash[BR_HASH_SIZE];
struct list_head age_list;
+ unsigned long feature_mask;
/* STP */
bridge_id designated_root;
@@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br);
extern void br_stp_enable_port(struct net_bridge_port *p);
extern void br_stp_disable_port(struct net_bridge_port *p);
extern void br_stp_recalculate_bridge_id(struct net_bridge *br);
+extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
extern void br_stp_set_bridge_priority(struct net_bridge *br,
u16 newprio);
extern void br_stp_set_port_priority(struct net_bridge_port *p,
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index ac09b6a23523..cc047f7fb6ef 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p)
}
/* called under bridge lock */
-static void br_stp_change_bridge_id(struct net_bridge *br,
- const unsigned char *addr)
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
{
unsigned char oldaddr[6];
struct net_bridge_port *p;
@@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
list_for_each_entry(p, &br->port_list, list) {
if (addr == br_mac_zero ||
- compare_ether_addr(p->dev->dev_addr, addr) < 0)
+ memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
addr = p->dev->dev_addr;
}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c70b3be23026..b84fc6075fe1 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG
To compile it as a module, choose M here. If unsure, say N.
config BRIDGE_EBT_ULOG
- tristate "ebt: ulog support"
+ tristate "ebt: ulog support (OBSOLETE)"
depends on BRIDGE_NF_EBTABLES
help
+ This option enables the old bridge-specific "ebt_ulog" implementation
+ which has been obsoleted by the new "nfnetlink_log" code (see
+ CONFIG_NETFILTER_NETLINK_LOG).
+
This option adds the ulog watcher, that you can use in any rule
in any ebtables table. The packet is passed to a userspace
logging daemon using netlink multicast sockets. This differs
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 662975be3d1d..9f6e0193ae10 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -3,13 +3,16 @@
*
* Authors:
* Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
*
* April, 2002
*
*/
+#include <linux/in.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/if_arp.h>
@@ -55,27 +58,30 @@ static void print_MAC(unsigned char *p)
}
#define myNIPQUAD(a) a[0], a[1], a[2], a[3]
-static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
- const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+static void
+ebt_log_packet(unsigned int pf, unsigned int hooknum,
+ const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct nf_loginfo *loginfo,
+ const char *prefix)
{
- struct ebt_log_info *info = (struct ebt_log_info *)data;
- char level_string[4] = "< >";
+ unsigned int bitmask;
- level_string[1] = '0' + info->loglevel;
spin_lock_bh(&ebt_log_lock);
- printk(level_string);
- printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "",
- out ? out->name : "");
+ printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level,
+ prefix, in ? in->name : "", out ? out->name : "");
- printk("MAC source = ");
print_MAC(eth_hdr(skb)->h_source);
printk("MAC dest = ");
print_MAC(eth_hdr(skb)->h_dest);
printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto));
- if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
+ if (loginfo->type == NF_LOG_TYPE_LOG)
+ bitmask = loginfo->u.log.logflags;
+ else
+ bitmask = NF_LOG_MASK;
+
+ if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
htons(ETH_P_IP)){
struct iphdr _iph, *ih;
@@ -84,10 +90,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
printk(" INCOMPLETE IP header");
goto out;
}
- printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
- NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
- printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
- ih->protocol);
+ printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP "
+ "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr),
+ NIPQUAD(ih->daddr), ih->tos, ih->protocol);
if (ih->protocol == IPPROTO_TCP ||
ih->protocol == IPPROTO_UDP) {
struct tcpudphdr _ports, *pptr;
@@ -104,7 +109,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
goto out;
}
- if ((info->bitmask & EBT_LOG_ARP) &&
+ if ((bitmask & EBT_LOG_ARP) &&
((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
(eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
struct arphdr _arph, *ah;
@@ -144,6 +149,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
out:
printk("\n");
spin_unlock_bh(&ebt_log_lock);
+
+}
+
+static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
+ const struct net_device *in, const struct net_device *out,
+ const void *data, unsigned int datalen)
+{
+ struct ebt_log_info *info = (struct ebt_log_info *)data;
+ struct nf_loginfo li;
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = info->loglevel;
+ li.u.log.logflags = info->bitmask;
+
+ nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix);
}
static struct ebt_watcher log =
@@ -154,13 +174,32 @@ static struct ebt_watcher log =
.me = THIS_MODULE,
};
+static struct nf_logger ebt_log_logger = {
+ .name = "ebt_log",
+ .logfn = &ebt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
- return ebt_register_watcher(&log);
+ int ret;
+
+ ret = ebt_register_watcher(&log);
+ if (ret < 0)
+ return ret;
+ if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) {
+ printk(KERN_WARNING "ebt_log: not logging via system console "
+ "since somebody else already registered for PF_INET\n");
+ /* we cannot make module load fail here, since otherwise
+ * ebtables userspace would abort */
+ }
+
+ return 0;
}
static void __exit fini(void)
{
+ nf_log_unregister_logger(&ebt_log_logger);
ebt_unregister_watcher(&log);
}
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index aae26ae2e61f..ce617b3dbbb8 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -3,6 +3,7 @@
*
* Authors:
* Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
*
* November, 2004
*
@@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
return skb;
}
-static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
- const void *data, unsigned int datalen)
+ const struct ebt_ulog_info *uloginfo, const char *prefix)
{
ebt_ulog_packet_msg_t *pm;
size_t size, copy_len;
struct nlmsghdr *nlh;
- struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
unsigned int group = uloginfo->nlgroup;
ebt_ulog_buff_t *ub = &ulog_buffers[group];
spinlock_t *lock = &ub->lock;
@@ -216,6 +216,39 @@ alloc_failure:
goto unlock;
}
+/* this function is registered with the netfilter core */
+static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
+ const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct nf_loginfo *li,
+ const char *prefix)
+{
+ struct ebt_ulog_info loginfo;
+
+ if (!li || li->type != NF_LOG_TYPE_ULOG) {
+ loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
+ loginfo.cprange = 0;
+ loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
+ loginfo.prefix[0] = '\0';
+ } else {
+ loginfo.nlgroup = li->u.ulog.group;
+ loginfo.cprange = li->u.ulog.copy_len;
+ loginfo.qthreshold = li->u.ulog.qthreshold;
+ strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+ }
+
+ ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+ const struct net_device *in, const struct net_device *out,
+ const void *data, unsigned int datalen)
+{
+ struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
+
+ ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
+}
+
+
static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
const struct ebt_entry *e, void *data, unsigned int datalen)
{
@@ -240,6 +273,12 @@ static struct ebt_watcher ulog = {
.me = THIS_MODULE,
};
+static struct nf_logger ebt_ulog_logger = {
+ .name = EBT_ULOG_WATCHER,
+ .logfn = &ebt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
int i, ret = 0;
@@ -265,6 +304,13 @@ static int __init init(void)
else if ((ret = ebt_register_watcher(&ulog)))
sock_release(ebtulognl->sk_socket);
+ if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) {
+ printk(KERN_WARNING "ebt_ulog: not logging via ulog "
+ "since somebody else already registered for PF_BRIDGE\n");
+ /* we cannot make module load fail here, since otherwise
+ * ebtables userspace would abort */
+ }
+
return ret;
}
@@ -273,6 +319,7 @@ static void __exit fini(void)
ebt_ulog_buff_t *ub;
int i;
+ nf_log_unregister_logger(&ebt_ulog_logger);
ebt_unregister_watcher(&ulog);
for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
ub = &ulog_buffers[i];
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef51ac58..f8d322e1ea92 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -47,6 +47,7 @@
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
+#include <linux/spinlock.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
}
/**
+ * skb_kill_datagram - Free a datagram skbuff forcibly
+ * @sk: socket
+ * @skb: datagram skbuff
+ * @flags: MSG_ flags
+ *
+ * This function frees a datagram skbuff that was received by
+ * skb_recv_datagram. The flags argument must match the one
+ * used for skb_recv_datagram.
+ *
+ * If the MSG_PEEK flag is set, and the packet is still on the
+ * receive queue of the socket, it will be taken off the queue
+ * before it is freed.
+ *
+ * This function currently only disables BH when acquiring the
+ * sk_receive_queue lock. Therefore it must not be used in a
+ * context where that lock is acquired in an IRQ context.
+ */
+
+void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+ if (flags & MSG_PEEK) {
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ kfree_skb(skb);
+}
+
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
* skb_copy_datagram_iovec - Copy a datagram to an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
diff --git a/net/core/dev.c b/net/core/dev.c
index 1a59f2173f0a..5081287923d5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3277,7 +3277,6 @@ EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
-EXPORT_SYMBOL(dev_ioctl);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
diff --git a/net/core/filter.c b/net/core/filter.c
index 3a10e0bc90e8..8964d3445588 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -13,6 +13,7 @@
* 2 of the License, or (at your option) any later version.
*
* Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
*/
#include <linux/module.h>
@@ -250,7 +251,7 @@ load_b:
mem[fentry->k] = X;
continue;
default:
- /* Invalid instruction counts as RET */
+ WARN_ON(1);
return 0;
}
@@ -283,8 +284,8 @@ load_b:
*
* Check the user's filter code. If we let some ugly
* filter code slip through kaboom! The filter must contain
- * no references or jumps that are out of range, no illegal instructions
- * and no backward jumps. It must end with a RET instruction
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
*
* Returns 0 if the rule set is legal or a negative errno code if not.
*/
@@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
for (pc = 0; pc < flen; pc++) {
/* all jumps are forward as they are not signed */
ftest = &filter[pc];
- if (BPF_CLASS(ftest->code) == BPF_JMP) {
- /* but they mustn't jump off the end */
- if (BPF_OP(ftest->code) == BPF_JA) {
- /*
- * Note, the large ftest->k might cause loops.
- * Compare this with conditional jumps below,
- * where offsets are limited. --ANK (981016)
- */
- if (ftest->k >= (unsigned)(flen-pc-1))
- return -EINVAL;
- } else {
- /* for conditionals both must be safe */
- if (pc + ftest->jt +1 >= flen ||
- pc + ftest->jf +1 >= flen)
- return -EINVAL;
- }
- }
- /* check for division by zero -Kris Katterjohn 2005-10-30 */
- if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0)
- return -EINVAL;
+ /* Only allow valid instructions */
+ switch (ftest->code) {
+ case BPF_ALU|BPF_ADD|BPF_K:
+ case BPF_ALU|BPF_ADD|BPF_X:
+ case BPF_ALU|BPF_SUB|BPF_K:
+ case BPF_ALU|BPF_SUB|BPF_X:
+ case BPF_ALU|BPF_MUL|BPF_K:
+ case BPF_ALU|BPF_MUL|BPF_X:
+ case BPF_ALU|BPF_DIV|BPF_X:
+ case BPF_ALU|BPF_AND|BPF_K:
+ case BPF_ALU|BPF_AND|BPF_X:
+ case BPF_ALU|BPF_OR|BPF_K:
+ case BPF_ALU|BPF_OR|BPF_X:
+ case BPF_ALU|BPF_LSH|BPF_K:
+ case BPF_ALU|BPF_LSH|BPF_X:
+ case BPF_ALU|BPF_RSH|BPF_K:
+ case BPF_ALU|BPF_RSH|BPF_X:
+ case BPF_ALU|BPF_NEG:
+ case BPF_LD|BPF_W|BPF_ABS:
+ case BPF_LD|BPF_H|BPF_ABS:
+ case BPF_LD|BPF_B|BPF_ABS:
+ case BPF_LD|BPF_W|BPF_LEN:
+ case BPF_LD|BPF_W|BPF_IND:
+ case BPF_LD|BPF_H|BPF_IND:
+ case BPF_LD|BPF_B|BPF_IND:
+ case BPF_LD|BPF_IMM:
+ case BPF_LDX|BPF_W|BPF_LEN:
+ case BPF_LDX|BPF_B|BPF_MSH:
+ case BPF_LDX|BPF_IMM:
+ case BPF_MISC|BPF_TAX:
+ case BPF_MISC|BPF_TXA:
+ case BPF_RET|BPF_K:
+ case BPF_RET|BPF_A:
+ break;
+
+ /* Some instructions need special checks */
- /* check that memory operations use valid addresses. */
- if (ftest->k >= BPF_MEMWORDS) {
- /* but it might not be a memory operation... */
- switch (ftest->code) {
- case BPF_ST:
- case BPF_STX:
- case BPF_LD|BPF_MEM:
- case BPF_LDX|BPF_MEM:
+ case BPF_ALU|BPF_DIV|BPF_K:
+ /* check for division by zero */
+ if (ftest->k == 0)
return -EINVAL;
- }
+ break;
+
+ case BPF_LD|BPF_MEM:
+ case BPF_LDX|BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* check for invalid memory addresses */
+ if (ftest->k >= BPF_MEMWORDS)
+ return -EINVAL;
+ break;
+
+ case BPF_JMP|BPF_JA:
+ /*
+ * Note, the large ftest->k might cause loops.
+ * Compare this with conditional jumps below,
+ * where offsets are limited. --ANK (981016)
+ */
+ if (ftest->k >= (unsigned)(flen-pc-1))
+ return -EINVAL;
+ break;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ case BPF_JMP|BPF_JGE|BPF_K:
+ case BPF_JMP|BPF_JGE|BPF_X:
+ case BPF_JMP|BPF_JGT|BPF_K:
+ case BPF_JMP|BPF_JGT|BPF_X:
+ case BPF_JMP|BPF_JSET|BPF_K:
+ case BPF_JMP|BPF_JSET|BPF_X:
+ /* for conditionals both must be safe */
+ if (pc + ftest->jt + 1 >= flen ||
+ pc + ftest->jf + 1 >= flen)
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
}
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 7e95b39de9fd..c4f25385029f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -23,6 +23,7 @@
#include <net/flow.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
+#include <linux/security.h>
struct flow_cache_entry {
struct flow_cache_entry *next;
@@ -30,6 +31,7 @@ struct flow_cache_entry {
u8 dir;
struct flowi key;
u32 genid;
+ u32 sk_sid;
void *object;
atomic_t *object_ref;
};
@@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
return 0;
}
-void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
+void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
flow_resolve_t resolver)
{
struct flow_cache_entry *fle, **head;
@@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
for (fle = *head; fle; fle = fle->next) {
if (fle->family == family &&
fle->dir == dir &&
+ fle->sk_sid == sk_sid &&
flow_key_compare(key, &fle->key) == 0) {
if (fle->genid == atomic_read(&flow_cache_genid)) {
void *ret = fle->object;
@@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
*head = fle;
fle->family = family;
fle->dir = dir;
+ fle->sk_sid = sk_sid;
memcpy(&fle->key, key, sizeof(*key));
fle->object = NULL;
flow_count(cpu)++;
@@ -221,7 +225,7 @@ nocache:
void *obj;
atomic_t *obj_ref;
- resolver(key, family, dir, &obj, &obj_ref);
+ resolver(key, sk_sid, family, dir, &obj, &obj_ref);
if (fle) {
fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 49424a42a2c0..281a632fa6a6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -13,6 +13,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
+#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7fc3e9e28c34..06cad2d63e8a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]);
/* Module parameters, defaults. */
static int pg_count_d = 1000; /* 1000 pkts by default */
-static int pg_delay_d = 0;
-static int pg_clone_skb_d = 0;
-static int debug = 0;
+static int pg_delay_d;
+static int pg_clone_skb_d;
+static int debug;
static DECLARE_MUTEX(pktgen_sem);
static struct pktgen_thread *pktgen_threads = NULL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83fee37de38e..070f91cfde59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int fclone)
{
+ struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
/* Get the HEAD */
- if (fclone)
- skb = kmem_cache_alloc(skbuff_fclone_cache,
- gfp_mask & ~__GFP_DMA);
- else
- skb = kmem_cache_alloc(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
-
+ skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA);
if (!skb)
goto out;
@@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->data = data;
skb->tail = data;
skb->end = data + size;
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ atomic_set(&shinfo->dataref, 1);
+ shinfo->nr_frags = 0;
+ shinfo->tso_size = 0;
+ shinfo->tso_segs = 0;
+ shinfo->ufo_size = 0;
+ shinfo->ip6_frag_id = 0;
+ shinfo->frag_list = NULL;
+
if (fclone) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->tso_size = 0;
- skb_shinfo(skb)->tso_segs = 0;
- skb_shinfo(skb)->frag_list = NULL;
- skb_shinfo(skb)->ufo_size = 0;
- skb_shinfo(skb)->ip6_frag_id = 0;
out:
return skb;
nodata:
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be4f056..6465b0e4c8cb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab)
}
}
- if (prot->twsk_obj_size) {
+ if (prot->twsk_prot != NULL) {
static const char mask[] = "tw_sock_%s";
timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
@@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab)
goto out_free_request_sock_slab;
sprintf(timewait_sock_slab_name, mask, prot->name);
- prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
- prot->twsk_obj_size,
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (prot->twsk_slab == NULL)
+ prot->twsk_prot->twsk_slab =
+ kmem_cache_create(timewait_sock_slab_name,
+ prot->twsk_prot->twsk_obj_size,
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
}
}
@@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot)
prot->rsk_prot->slab = NULL;
}
- if (prot->twsk_slab != NULL) {
- const char *name = kmem_cache_name(prot->twsk_slab);
+ if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+ const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
- kmem_cache_destroy(prot->twsk_slab);
+ kmem_cache_destroy(prot->twsk_prot->twsk_slab);
kfree(name);
- prot->twsk_slab = NULL;
+ prot->twsk_prot->twsk_slab = NULL;
}
}
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03e8024..35e25259fd95 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
int done;
do {
- if (sk->sk_err)
- return sock_error(sk);
+ int err = sock_error(sk);
+ if (err)
+ return err;
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
return -EPIPE;
if (!*timeo_p)
@@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
+ !sk->sk_err &&
!((1 << sk->sk_state) &
~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
finish_wait(sk->sk_sleep, &wait);
@@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) &&
+ sk_wait_event(sk, &current_timeo, !sk->sk_err &&
+ !(sk->sk_shutdown & SEND_SHUTDOWN) &&
+ sk_stream_memory_free(sk) &&
vm_wait);
sk->sk_write_pending--;
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 344a8da153fc..87b27fff6e3b 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,3 +1,7 @@
+obj-$(CONFIG_IPV6) += dccp_ipv6.o
+
+dccp_ipv6-y := ipv6.o
+
obj-$(CONFIG_IP_DCCP) += dccp.o
dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index c9a62cca22fc..ce9cb77c5c29 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
from = av->dccpav_buf + av->dccpav_buf_head;
/* Check if buf_head wraps */
- if (av->dccpav_buf_head + len > av->dccpav_vec_len) {
- const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head);
+ if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) {
+ const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head;
memcpy(to, from, tailsize);
to += tailsize;
@@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
const gfp_t priority)
{
- struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority);
+ struct dccp_ackvec *av;
+ BUG_ON(len == 0);
+
+ if (len > DCCP_MAX_ACKVEC_LEN)
+ return NULL;
+
+ av = kmalloc(sizeof(*av) + len, priority);
if (av != NULL) {
av->dccpav_buf_len = len;
av->dccpav_buf_head =
@@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
}
static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
- const unsigned int index)
+ const u8 index)
{
return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
}
static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
- const unsigned int index)
+ const u8 index)
{
return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
}
@@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
*/
static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
const unsigned int packets,
- const unsigned char state)
+ const unsigned char state)
{
unsigned int gap;
signed long new_head;
@@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
* could reduce the complexity of this scan.)
*/
u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
- unsigned int index = av->dccpav_buf_head;
+ u8 index = av->dccpav_buf_head;
while (1) {
const u8 len = dccp_ackvec_len(av, index);
@@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av)
}
#endif
-static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
+static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
{
/*
* As we're keeping track of the ack vector size (dccpav_vec_len) and
@@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
* draft-ietf-dccp-spec-11.txt Appendix A. -acme
*/
#if 0
- av->dccpav_buf_tail = av->dccpav_ack_ptr + 1;
- if (av->dccpav_buf_tail >= av->dccpav_vec_len)
- av->dccpav_buf_tail -= av->dccpav_vec_len;
+ u32 new_buf_tail = av->dccpav_ack_ptr + 1;
+ if (new_buf_tail >= av->dccpav_vec_len)
+ new_buf_tail -= av->dccpav_vec_len;
+ av->dccpav_buf_tail = new_buf_tail;
#endif
av->dccpav_vec_len -= av->dccpav_sent_len;
}
@@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
debug_prefix, 1,
(unsigned long long)av->dccpav_ack_seqno,
(unsigned long long)av->dccpav_ack_ackno);
- dccp_ackvec_trow_away_ack_record(av);
+ dccp_ackvec_throw_away_ack_record(av);
av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
}
}
@@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
av->dccpav_ack_seqno,
(unsigned long long)
av->dccpav_ack_ackno);
- dccp_ackvec_trow_away_ack_record(av);
+ dccp_ackvec_throw_away_ack_record(av);
}
/*
* If dccpav_ack_seqno was not received, no problem
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index d0fd6c60c574..f7dfb5f67b87 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -54,16 +54,16 @@
* @dccpav_buf - circular buffer of acknowledgeable packets
*/
struct dccp_ackvec {
- unsigned int dccpav_buf_head;
- unsigned int dccpav_buf_tail;
u64 dccpav_buf_ackno;
u64 dccpav_ack_seqno;
u64 dccpav_ack_ackno;
- unsigned int dccpav_ack_ptr;
- unsigned int dccpav_sent_len;
- unsigned int dccpav_vec_len;
- unsigned int dccpav_buf_len;
struct timeval dccpav_time;
+ u8 dccpav_buf_head;
+ u8 dccpav_buf_tail;
+ u8 dccpav_ack_ptr;
+ u8 dccpav_sent_len;
+ u8 dccpav_vec_len;
+ u8 dccpav_buf_len;
u8 dccpav_buf_nonce;
u8 dccpav_ack_nonce;
u8 dccpav_buf[0];
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c37eeeaf5c6e..de681c6ad081 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -21,6 +21,8 @@
#define CCID_MAX 255
+struct tcp_info;
+
struct ccid {
unsigned char ccid_id;
const char *ccid_name;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f97b85d55ad8..93f26dd6e6cb 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
-extern struct proto dccp_v4_prot;
+extern struct proto dccp_prot;
/* is seq1 < seq2 ? */
static inline int before48(const u64 seq1, const u64 seq2)
@@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned len);
+extern int dccp_v4_init_sock(struct sock *sk);
+extern int dccp_v4_destroy_sock(struct sock *sk);
+
extern void dccp_close(struct sock *sk, long timeout);
extern struct sk_buff *dccp_make_response(struct sock *sk,
struct dst_entry *dst,
@@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk,
extern int dccp_connect(struct sock *sk);
extern int dccp_disconnect(struct sock *sk, int flags);
+extern void dccp_unhash(struct sock *sk);
extern int dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
extern int dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
extern void dccp_shutdown(struct sock *sk, int how);
+extern int inet_dccp_listen(struct socket *sock, int backlog);
+extern unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait);
+extern void dccp_v4_send_check(struct sock *sk, int len,
+ struct sk_buff *skb);
+extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len);
extern int dccp_v4_checksum(const struct sk_buff *skb,
const u32 saddr, const u32 daddr);
@@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb,
extern int dccp_v4_send_reset(struct sock *sk,
enum dccp_reset_codes code);
extern void dccp_send_close(struct sock *sk, const int active);
+extern int dccp_invalid_packet(struct sk_buff *skb);
+
+static inline int dccp_bad_service_code(const struct sock *sk,
+ const __u32 service)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dp->dccps_service == service)
+ return 0;
+ return !dccp_list_has_service(dp->dccps_service_list, service);
+}
struct dccp_skb_cb {
__u8 dccpd_type:4;
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index f675d8e642d3..3f78c00e3822 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
info->tcpi_backoff = icsk->icsk_backoff;
- info->tcpi_pmtu = dp->dccps_pmtu_cookie;
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
if (dp->dccps_options.dccpo_send_ack_vector)
info->tcpi_options |= TCPI_OPT_SACK;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3454d5941900..b6cba72b44e8 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
return 0;
}
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned len)
{
struct dccp_sock *dp = dccp_sk(sk);
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- if (dccp_parse_options(sk, skb))
- goto discard;
-
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
-
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
switch (dccp_hdr(skb)->dccph_type) {
case DCCP_PKT_DATAACK:
case DCCP_PKT_DATA:
@@ -250,6 +233,37 @@ discard:
return 0;
}
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ if (dccp_parse_options(sk, skb))
+ goto discard;
+
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_event_ack_recv(sk, skb);
+
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKVEC_STATE_RECEIVED))
+ goto discard;
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+ return __dccp_rcv_established(sk, skb, dh, len);
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_established);
+
static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
@@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
goto out_invalid_packet;
}
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKVEC_STATE_RECEIVED))
+ goto out_invalid_packet; /* FIXME: change error code */
+
dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_update_gsr(sk, dp->dccps_isr);
/*
@@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
goto out_invalid_packet;
}
- dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+ dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
/*
* Step 10: Process REQUEST state (second part)
@@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
dccp_set_state(sk, DCCP_PARTOPEN);
/* Make sure socket is routed, for correct metrics. */
- inet_sk_rebuild_header(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
if (dh->dccph_type == DCCP_PKT_DATAACK ||
dh->dccph_type == DCCP_PKT_DATA) {
- dccp_rcv_established(sk, skb, dh, len);
+ __dccp_rcv_established(sk, skb, dh, len);
queued = 1; /* packet was queued
- (by dccp_rcv_established) */
+ (by __dccp_rcv_established) */
}
break;
}
@@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (sk->sk_state == DCCP_LISTEN) {
if (dh->dccph_type == DCCP_PKT_REQUEST) {
- if (dccp_v4_conn_request(sk, skb) < 0)
+ if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
+ skb) < 0)
return 1;
/* FIXME: do congestion control initialization */
@@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
dccp_event_ack_recv(sk, skb);
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
if (dp->dccps_options.dccpo_send_ack_vector &&
dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
DCCP_SKB_CB(skb)->dccpd_seq,
DCCP_ACKVEC_STATE_RECEIVED))
goto discard;
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
}
/*
@@ -566,3 +587,5 @@ discard:
}
return 0;
}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 656e13e38cfb..3f244670764a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -19,7 +19,9 @@
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
#include <net/sock.h>
+#include <net/timewait_sock.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
@@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
{
- return inet_csk_get_port(&dccp_hashinfo, sk, snum);
+ return inet_csk_get_port(&dccp_hashinfo, sk, snum,
+ inet_csk_bind_conflict);
}
static void dccp_v4_hash(struct sock *sk)
@@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk)
inet_hash(&dccp_hashinfo, sk);
}
-static void dccp_v4_unhash(struct sock *sk)
+void dccp_unhash(struct sock *sk)
{
inet_unhash(&dccp_hashinfo, sk);
}
-/* called with local bh disabled */
-static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
-{
- struct inet_sock *inet = inet_sk(sk);
- const u32 daddr = inet->rcv_saddr;
- const u32 saddr = inet->daddr;
- const int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
- const struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
- goto not_unique;
- }
- tw = NULL;
-
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
- goto not_unique;
- }
+EXPORT_SYMBOL_GPL(dccp_unhash);
- /* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
- sk->sk_hash = hash;
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
-
- if (twp != NULL) {
- *twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw != NULL) {
- /* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, &dccp_death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
- }
-
- return 0;
-
-not_unique:
- write_unlock(&head->lock);
- return -EADDRNOTAVAIL;
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static int dccp_v4_hash_connect(struct sock *sk)
-{
- const unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (snum == 0) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover = net_random() % (high - low) + low;
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- do {
- head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
- dccp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == rover) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__dccp_v4_check_established(sk,
- rover,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
- head, rover);
- if (tb == NULL) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- /* All locks still held and bhs disabled */
- inet_bind_hash(sk, tb, rover);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(rover);
- __inet_hash(&dccp_hashinfo, sk, 0);
- }
- spin_unlock(&head->lock);
-
- if (tw != NULL) {
- inet_twsk_deschedule(tw, &dccp_death_row);
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
- dccp_hashinfo.bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
- __inet_hash(&dccp_hashinfo, sk, 0);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __dccp_v4_check_established(sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
-
-static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len)
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
@@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
inet->dport = usin->sin_port;
inet->daddr = daddr;
- dp->dccps_ext_header_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt != NULL)
- dp->dccps_ext_header_len = inet->opt->optlen;
+ inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
/*
* Socket identity is still unknown (sport may be zero).
* However we set state to DCCP_REQUESTING and not releasing socket
@@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
* complete initialization after this.
*/
dccp_set_state(sk, DCCP_REQUESTING);
- err = dccp_v4_hash_connect(sk);
+ err = inet_hash_connect(&dccp_death_row, sk);
if (err != 0)
goto failure;
@@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
usin->sin_port);
dccp_update_gss(sk, dp->dccps_iss);
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
-
inet->id = dp->dccps_iss ^ jiffies;
err = dccp_connect(sk);
@@ -316,6 +152,8 @@ failure:
goto out;
}
+EXPORT_SYMBOL_GPL(dccp_v4_connect);
+
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
@@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- dp->dccps_pmtu_cookie > mtu) {
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
dccp_sync_mss(sk, mtu);
/*
@@ -606,6 +444,17 @@ out:
sock_put(sk);
}
+/* This routine computes an IPv4 DCCP checksum. */
+void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_send_check);
+
int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
{
struct sk_buff *skb;
@@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk,
dccp_hdr(skb)->dccph_sport);
}
-static inline int dccp_bad_service_code(const struct sock *sk,
- const __u32 service)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- if (dp->dccps_service == service)
- return 0;
- return !dccp_list_has_service(dp->dccps_service_list, service);
-}
-
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
@@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
__u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
- struct dst_entry *dst = NULL;
/* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
if (((struct rtable *)skb->dst)->rt_flags &
@@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
- /* FIXME: Merge Aristeu's option parsing code when ready */
req->rcv_wnd = 100; /* Fake, option parsing will get the
right value */
ireq->opt = NULL;
@@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
dreq->dreq_service = service;
- if (dccp_v4_send_response(sk, req, dst))
+ if (dccp_v4_send_response(sk, req, NULL))
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
return 0;
drop_and_free:
- /*
- * FIXME: should be reqsk_free after implementing req->rsk_ops
- */
- __reqsk_free(req);
+ reqsk_free(req);
drop:
DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
dcb->dccpd_reset_code = reset_code;
return -1;
}
+EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
+
/*
* The three way handshake has completed - we got a valid ACK or DATAACK -
* now create the new socket.
@@ -792,6 +628,8 @@ exit:
return NULL;
}
+EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
+
static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -1011,7 +849,9 @@ discard:
return 0;
}
-static inline int dccp_invalid_packet(struct sk_buff *skb)
+EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
+
+int dccp_invalid_packet(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
@@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb)
return 1;
}
- /* If the header checksum is incorrect, drop packet and return */
- if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
- skb->nh.iph->daddr) < 0) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
- "incorrect\n");
- return 1;
- }
-
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_invalid_packet);
+
/* this is called when real data arrives */
int dccp_v4_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
struct sock *sk;
- int rc;
/* Step 1: Check header basics: */
if (dccp_invalid_packet(skb))
goto discard_it;
+ /* If the header checksum is incorrect, drop packet and return */
+ if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
+ skb->nh.iph->daddr) < 0) {
+ LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n",
+ __FUNCTION__);
+ goto discard_it;
+ }
+
dh = dccp_hdr(skb);
DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
@@ -1143,28 +984,10 @@ int dccp_v4_rcv(struct sk_buff *skb)
goto do_time_wait;
}
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- dccp_pr_debug("xfrm4_policy_check failed\n");
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- }
-
- if (sk_filter(sk, skb, 0)) {
- dccp_pr_debug("sk_filter failed\n");
- goto discard_and_relse;
- }
-
- skb->dev = NULL;
-
- bh_lock_sock(sk);
- rc = 0;
- if (!sock_owned_by_user(sk))
- rc = dccp_v4_do_rcv(sk, skb);
- else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
- sock_put(sk);
- return rc;
+ return sk_receive_skb(sk, skb);
no_dccp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -1194,9 +1017,23 @@ do_time_wait:
goto no_dccp_socket;
}
-static int dccp_v4_init_sock(struct sock *sk)
+struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v4_conn_request,
+ .syn_recv_sock = dccp_v4_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+};
+
+int dccp_v4_init_sock(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
static int dccp_ctl_socket_init = 1;
dccp_options_init(&dp->dccps_options);
@@ -1236,9 +1073,11 @@ static int dccp_v4_init_sock(struct sock *sk)
dccp_ctl_socket_init = 0;
dccp_init_xmit_timers(sk);
- inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+ icsk->icsk_rto = DCCP_TIMEOUT_INIT;
sk->sk_state = DCCP_CLOSED;
sk->sk_write_space = dccp_write_space;
+ icsk->icsk_af_ops = &dccp_ipv4_af_ops;
+ icsk->icsk_sync_mss = dccp_sync_mss;
dp->dccps_mss_cache = 536;
dp->dccps_role = DCCP_ROLE_UNDEFINED;
dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
@@ -1246,7 +1085,9 @@ static int dccp_v4_init_sock(struct sock *sk)
return 0;
}
-static int dccp_v4_destroy_sock(struct sock *sk)
+EXPORT_SYMBOL_GPL(dccp_v4_init_sock);
+
+int dccp_v4_destroy_sock(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -1279,6 +1120,8 @@ static int dccp_v4_destroy_sock(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock);
+
static void dccp_v4_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->opt);
@@ -1293,7 +1136,11 @@ static struct request_sock_ops dccp_request_sock_ops = {
.send_reset = dccp_v4_ctl_send_reset,
};
-struct proto dccp_v4_prot = {
+static struct timewait_sock_ops dccp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct inet_timewait_sock),
+};
+
+struct proto dccp_prot = {
.name = "DCCP",
.owner = THIS_MODULE,
.close = dccp_close,
@@ -1307,7 +1154,7 @@ struct proto dccp_v4_prot = {
.recvmsg = dccp_recvmsg,
.backlog_rcv = dccp_v4_do_rcv,
.hash = dccp_v4_hash,
- .unhash = dccp_v4_unhash,
+ .unhash = dccp_unhash,
.accept = inet_csk_accept,
.get_port = dccp_v4_get_port,
.shutdown = dccp_shutdown,
@@ -1316,5 +1163,7 @@ struct proto dccp_v4_prot = {
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
.rsk_prot = &dccp_request_sock_ops,
- .twsk_obj_size = sizeof(struct inet_timewait_sock),
+ .twsk_prot = &dccp_timewait_sock_ops,
};
+
+EXPORT_SYMBOL_GPL(dccp_prot);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 000000000000..c609dc78f487
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1261 @@
+/*
+ * DCCP over IPv6
+ * Linux INET6 implementation
+ *
+ * Based on net/dccp6/ipv6.c
+ *
+ * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/xfrm.h>
+
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/xfrm.h>
+
+#include "dccp.h"
+#include "ipv6.h"
+
+static void dccp_v6_ctl_send_reset(struct sk_buff *skb);
+static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
+ struct request_sock *req);
+static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb);
+
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
+static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
+
+static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
+{
+ return inet_csk_get_port(&dccp_hashinfo, sk, snum,
+ inet6_csk_bind_conflict);
+}
+
+static void dccp_v6_hash(struct sock *sk)
+{
+ if (sk->sk_state != DCCP_CLOSED) {
+ if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
+ dccp_prot.hash(sk);
+ return;
+ }
+ local_bh_disable();
+ __inet6_hash(&dccp_hashinfo, sk);
+ local_bh_enable();
+ }
+}
+
+static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ unsigned long base)
+{
+ return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base);
+}
+
+static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
+ skb->nh.ipv6h->saddr.s6_addr32,
+ dh->dccph_dport,
+ dh->dccph_sport);
+ else
+ return secure_dccp_sequence_number(skb->nh.iph->daddr,
+ skb->nh.iph->saddr,
+ dh->dccph_dport,
+ dh->dccph_sport);
+}
+
+static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p = NULL, final;
+ struct flowi fl;
+ struct dst_entry *dst;
+ int addr_type;
+ int err;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT;
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ memset(&fl, 0, sizeof(fl));
+
+ if (np->sndflow) {
+ fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl.fl6_flowlabel);
+ if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+ struct ip6_flowlabel *flowlabel;
+ flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
+ fl6_sock_release(flowlabel);
+ }
+ }
+
+ /*
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+
+ if (ipv6_addr_any(&usin->sin6_addr))
+ usin->sin6_addr.s6_addr[15] = 0x1;
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+ if(addr_type & IPV6_ADDR_MULTICAST)
+ return -ENETUNREACH;
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ /* If interface is set while binding, indices
+ * must coincide.
+ */
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != usin->sin6_scope_id)
+ return -EINVAL;
+
+ sk->sk_bound_dev_if = usin->sin6_scope_id;
+ }
+
+ /* Connect to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if)
+ return -EINVAL;
+ }
+
+ ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
+ np->flow_label = fl.fl6_flowlabel;
+
+ /*
+ * DCCP over IPv4
+ */
+
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
+ struct sockaddr_in sin;
+
+ SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = usin->sin6_port;
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+ icsk->icsk_af_ops = &dccp_ipv6_mapped;
+ sk->sk_backlog_rcv = dccp_v4_do_rcv;
+
+ err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+
+ if (err) {
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &dccp_ipv6_af_ops;
+ sk->sk_backlog_rcv = dccp_v6_do_rcv;
+ goto failure;
+ } else {
+ ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
+ inet->saddr);
+ ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
+ inet->rcv_saddr);
+ }
+
+ return err;
+ }
+
+ if (!ipv6_addr_any(&np->rcv_saddr))
+ saddr = &np->rcv_saddr;
+
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = usin->sin6_port;
+ fl.fl_ip_sport = inet->sport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ err = ip6_dst_lookup(sk, &dst, &fl);
+ if (err)
+ goto failure;
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto failure;
+
+ if (saddr == NULL) {
+ saddr = &fl.fl6_src;
+ ipv6_addr_copy(&np->rcv_saddr, saddr);
+ }
+
+ /* set the source address */
+ ipv6_addr_copy(&np->saddr, saddr);
+ inet->rcv_saddr = LOOPBACK4_IPV6;
+
+ ip6_dst_store(sk, dst, NULL);
+
+ icsk->icsk_ext_hdr_len = 0;
+ if (np->opt)
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
+
+ inet->dport = usin->sin6_port;
+
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = inet6_hash_connect(&dccp_death_row, sk);
+ if (err)
+ goto late_failure;
+ /* FIXME */
+#if 0
+ dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32,
+ np->daddr.s6_addr32,
+ inet->sport,
+ inet->dport);
+#endif
+ err = dccp_connect(sk);
+ if (err)
+ goto late_failure;
+
+ return 0;
+
+late_failure:
+ dccp_set_state(sk, DCCP_CLOSED);
+ __sk_dst_reset(sk);
+failure:
+ inet->dport = 0;
+ sk->sk_route_caps = 0;
+ return err;
+}
+
+static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ int type, int code, int offset, __u32 info)
+{
+ struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ struct ipv6_pinfo *np;
+ struct sock *sk;
+ int err;
+ __u64 seq;
+
+ sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
+ &hdr->saddr, dh->dccph_sport, skb->dev->ifindex);
+
+ if (sk == NULL) {
+ ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ return;
+ }
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ return;
+ }
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
+ np = inet6_sk(sk);
+
+ if (type == ICMPV6_PKT_TOOBIG) {
+ struct dst_entry *dst = NULL;
+
+ if (sock_owned_by_user(sk))
+ goto out;
+ if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
+ goto out;
+
+ /* icmp should have updated the destination cache entry */
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi fl;
+
+ /* BUGGG_FUTURE: Again, it is not clear how
+ to handle rthdr case. Ignore this complexity
+ for now.
+ */
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet->dport;
+ fl.fl_ip_sport = inet->sport;
+
+ if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
+ sk->sk_err_soft = -err;
+ goto out;
+ }
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_err_soft = -err;
+ goto out;
+ }
+
+ } else
+ dst_hold(dst);
+
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ dccp_sync_mss(sk, dst_mtu(dst));
+ } /* else let the usual retransmit timer handle it */
+ dst_release(dst);
+ goto out;
+ }
+
+ icmpv6_err_convert(type, code, &err);
+
+ seq = DCCP_SKB_CB(skb)->dccpd_seq;
+ /* Might be for an request_sock */
+ switch (sk->sk_state) {
+ struct request_sock *req, **prev;
+ case DCCP_LISTEN:
+ if (sock_owned_by_user(sk))
+ goto out;
+
+ req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
+ &hdr->daddr, &hdr->saddr,
+ inet6_iif(skb));
+ if (!req)
+ goto out;
+
+ /* ICMPs are not backlogged, hence we cannot get
+ * an established socket here.
+ */
+ BUG_TRAP(req->sk == NULL);
+
+ if (seq != dccp_rsk(req)->dreq_iss) {
+ NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ inet_csk_reqsk_queue_drop(sk, req, prev);
+ goto out;
+
+ case DCCP_REQUESTING:
+ case DCCP_RESPOND: /* Cannot happen.
+ It can, it SYNs are crossed. --ANK */
+ if (!sock_owned_by_user(sk)) {
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ sk->sk_err = err;
+ /*
+ * Wake people up to see the error
+ * (see connect in sock.c)
+ */
+ sk->sk_error_report(sk);
+
+ dccp_done(sk);
+ } else
+ sk->sk_err_soft = err;
+ goto out;
+ }
+
+ if (!sock_owned_by_user(sk) && np->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else
+ sk->sk_err_soft = err;
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *skb;
+ struct ipv6_txoptions *opt = NULL;
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+ int err = -1;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+ fl.fl6_flowlabel = 0;
+ fl.oif = ireq6->iif;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+ fl.fl_ip_sport = inet_sk(sk)->sport;
+
+ if (dst == NULL) {
+ opt = np->opt;
+ if (opt == NULL &&
+ np->rxopt.bits.osrcrt == 2 &&
+ ireq6->pktopts) {
+ struct sk_buff *pktopts = ireq6->pktopts;
+ struct inet6_skb_parm *rxopt = IP6CB(pktopts);
+ if (rxopt->srcrt)
+ opt = ipv6_invert_rthdr(sk,
+ (struct ipv6_rt_hdr *)(pktopts->nh.raw +
+ rxopt->srcrt));
+ }
+
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ err = ip6_dst_lookup(sk, &dst, &fl);
+ if (err)
+ goto done;
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto done;
+ }
+
+ skb = dccp_make_response(sk, dst, req);
+ if (skb != NULL) {
+ struct dccp_hdr *dh = dccp_hdr(skb);
+ dh->dccph_checksum = dccp_v6_check(dh, skb->len,
+ &ireq6->loc_addr,
+ &ireq6->rmt_addr,
+ csum_partial((char *)dh,
+ skb->len,
+ skb->csum));
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ err = ip6_xmit(sk, skb, &fl, opt, 0);
+ if (err == NET_XMIT_CN)
+ err = 0;
+ }
+
+done:
+ if (opt && opt != np->opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ return err;
+}
+
+static void dccp_v6_reqsk_destructor(struct request_sock *req)
+{
+ if (inet6_rsk(req)->pktopts != NULL)
+ kfree_skb(inet6_rsk(req)->pktopts);
+}
+
+static struct request_sock_ops dccp6_request_sock_ops = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct dccp6_request_sock),
+ .rtx_syn_ack = dccp_v6_send_response,
+ .send_ack = dccp_v6_reqsk_send_ack,
+ .destructor = dccp_v6_reqsk_destructor,
+ .send_reset = dccp_v6_ctl_send_reset,
+};
+
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
+};
+
+static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr,
+ len, IPPROTO_DCCP,
+ csum_partial((char *)dh,
+ dh->dccph_doff << 2,
+ skb->csum));
+}
+
+static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
+{
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct sk_buff *skb;
+ struct flowi fl;
+ u64 seqno;
+
+ if (rxdh->dccph_type == DCCP_PKT_RESET)
+ return;
+
+ if (!ipv6_unicast_destination(rxskb))
+ return;
+
+ /*
+ * We need to grab some memory, and put together an RST,
+ * and then put it into the queue to be sent.
+ */
+
+ skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_reset_len, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_reset_len);
+
+ skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_reset_len);
+
+ /* Swap the send and the receive. */
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_reset_len / 4;
+ dh->dccph_x = 1;
+ dccp_hdr_reset(skb)->dccph_reset_code =
+ DCCP_SKB_CB(rxskb)->dccpd_reset_code;
+
+ /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
+ seqno = 0;
+ if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
+
+ dccp_hdr_set_seq(dh, seqno);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ memset(&fl, 0, sizeof(fl));
+ ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
+ dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
+ sizeof(*dh), IPPROTO_DCCP,
+ skb->csum);
+ fl.proto = IPPROTO_DCCP;
+ fl.oif = inet6_iif(rxskb);
+ fl.fl_ip_dport = dh->dccph_dport;
+ fl.fl_ip_sport = dh->dccph_sport;
+
+ /* sk = NULL, but it is safe for now. RST socket required. */
+ if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
+ if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
+ ip6_xmit(NULL, skb, &fl, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ return;
+ }
+ }
+
+ kfree_skb(skb);
+}
+
+static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb)
+{
+ struct flowi fl;
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_ack_bits);
+ struct sk_buff *skb;
+
+ skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_ack_len, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
+ dccp_hdr_ack_len);
+
+ skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_ack_len);
+
+ /* Build DCCP header and checksum it. */
+ dh->dccph_type = DCCP_PKT_ACK;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_ack_len / 4;
+ dh->dccph_x = 1;
+
+ dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ memset(&fl, 0, sizeof(fl));
+ ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
+
+ /* FIXME: calculate checksum, IPv4 also should... */
+
+ fl.proto = IPPROTO_DCCP;
+ fl.oif = inet6_iif(rxskb);
+ fl.fl_ip_dport = dh->dccph_dport;
+ fl.fl_ip_sport = dh->dccph_sport;
+
+ if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
+ if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
+ ip6_xmit(NULL, skb, &fl, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ return;
+ }
+ }
+
+ kfree_skb(skb);
+}
+
+static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
+ struct request_sock *req)
+{
+ dccp_v6_ctl_send_ack(skb);
+}
+
+static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const struct ipv6hdr *iph = skb->nh.ipv6h;
+ struct sock *nsk;
+ struct request_sock **prev;
+ /* Find possible connection requests. */
+ struct request_sock *req = inet6_csk_search_req(sk, &prev,
+ dh->dccph_sport,
+ &iph->saddr,
+ &iph->daddr,
+ inet6_iif(skb));
+ if (req != NULL)
+ return dccp_check_req(sk, skb, req, prev);
+
+ nsk = __inet6_lookup_established(&dccp_hashinfo,
+ &iph->saddr, dh->dccph_sport,
+ &iph->daddr, ntohs(dh->dccph_dport),
+ inet6_iif(skb));
+
+ if (nsk != NULL) {
+ if (nsk->sk_state != DCCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
+ return NULL;
+ }
+
+ return sk;
+}
+
+static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct dccp_sock dp;
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ struct inet6_request_sock *ireq6;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return dccp_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+ if (dccp_bad_service_code(sk, service)) {
+ reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ goto drop;
+ }
+ /*
+ * There are no SYN attacks on IPv6, yet...
+ */
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot);
+ if (req == NULL)
+ goto drop;
+
+ /* FIXME: process options */
+
+ dccp_openreq_init(req, &dp, skb);
+
+ ireq6 = inet6_rsk(req);
+ ireq = inet_rsk(req);
+ ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
+ req->rcv_wnd = 100; /* Fake, option parsing will get the
+ right value */
+ ireq6->pktopts = NULL;
+
+ if (ipv6_opt_accepted(sk, skb) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ atomic_inc(&skb->users);
+ ireq6->pktopts = skb;
+ }
+ ireq6->iif = sk->sk_bound_dev_if;
+
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq6->iif = inet6_iif(skb);
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * In fact we defer setting S.GSR, S.SWL, S.SWH to
+ * dccp_create_openreq_child.
+ */
+ dreq = dccp_rsk(req);
+ dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_iss = dccp_v6_init_sequence(sk, skb);
+ dreq->dreq_service = service;
+
+ if (dccp_v6_send_response(sk, req, NULL))
+ goto drop_and_free;
+
+ inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ dcb->dccpd_reset_code = reset_code;
+ return -1;
+}
+
+static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct inet_sock *newinet;
+ struct dccp_sock *newdp;
+ struct dccp6_sock *newdp6;
+ struct sock *newsk;
+ struct ipv6_txoptions *opt;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ /*
+ * v6 mapped
+ */
+
+ newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+ if (newsk == NULL)
+ return NULL;
+
+ newdp6 = (struct dccp6_sock *)newsk;
+ newdp = dccp_sk(newsk);
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = &newdp6->inet6;
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
+ newinet->daddr);
+
+ ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
+ newinet->saddr);
+
+ ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+
+ inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
+ newsk->sk_backlog_rcv = dccp_v4_do_rcv;
+ newnp->pktoptions = NULL;
+ newnp->opt = NULL;
+ newnp->mcast_oif = inet6_iif(skb);
+ newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+ * here, dccp_create_openreq_child now does this for us, see the comment in
+ * that function for the gory details. -acme
+ */
+
+ /* It is tricky place. Until this moment IPv4 tcp
+ worked with IPv6 icsk.icsk_af_ops.
+ Sync it now.
+ */
+ dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
+
+ return newsk;
+ }
+
+ opt = np->opt;
+
+ if (sk_acceptq_is_full(sk))
+ goto out_overflow;
+
+ if (np->rxopt.bits.osrcrt == 2 &&
+ opt == NULL && ireq6->pktopts) {
+ struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
+ if (rxopt->srcrt)
+ opt = ipv6_invert_rthdr(sk,
+ (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
+ rxopt->srcrt));
+ }
+
+ if (dst == NULL) {
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+ ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+ fl.fl_ip_sport = inet_sk(sk)->sport;
+
+ if (ip6_dst_lookup(sk, &dst, &fl))
+ goto out;
+
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto out;
+ }
+
+ newsk = dccp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+ goto out;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks
+ * count here, dccp_create_openreq_child now does this for us, see the
+ * comment in that function for the gory details. -acme
+ */
+
+ ip6_dst_store(newsk, dst, NULL);
+ newsk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+
+ newdp6 = (struct dccp6_sock *)newsk;
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = &newdp6->inet6;
+ newdp = dccp_sk(newsk);
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
+ ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
+ ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
+ newsk->sk_bound_dev_if = ireq6->iif;
+
+ /* Now IPv6 options...
+
+ First: no IPv4 options.
+ */
+ newinet->opt = NULL;
+
+ /* Clone RX bits */
+ newnp->rxopt.all = np->rxopt.all;
+
+ /* Clone pktoptions received with SYN */
+ newnp->pktoptions = NULL;
+ if (ireq6->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
+ kfree_skb(ireq6->pktopts);
+ ireq6->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
+ newnp->opt = NULL;
+ newnp->mcast_oif = inet6_iif(skb);
+ newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+
+ /* Clone native IPv6 options from listening socket (if any)
+
+ Yes, keeping reference count would be much more clever,
+ but we make one more one thing there: reattach optmem
+ to newsk.
+ */
+ if (opt) {
+ newnp->opt = ipv6_dup_options(newsk, opt);
+ if (opt != np->opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ }
+
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ if (newnp->opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+ newnp->opt->opt_flen);
+
+ dccp_sync_mss(newsk, dst_mtu(dst));
+
+ newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+
+ __inet6_hash(&dccp_hashinfo, newsk);
+ inet_inherit_port(&dccp_hashinfo, sk, newsk);
+
+ return newsk;
+
+out_overflow:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+out:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ if (opt && opt != np->opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ dst_release(dst);
+ return NULL;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *opt_skb = NULL;
+
+ /* Imagine: socket is IPv6. IPv4 packet arrives,
+ goes to IPv4 receive handler and backlogged.
+ From backlog it always goes here. Kerboom...
+ Fortunately, dccp_rcv_established and rcv_established
+ handle them correctly, but it is not case with
+ dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
+ */
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return dccp_v4_do_rcv(sk, skb);
+
+ if (sk_filter(sk, skb, 0))
+ goto discard;
+
+ /*
+ * socket locking is here for SMP purposes as backlog rcv
+ * is currently called with bh processing disabled.
+ */
+
+ /* Do Stevens' IPV6_PKTOPTIONS.
+
+ Yes, guys, it is the only place in our code, where we
+ may make it not affecting IPv4.
+ The rest of code is protocol independent,
+ and I do not like idea to uglify IPv4.
+
+ Actually, all the idea behind IPV6_PKTOPTIONS
+ looks not very well thought. For now we latch
+ options, received in the last packet, enqueued
+ by tcp. Feel free to propose better solution.
+ --ANK (980728)
+ */
+ if (np->rxopt.all)
+ opt_skb = skb_clone(skb, GFP_ATOMIC);
+
+ if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
+ goto reset;
+ return 0;
+ }
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ struct sock *nsk = dccp_v6_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ /*
+ * Queue it on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket..
+ */
+ if(nsk != sk) {
+ if (dccp_child_process(sk, nsk, skb))
+ goto reset;
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ return 0;
+ }
+ }
+
+ if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
+ goto reset;
+ return 0;
+
+reset:
+ dccp_v6_ctl_send_reset(skb);
+discard:
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+{
+ const struct dccp_hdr *dh;
+ struct sk_buff *skb = *pskb;
+ struct sock *sk;
+
+ /* Step 1: Check header basics: */
+
+ if (dccp_invalid_packet(skb))
+ goto discard_it;
+
+ dh = dccp_hdr(skb);
+
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
+ DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+ if (dccp_packet_without_ack(skb))
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+ else
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+
+ /* Step 2:
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
+ dh->dccph_sport,
+ &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
+ inet6_iif(skb));
+ /*
+ * Step 2:
+ * If no socket ...
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk == NULL)
+ goto no_dccp_socket;
+
+ /*
+ * Step 2:
+ * ... or S.state == TIMEWAIT,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+
+ if (sk->sk_state == DCCP_TIME_WAIT)
+ goto do_time_wait;
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+ return sk_receive_skb(sk, skb) ? -1 : 0;
+
+no_dccp_socket:
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+ /*
+ * Step 2:
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (dh->dccph_type != DCCP_PKT_RESET) {
+ DCCP_SKB_CB(skb)->dccpd_reset_code =
+ DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v6_ctl_send_reset(skb);
+ }
+discard_it:
+
+ /*
+ * Discard frame
+ */
+
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ goto no_dccp_socket;
+}
+
+static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+ .queue_xmit = inet6_csk_xmit,
+ .send_check = dccp_v6_send_check,
+ .rebuild_header = inet6_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct ipv6hdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6)
+};
+
+/*
+ * DCCP over IPv4 via INET6 API
+ */
+static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6)
+};
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+static int dccp_v6_init_sock(struct sock *sk)
+{
+ int err = dccp_v4_init_sock(sk);
+
+ if (err == 0)
+ inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+
+ return err;
+}
+
+static int dccp_v6_destroy_sock(struct sock *sk)
+{
+ dccp_v4_destroy_sock(sk);
+ return inet6_destroy_sock(sk);
+}
+
+static struct proto dccp_v6_prot = {
+ .name = "DCCPv6",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v6_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v6_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v6_do_rcv,
+ .hash = dccp_v6_hash,
+ .unhash = dccp_unhash,
+ .accept = inet_csk_accept,
+ .get_port = dccp_v6_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_v6_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp6_sock),
+ .rsk_prot = &dccp6_request_sock_ops,
+ .twsk_prot = &dccp6_timewait_sock_ops,
+};
+
+static struct inet6_protocol dccp_v6_protocol = {
+ .handler = dccp_v6_rcv,
+ .err_handler = dccp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+static struct proto_ops inet6_dccp_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet6_getname,
+ .poll = dccp_poll,
+ .ioctl = inet6_ioctl,
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct inet_protosw dccp_v6_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v6_prot,
+ .ops = &inet6_dccp_ops,
+ .capability = -1,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+static int __init dccp_v6_init(void)
+{
+ int err = proto_register(&dccp_v6_prot, 1);
+
+ if (err != 0)
+ goto out;
+
+ err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ if (err != 0)
+ goto out_unregister_proto;
+
+ inet6_register_protosw(&dccp_v6_protosw);
+out:
+ return err;
+out_unregister_proto:
+ proto_unregister(&dccp_v6_prot);
+ goto out;
+}
+
+static void __exit dccp_v6_exit(void)
+{
+ inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ inet6_unregister_protosw(&dccp_v6_protosw);
+ proto_unregister(&dccp_v6_prot);
+}
+
+module_init(dccp_v6_init);
+module_exit(dccp_v6_exit);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 000000000000..e4d4e9309270
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,37 @@
+#ifndef _DCCP_IPV6_H
+#define _DCCP_IPV6_H
+/*
+ * net/dccp/ipv6.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/ipv6.h>
+
+struct dccp6_sock {
+ struct dccp_sock dccp;
+ /*
+ * ipv6_pinfo has to be the last member of dccp6_sock,
+ * see inet6_sk_generic.
+ */
+ struct ipv6_pinfo inet6;
+};
+
+struct dccp6_request_sock {
+ struct dccp_request_sock dccp;
+ struct inet6_request_sock inet6;
+};
+
+struct dccp6_timewait_sock {
+ struct inet_timewait_sock inet;
+ struct inet6_timewait_sock tw6;
+};
+
+#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 1393461898bb..29261fc198e7 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = {
(unsigned long)&dccp_death_row),
};
+EXPORT_SYMBOL_GPL(dccp_death_row);
+
void dccp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
@@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (tw->tw_family == PF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_timewait_sock *tw6;
+
+ tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+ tw6 = inet6_twsk((struct sock *)tw);
+ ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6only = np->ipv6only;
+ }
+#endif
/* Linkage updates. */
__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
@@ -170,6 +183,8 @@ out_free:
return newsk;
}
+EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
+
/*
* Process an incoming packet for RESPOND sockets represented
* as an request_sock.
@@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
goto drop;
}
- child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
if (child == NULL)
goto listen_overflow;
@@ -236,6 +251,8 @@ drop:
goto out;
}
+EXPORT_SYMBOL_GPL(dccp_check_req);
+
/*
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
@@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child,
sock_put(child);
return ret;
}
+
+EXPORT_SYMBOL_GPL(dccp_child_process);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 74ff87025878..efd7ffb903a1 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <net/inet_sock.h>
#include <net/sock.h>
#include "ackvec.h"
@@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (likely(skb != NULL)) {
const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
struct dccp_hdr *dh;
@@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
break;
}
- dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
- inet->daddr);
+ icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (set_ack)
dccp_event_ack_sent(sk);
@@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = ip_queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(skb, 0);
if (err <= 0)
return err;
@@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- int mss_now;
-
- /*
- * FIXME: we really should be using the af_specific thing to support
- * IPv6.
- * mss_now = pmtu - tp->af_specific->net_header_len -
- * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
- */
- mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
- sizeof(struct dccp_hdr_ext);
+ int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
+ sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
/* Now subtract optional transport overhead */
- mss_now -= dp->dccps_ext_header_len;
+ mss_now -= icsk->icsk_ext_hdr_len;
/*
* FIXME: this should come from the CCID infrastructure, where, say,
@@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
/* And store cached results */
- dp->dccps_pmtu_cookie = pmtu;
+ icsk->icsk_pmtu_cookie = pmtu;
dp->dccps_mss_cache = mss_now;
return mss_now;
}
+EXPORT_SYMBOL_GPL(dccp_sync_mss);
+
void dccp_write_space(struct sock *sk)
{
read_lock(&sk->sk_callback_lock);
@@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
- if (inet_sk_rebuild_header(sk) != 0)
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
return -EHOSTUNREACH; /* Routing failure or similar. */
return dccp_transmit_skb(sk, (skb_cloned(skb) ?
@@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
return skb;
}
+EXPORT_SYMBOL_GPL(dccp_make_response);
+
struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
const enum dccp_reset_codes code)
@@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
*/
static inline void dccp_connect_init(struct sock *sk)
{
+ struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk)
dccp_sync_mss(sk, dst_mtu(dst));
- /*
- * FIXME: set dp->{dccps_swh,dccps_swl}, with
- * something like dccp_inc_seq
- */
+ dccp_update_gss(sk, dp->dccps_iss);
+ /*
+ * SWL and AWL are initially adjusted so that they are not less than
+ * the initial Sequence Numbers received and sent, respectively:
+ * SWL := max(GSR + 1 - floor(W/4), ISR),
+ * AWL := max(GSS - W' + 1, ISS).
+ * These adjustments MUST be applied only at the beginning of the
+ * connection.
+ */
+ dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
icsk->icsk_retransmits = 0;
}
@@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_connect);
+
void dccp_send_ack(struct sock *sk)
{
/* If we have been reset, we may not send again. */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8a6b2a9e4581..65b11ea90d85 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,7 +24,7 @@
#include <net/checksum.h>
#include <net/inet_common.h>
-#include <net/ip.h>
+#include <net/inet_sock.h>
#include <net/protocol.h>
#include <net/sock.h>
#include <net/xfrm.h>
@@ -34,15 +34,18 @@
#include <linux/timer.h>
#include <linux/delay.h>
#include <linux/poll.h>
-#include <linux/dccp.h>
#include "ccid.h"
#include "dccp.h"
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+EXPORT_SYMBOL_GPL(dccp_statistics);
+
atomic_t dccp_orphan_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(dccp_orphan_count);
+
static struct net_protocol dccp_protocol = {
.handler = dccp_v4_rcv,
.err_handler = dccp_v4_err,
@@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags)
return err;
}
+EXPORT_SYMBOL_GPL(dccp_disconnect);
+
/*
* Wait for a DCCP event.
*
@@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags)
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
-static unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
{
unsigned int mask;
struct sock *sk = sock->sk;
@@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock,
return mask;
}
+EXPORT_SYMBOL_GPL(dccp_poll);
+
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
dccp_pr_debug("entry\n");
return -ENOIOCTLCMD;
}
+EXPORT_SYMBOL_GPL(dccp_ioctl);
+
static int dccp_setsockopt_service(struct sock *sk, const u32 service,
char __user *optval, int optlen)
{
@@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
int val;
if (level != SOL_DCCP)
- return ip_setsockopt(sk, level, optname, optval, optlen);
+ return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+ optname, optval,
+ optlen);
if (optlen < sizeof(int))
return -EINVAL;
@@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
return err;
}
+EXPORT_SYMBOL_GPL(dccp_setsockopt);
+
static int dccp_getsockopt_service(struct sock *sk, int len,
u32 __user *optval,
int __user *optlen)
@@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
int val, len;
if (level != SOL_DCCP)
- return ip_getsockopt(sk, level, optname, optval, optlen);
-
+ return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+ optname, optval,
+ optlen);
if (get_user(len, optlen))
return -EFAULT;
@@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_getsockopt);
+
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -410,6 +426,8 @@ out_discard:
goto out_release;
}
+EXPORT_SYMBOL_GPL(dccp_sendmsg);
+
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags, int *addr_len)
{
@@ -507,7 +525,9 @@ out:
return len;
}
-static int inet_dccp_listen(struct socket *sock, int backlog)
+EXPORT_SYMBOL_GPL(dccp_recvmsg);
+
+int inet_dccp_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
@@ -543,6 +563,8 @@ out:
return err;
}
+EXPORT_SYMBOL_GPL(inet_dccp_listen);
+
static const unsigned char dccp_new_state[] = {
/* current state: new state: action: */
[0] = DCCP_CLOSED,
@@ -648,12 +670,16 @@ adjudge_to_death:
sock_put(sk);
}
+EXPORT_SYMBOL_GPL(dccp_close);
+
void dccp_shutdown(struct sock *sk, int how)
{
dccp_pr_debug("entry\n");
}
-static struct proto_ops inet_dccp_ops = {
+EXPORT_SYMBOL_GPL(dccp_shutdown);
+
+static const struct proto_ops inet_dccp_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops;
static struct inet_protosw dccp_v4_protosw = {
.type = SOCK_DCCP,
.protocol = IPPROTO_DCCP,
- .prot = &dccp_v4_prot,
+ .prot = &dccp_prot,
.ops = &inet_dccp_ops,
.capability = -1,
.no_check = 0,
- .flags = 0,
+ .flags = INET_PROTOSW_ICSK,
};
/*
@@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
int dccp_debug;
module_param(dccp_debug, int, 0444);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+
+EXPORT_SYMBOL_GPL(dccp_debug);
#endif
static int __init dccp_init(void)
{
unsigned long goal;
int ehash_order, bhash_order, i;
- int rc = proto_register(&dccp_v4_prot, 1);
+ int rc = proto_register(&dccp_prot, 1);
if (rc)
goto out;
@@ -869,7 +897,7 @@ out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
dccp_hashinfo.bind_bucket_cachep = NULL;
out_proto_unregister:
- proto_unregister(&dccp_v4_prot);
+ proto_unregister(&dccp_prot);
goto out;
}
@@ -892,7 +920,7 @@ static void __exit dccp_fini(void)
get_order(dccp_hashinfo.ehash_size *
sizeof(struct inet_ehash_bucket)));
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- proto_unregister(&dccp_v4_prot);
+ proto_unregister(&dccp_prot);
}
module_init(dccp_init);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d402e9020c68..78ec5344be86 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk);
#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
-static struct proto_ops dn_proto_ops;
+static const struct proto_ops dn_proto_ops;
static DEFINE_RWLOCK(dn_hash_lock);
static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
static struct hlist_head dn_wild_sk;
@@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
default:
- err = dev_ioctl(cmd, (void __user *)arg);
+ err = -ENOIOCTLCMD;
break;
}
@@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops dn_proto_ops = {
+static const struct proto_ops dn_proto_ops = {
.family = AF_DECnet,
.owner = THIS_MODULE,
.release = dn_release,
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 8d0cc3cf3e49..33ab256cfd4a 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb)
}
}
- if (!dn_db->router) {
- dn_db->router = neigh_clone(neigh);
- } else {
- if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
- neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+ /* Only use routers in our area */
+ if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) {
+ if (!dn_db->router) {
+ dn_db->router = neigh_clone(neigh);
+ } else {
+ if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
+ neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+ }
}
write_unlock(&neigh->lock);
neigh_release(neigh);
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 369f25b60f3f..44bda85e678f 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
got_it:
if (sk != NULL) {
struct dn_scp *scp = DN_SK(sk);
- int ret;
/* Reset backoff */
scp->nsp_rxtshift = 0;
@@ -807,21 +806,7 @@ got_it:
goto free_out;
}
- bh_lock_sock(sk);
- ret = NET_RX_SUCCESS;
- if (decnet_debug_level & 8)
- printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n",
- (int)cb->rt_flags, (int)cb->nsp_flags,
- (int)cb->src_port, (int)cb->dst_port,
- !!sock_owned_by_user(sk));
- if (!sock_owned_by_user(sk))
- ret = dn_nsp_backlog_rcv(sk, skb);
- else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
- sock_put(sk);
-
- return ret;
+ return sk_receive_skb(sk, skb);
}
return dn_nsp_no_socket(skb, reason);
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 34fdac51df96..c792994d7952 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
#include <linux/if_arp.h>
#include <linux/wireless.h>
#include <linux/skbuff.h>
+#include <linux/udp.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <linux/stat.h>
@@ -45,7 +46,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-static struct proto_ops econet_ops;
+static const struct proto_ops econet_ops;
static struct hlist_head econet_sklist;
static DEFINE_RWLOCK(econet_lock);
@@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256];
#define EC_PORT_IP 0xd2
#ifdef CONFIG_ECONET_AUNUDP
-static spinlock_t aun_queue_lock;
+static DEFINE_SPINLOCK(aun_queue_lock);
static struct socket *udpsock;
#define AUN_PORT 0x8000
@@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
break;
default:
- return dev_ioctl(cmd, argp);
+ return -ENOIOCTLCMD;
}
/*NOTREACHED*/
return 0;
@@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
.family = PF_ECONET,
.owner = THIS_MODULE,
.release = econet_release,
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 03efaacbdb73..4cc6f41c6930 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
return 1;
}
- if ((is_multicast_ether_addr(hdr->addr1) ||
- is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt :
- ieee->host_decrypt) {
+ if (is_multicast_ether_addr(hdr->addr1)
+ ? ieee->host_mc_decrypt : ieee->host_decrypt) {
int idx = 0;
if (skb->len >= hdrlen + 3)
idx = skb->data[hdrlen + 3] >> 6;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e55136ae09f4..011cca7ae02b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -456,6 +456,14 @@ config TCP_CONG_BIC
increase provides TCP friendliness.
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+config TCP_CONG_CUBIC
+ tristate "CUBIC TCP"
+ default m
+ ---help---
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f0435d00db6b..c54edd76de09 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf249000..966a071a408c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
#include <linux/smp_lock.h>
#include <linux/inet.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -302,6 +303,7 @@ lookup_protocol:
sk->sk_reuse = 1;
inet = inet_sk(sk);
+ inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
@@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
err = devinet_ioctl(cmd, (void __user *)arg);
break;
default:
- if (!sk->sk_prot->ioctl ||
- (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
- -ENOIOCTLCMD)
- err = dev_ioctl(cmd, (void __user *)arg);
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
break;
}
return err;
}
-struct proto_ops inet_stream_ops = {
+const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = {
.sendpage = tcp_sendpage
};
-struct proto_ops inet_dgram_ops = {
+const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = {
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
* udp_poll
*/
-static struct proto_ops inet_sockraw_ops = {
+static const struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
@@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] =
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
- .flags = INET_PROTOSW_PERMANENT,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
},
{
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1ba..aed537fa2c88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
#include <net/icmp.h>
+#include <net/protocol.h>
#include <asm/scatterlist.h>
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d7..37432088fe6d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a2..7b9bb28e2ee9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
#endif
#include <linux/kmod.h>
+#include <net/arp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b7..73bfcae8af9c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <net/icmp.h>
+#include <net/protocol.h>
#include <net/udp.h>
/* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d687..18f5e509281a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb169..e2890ec8159e 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc1518..0dd4d06e456d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e3..ef4724de7350 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
#include <linux/netlink.h>
#include <linux/init.h>
+#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 705e3ce86df9..e320b32373e5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -41,6 +41,13 @@
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ * David S. Miller, <davem@davemloft.net>
+ * Stephen Hemminger <shemminger@osdl.org>
+ * Paul E. McKenney <paulmck@us.ibm.com>
+ * Patrick McHardy <kaber@trash.net>
*/
#define VERSION "0.404"
@@ -59,6 +66,7 @@
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d2..be5a519cd2f8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f01..34758118c10c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
+
+#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3fe021f1a566..ae20281d8deb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
*/
int sysctl_local_port_range[2] = { 1024, 4999 };
-static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
{
const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
struct sock *sk2;
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke
return node != NULL;
}
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
int inet_csk_get_port(struct inet_hashinfo *hashinfo,
- struct sock *sk, unsigned short snum)
+ struct sock *sk, unsigned short snum,
+ int (*bind_conflict)(const struct sock *sk,
+ const struct inet_bind_bucket *tb))
{
struct inet_bind_hashbucket *head;
struct hlist_node *node;
@@ -125,7 +130,7 @@ tb_found:
goto success;
} else {
ret = 1;
- if (inet_csk_bind_conflict(sk, tb))
+ if (bind_conflict(sk, tb))
goto fail_unlock;
}
}
@@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
EXPORT_SYMBOL_GPL(inet_csk_search_req);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- const unsigned timeout)
+ unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
@@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ const struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = inet->daddr;
+ sin->sin_port = inet->dport;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cfd..c49908192047 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
r->idiag_inode = 0;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
if (r->idiag_family == AF_INET6) {
- const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+ const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tcp6tw->tw_v6_rcv_saddr);
+ &tw6->tw_v6_rcv_saddr);
ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tcp6tw->tw_v6_daddr);
+ &tw6->tw_v6_daddr);
}
#endif
nlh->nlmsg_len = skb->tail - b;
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
if (r->idiag_family == AF_INET6) {
ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tcp6_rsk(req)->loc_addr);
+ &inet6_rsk(req)->loc_addr);
ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tcp6_rsk(req)->rmt_addr);
+ &inet6_rsk(req)->rmt_addr);
}
#endif
nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
entry.saddr =
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
(entry.family == AF_INET6) ?
- tcp6_rsk(req)->loc_addr.s6_addr32 :
+ inet6_rsk(req)->loc_addr.s6_addr32 :
#endif
&ireq->loc_addr;
entry.daddr =
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
(entry.family == AF_INET6) ?
- tcp6_rsk(req)->rmt_addr.s6_addr32 :
+ inet6_rsk(req)->rmt_addr.s6_addr32 :
#endif
&ireq->rmt_addr;
entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d2..33228115cda4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
#include <linux/config.h>
#include <linux/module.h>
+#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
+#include <net/ip.h>
/*
* Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+ u32 daddr = inet->rcv_saddr;
+ u32 saddr = inet->daddr;
+ int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
+ prefetch(head->chain.first);
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+ tw = inet_twsk(sk2);
+
+ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+ goto not_unique;
+ }
+ }
+ tw = NULL;
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+unique:
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity. */
+ inet->num = lport;
+ inet->sport = htons(lport);
+ sk->sk_hash = hash;
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp) {
+ *twp = tw;
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+ } else if (tw) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw, death_row);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ inet_twsk_put(tw);
+ }
+
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
+ inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int range = high - low;
+ int i;
+ int port;
+ static u32 hint;
+ u32 offset = hint + inet_sk_port_offset(sk);
+ struct hlist_node *node;
+ struct inet_timewait_sock *tw = NULL;
+
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+ head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (tb->port == port) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__inet_check_established(death_row,
+ sk, port,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->sport = htons(port);
+ __inet_hash(hinfo, sk, 0);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ inet_twsk_deschedule(tw, death_row);;
+ inet_twsk_put(tw);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ __inet_hash(hinfo, sk, 0);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = __inet_check_established(death_row, sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet_hash_connect);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a68811..417f126c749e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
{
- struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
- SLAB_ATOMIC);
+ struct inet_timewait_sock *tw =
+ kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ SLAB_ATOMIC);
if (tw != NULL) {
const struct inet_sock *inet = inet_sk(sk);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
return NULL;
n->v4daddr = daddr;
atomic_set(&n->refcnt, 1);
+ atomic_set(&n->rid, 0);
n->ip_id_count = secure_ip_id(daddr);
n->tcp_ts_stamp = 0;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
* Patrick McHardy : LRU queue of frag heads for evictor.
*/
+#include <linux/compiler.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -38,6 +39,7 @@
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
+#include <net/inetpeer.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
@@ -56,6 +58,8 @@
int sysctl_ipfrag_high_thresh = 256*1024;
int sysctl_ipfrag_low_thresh = 192*1024;
+int sysctl_ipfrag_max_dist = 64;
+
/* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
*/
@@ -89,8 +93,10 @@ struct ipq {
spinlock_t lock;
atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */
- int iif;
struct timeval stamp;
+ int iif;
+ unsigned int rid;
+ struct inet_peer *peer;
};
/* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
BUG_TRAP(qp->last_in&COMPLETE);
BUG_TRAP(del_timer(&qp->timer) == 0);
+ if (qp->peer)
+ inet_putpeer(qp->peer);
+
/* Release all fragment data. */
fp = qp->fragments;
while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
qp->meat = 0;
qp->fragments = NULL;
qp->iif = 0;
+ qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
/* Initialize a timer for this entry. */
init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
return ip_frag_create(hash, iph, user);
}
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+ struct inet_peer *peer = qp->peer;
+ unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int start, end;
+
+ int rc;
+
+ if (!peer || !max)
+ return 0;
+
+ start = qp->rid;
+ end = atomic_inc_return(&peer->rid);
+ qp->rid = end;
+
+ rc = qp->fragments && (end - start) > max;
+
+ if (rc) {
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ }
+
+ return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+ struct sk_buff *fp;
+
+ if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+ atomic_inc(&qp->refcnt);
+ return -ETIMEDOUT;
+ }
+
+ fp = qp->fragments;
+ do {
+ struct sk_buff *xp = fp->next;
+ frag_kfree_skb(fp, NULL);
+ fp = xp;
+ } while (fp);
+
+ qp->last_in = 0;
+ qp->len = 0;
+ qp->meat = 0;
+ qp->fragments = NULL;
+ qp->iif = 0;
+
+ return 0;
+}
+
/* Add new segment to existing queue. */
static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (qp->last_in & COMPLETE)
goto err;
+ if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+ ipq_kill(qp);
+ goto err;
+ }
+
offset = ntohs(skb->nh.iph->frag_off);
flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0d..e45846ae570b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b3..d3f6c468faf4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
+#include <net/route.h>
/*
* Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
hlen = iph->ihl * 4;
mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d87257309..6986e11d65cc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,12 +25,12 @@
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/tcp.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/netfilter.h>
@@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
err = ip_options_get_from_user(&opt, optval, optlen);
if (err)
break;
- if (sk->sk_type == SOCK_STREAM) {
- struct tcp_sock *tp = tcp_sk(sk);
+ if (inet->is_icsk) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (sk->sk_family == PF_INET ||
(!((1 << sk->sk_state) &
@@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
inet->daddr != LOOPBACK4_IPV6)) {
#endif
if (inet->opt)
- tp->ext_header_len -= inet->opt->optlen;
+ icsk->icsk_ext_hdr_len -= inet->opt->optlen;
if (opt)
- tp->ext_header_len += opt->optlen;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ icsk->icsk_ext_hdr_len += opt->optlen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
}
#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b40..d64e2ec8da7b 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
#include <net/xfrm.h>
#include <net/icmp.h>
#include <net/ipcomp.h>
+#include <net/protocol.h>
struct ipcomp_tfms {
struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d9..bb3613ec448c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
#include <linux/in.h>
#include <linux/if.h>
#include <linux/inet.h>
+#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
@@ -58,6 +59,7 @@
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
+#include <net/route.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c9..caa3b7d2e48a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d7eb680101c2..9b176a942ac5 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
}
-#if 0000
-/*
- * Get reference to app by name (called from user context)
- */
-struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
-{
- struct ip_vs_app *app, *a = NULL;
-
- down(&__ip_vs_app_mutex);
-
- list_for_each_entry(ent, &ip_vs_app_list, a_list) {
- if (strcmp(app->name, appname))
- continue;
-
- /* softirq may call ip_vs_app_get too, so the caller
- must disable softirq on the current CPU */
- if (ip_vs_app_get(app))
- a = app;
- break;
- }
-
- up(&__ip_vs_app_mutex);
-
- return a;
-}
-#endif
-
-
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c655..81d90354c928 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,10 @@
*
*/
+#include <linux/in.h>
+#include <linux/net.h>
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/seq_file.h>
@@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get
if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
- IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
out:
ct_read_unlock(hash);
- IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
ct_read_unlock(hash);
- IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
cp->flags |= atomic_read(&dest->conn_flags);
cp->dest = dest;
- IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
if (!dest)
return;
- IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_conn_hash(cp);
expire_later:
- IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+ IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
atomic_read(&cp->refcnt)-1,
atomic_read(&cp->n_control));
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1a0843cd58a9..1aca94a9fd8b 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
return NULL;
IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
- "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+ "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760e..c935c5086d33 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
#include <linux/netfilter_ipv4.h>
#include <net/ip.h>
+#include <net/route.h>
#include <net/sock.h>
#include <asm/uaccess.h>
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
out:
read_unlock(&__ip_vs_svc_lock);
- IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+ IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
NIPQUAD(vaddr), ntohs(vport),
svc?"hit":"not hit");
@@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
*/
list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
- "refcnt=%d\n",
+ "dest->refcnt=%d\n",
dest->vfwmark,
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
@@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
dest = ip_vs_trash_get_dest(svc, daddr, dport);
if (dest != NULL) {
IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
- "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+ "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
NIPQUAD(daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
@@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
atomic_dec(&dest->svc->refcnt);
kfree(dest);
} else {
- IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+ IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
+ "dest->refcnt=%d\n",
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
list_add(&dest->n_list, &ip_vs_dest_trash);
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce93..9fee19c4c617 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
*
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa1..e7004741ac73 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
* Changes:
*
*/
+#include <linux/config.h>
#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 561cda326fa8..6e5cb92a5c83 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
* me to write this module.
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
/* for sysctl */
#include <linux/fs.h>
@@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
}
-#if 0000
-/*
- * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
- * returns bool success.
- */
-static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
- struct ip_vs_lblc_entry *en)
-{
- if (list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- /*
- * Remove it from the table
- */
- write_lock(&tbl->lock);
- list_del(&en->list);
- INIT_LIST_HEAD(&en->list);
- write_unlock(&tbl->lock);
-
- return 1;
-}
-#endif
-
-
/*
* Get ip_vs_lblc_entry associated with supplied parameters.
*/
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index ce456dbf09a5..32ba37ba72d8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
*
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
/* for sysctl */
#include <linux/fs.h>
@@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
}
-#if 0000
-/*
- * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
- * returns bool success.
- */
-static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
- struct ip_vs_lblcr_entry *en)
-{
- if (list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
-
- /*
- * Remove it from the table
- */
- write_lock(&tbl->lock);
- list_del(&en->list);
- INIT_LIST_HEAD(&en->list);
- write_unlock(&tbl->lock);
-
- return 1;
-}
-#endif
-
-
/*
* Get ip_vs_lblcr_entry associated with supplied parameters.
*/
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd7..8b0505b09317 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
*
*/
+#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8e..c36ccf057a19 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
*
*/
+#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 0e878fd6215c..bc28b1160a3a 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_LAST] = 2*HZ,
};
-
-#if 0
-
-/* FIXME: This is going to die */
-
-static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
- [IP_VS_TCP_S_NONE] = 2*HZ,
- [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
- [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
- [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
- [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
- [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
- [IP_VS_TCP_S_CLOSE] = 10*HZ,
- [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
- [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
- [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
- [IP_VS_TCP_S_SYNACK] = 100*HZ,
- [IP_VS_TCP_S_LAST] = 2*HZ,
-};
-
-#endif
-
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = "NONE",
[IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
@@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
- "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+ "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
pp->name,
(state_off==TCP_DIR_OUTPUT)?"output ":"input ",
th->syn? 'S' : '.',
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aefa..89d9175d8f28 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
*
*/
+#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a39..7775e6cc68be 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
*
*/
+#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/skbuff.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d8062..1bca714bda3d 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h> /* for ip_mc_join_group */
+#include <linux/udp.h>
#include <net/ip.h>
#include <net/sock.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3c2e9639bba6..bba156304695 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -68,19 +68,14 @@ struct arpt_table_info {
unsigned int initial_entries;
unsigned int hook_entry[NF_ARP_NUMHOOKS];
unsigned int underflow[NF_ARP_NUMHOOKS];
- char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+ void *entries[NR_CPUS];
};
static LIST_HEAD(arpt_target);
static LIST_HEAD(arpt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
char *hdr_addr, int len)
{
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
outdev = out ? out->name : nulldevname;
read_lock_bh(&table->lock);
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private,
- smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
back = get_entry(table_base, table->private->underflow[hook]);
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp)
/* Figures out from what hook each rule can be called: returns 0 if
* there are loops. Puts hook bitmask in comefrom.
*/
-static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+static int mark_source_chains(struct arpt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct arpt_entry *e
- = (struct arpt_entry *)(newinfo->entries + pos);
+ = (struct arpt_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
goto next;
e = (struct arpt_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct arpt_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
newpos = pos + e->next_offset;
}
e = (struct arpt_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
static int translate_table(const char *name,
unsigned int valid_hooks,
struct arpt_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -710,11 +705,11 @@ static int translate_table(const char *name,
i = 0;
/* Walk through entries, checking offsets. */
- ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
@@ -743,29 +738,26 @@ static int translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks)) {
+ if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
duprintf("Looping hook\n");
return -ELOOP;
}
/* Finally, each sanity check must pass */
i = 0;
- ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ARPT_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
return 0;
}
+static inline int set_entry_to_counter(const struct arpt_entry *e,
+ struct arpt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void get_counters(const struct arpt_table_info *t,
struct arpt_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size,
struct arpt_entry *e;
struct arpt_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size,
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size,
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
struct arpt_entry_target *t;
- e = (struct arpt_entry *)(table->private->entries + off);
+ e = (struct arpt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct arpt_entry, counters),
&counters[num],
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries,
return ret;
}
+static void free_table_info(struct arpt_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct arpt_table_info *alloc_table_info(unsigned int size)
+{
+ struct arpt_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size,
+ cpu_to_node(cpu));
+
+ if (newinfo->entries[cpu] == NULL) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int do_replace(void __user *user, unsigned int len)
{
int ret;
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len)
struct arpt_table *t;
struct arpt_table_info *newinfo, *oldinfo;
struct arpt_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct arpt_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&arpt_mutex);
free_newinfo_counters_untrans:
- ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
+ ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len)
struct arpt_counters_info tmp, *paddc;
struct arpt_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- ARPT_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[smp_processor_id()];
+ ARPT_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table,
struct arpt_table_info *newinfo;
static struct arpt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo) {
ret = -ENOMEM;
return ret;
}
- memcpy(newinfo->entries, repl->entries, repl->size);
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
duprintf("arpt_register_table: translate table gives %d\n", ret);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&arpt_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table,
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void arpt_unregister_table(struct arpt_table *table)
{
+ void *loc_cpu_entry;
+
down(&arpt_mutex);
LIST_DELETE(&arpt_tables, table);
up(&arpt_mutex);
/* Decrease module usage counts and free resources */
- ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f5..0366eedb4d70 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
*
*/
+#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/ip.h>
#include <linux/moduleparam.h>
+#include <linux/udp.h>
#include <net/checksum.h>
#include <net/udp.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377a..57956dee60c8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/list.h>
+#include <linux/seq_file.h>
static DEFINE_RWLOCK(ip_ct_gre_lock);
#define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c7660..46becbe4fe58 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/seq_file.h>
#include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4b..a88bcc551244 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
#endif
#include <net/checksum.h>
#include <net/ip.h>
+#include <net/route.h>
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b47..4f95d477805c 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
*
*/
#include <linux/config.h>
+#include <linux/in.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -53,6 +54,7 @@
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/ip.h>
+#include <linux/udp.h>
#include <net/checksum.h>
#include <net/udp.h>
#include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8475e8..2a26d167e149 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
context stops packets coming through and allows user context to read
the counters or update the rules.
- To be cache friendly on SMP, we arrange them like so:
- [ n-entries ]
- ... cache-align padding ...
- [ n-entries ]
-
Hence the start of any table is given by get_table() below. */
/* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
unsigned int underflow[NF_IP_NUMHOOKS];
/* ipt_entry tables: one per CPU */
- char entries[0] ____cacheline_aligned;
+ void *entries[NR_CPUS];
};
static LIST_HEAD(ipt_target);
static LIST_HEAD(ipt_match);
static LIST_HEAD(ipt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
#if 0
#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private, smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ipt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct ipt_entry *e
- = (struct ipt_entry *)(newinfo->entries + pos);
+ = (struct ipt_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
goto next;
e = (struct ipt_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct ipt_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
newpos = pos + e->next_offset;
}
e = (struct ipt_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -855,6 +845,7 @@ static int
translate_table(const char *name,
unsigned int valid_hooks,
struct ipt_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
if (ret != 0)
return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks))
+ if (!mark_source_chains(newinfo, valid_hooks, entry0))
return -ELOOP;
/* Finally, each sanity check must pass */
i = 0;
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ IPT_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
#ifdef CONFIG_NETFILTER_DEBUG
{
- struct ipt_entry *table_base;
- unsigned int i;
+ int cpu;
- for_each_cpu(i) {
- table_base =
- (void *)newinfo->entries
- + TABLE_OFFSET(newinfo, i);
-
- table_base->comefrom = 0xdead57ac;
+ for_each_cpu(cpu) {
+ struct ipt_entry *table_base = newinfo->entries[cpu];
+ if (table_base)
+ table_base->comefrom = 0xdead57ac;
}
}
#endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
return 0;
}
+static inline int
+set_entry_to_counter(const struct ipt_entry *e,
+ struct ipt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void
get_counters(const struct ipt_table_info *t,
struct ipt_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ IPT_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
struct ipt_entry *e;
struct ipt_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct ipt_counters) * table->private->number;
- counters = vmalloc(countersize);
+ counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
struct ipt_entry_match *m;
struct ipt_entry_target *t;
- e = (struct ipt_entry *)(table->private->entries + off);
+ e = (struct ipt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
return ret;
}
+static void free_table_info(struct ipt_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+ struct ipt_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
+ if (newinfo->entries[cpu] == 0) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int
do_replace(void __user *user, unsigned int len)
{
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
struct ipt_table *t;
struct ipt_table_info *newinfo, *oldinfo;
struct ipt_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct ipt_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&ipt_mutex);
free_newinfo_counters_untrans:
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
struct ipt_counters_info tmp, *paddc;
struct ipt_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
return -EINVAL;
- paddc = vmalloc(len);
+ paddc = vmalloc_node(len, numa_node_id());
if (!paddc)
return -ENOMEM;
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- IPT_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
struct ipt_table_info *newinfo;
static struct ipt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- memcpy(newinfo->entries, repl->entries, repl->size);
+ /* choose the copy on our node/cpu
+ * but dont care of preemption
+ */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&ipt_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void ipt_unregister_table(struct ipt_table *table)
{
+ void *loc_cpu_entry;
+
down(&ipt_mutex);
LIST_DELETE(&ipt_tables, table);
up(&ipt_mutex);
/* Decrease module usage counts and free resources */
- IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe6..27860510ca6d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
#include <linux/config.h>
#include <linux/types.h>
+#include <linux/inetdevice.h>
#include <linux/ip.h>
#include <linux/timer.h>
#include <linux/module.h>
@@ -18,6 +19,7 @@
#include <net/protocol.h>
#include <net/ip.h>
#include <net/checksum.h>
+#include <net/route.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_nat_rule.h>
#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041fc..03f554857a4d 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
*/
#include <linux/module.h>
+#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4/ipt_physdev.h>
#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db46..39d49dc333a7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
+#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/sock.h>
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a1..e20be3331f67 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
- child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
if (child)
inet_csk_reqsk_queue_add(sk, req, child);
else
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..16984d4a8a06 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
#include <linux/sysctl.h>
#include <linux/config.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -22,6 +23,7 @@
extern int sysctl_ip_nonlocal_bind;
#ifdef CONFIG_SYSCTL
+static int zero;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -614,6 +616,15 @@ ctl_table ipv4_table[] = {
.strategy = &sysctl_jiffies
},
{
+ .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
+ .procname = "ipfrag_max_dist",
+ .data = &sysctl_ipfrag_max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero
+ },
+ {
.ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56d..00aa80e93243 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
int err = 0;
if (level != SOL_TCP)
- return tp->af_specific->setsockopt(sk, level, optname,
- optval, optlen);
+ return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+ optval, optlen);
/* This is a string value all the others are int's */
if (optname == TCP_CONGESTION) {
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
- info->tcpi_pmtu = tp->pmtu_cookie;
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int val, len;
if (level != SOL_TCP)
- return tp->af_specific->getsockopt(sk, level, optname,
- optval, optlen);
+ return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+ optval, optlen);
if (get_user(len, optlen))
return -EFAULT;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 1d0cd86621b1..035f2092d73a 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -30,8 +30,6 @@ static int fast_convergence = 1;
static int max_increment = 16;
static int low_window = 14;
static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
-static int low_utilization_threshold = 153;
-static int low_utilization_period = 2;
static int initial_ssthresh = 100;
static int smooth_part = 20;
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644);
MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
-module_param(low_utilization_threshold, int, 0644);
-MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
-module_param(low_utilization_period, int, 0644);
-MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(smooth_part, int, 0644);
@@ -60,11 +54,6 @@ struct bictcp {
u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
- u32 delay_min; /* min delay */
- u32 delay_max; /* max delay */
- u32 last_delay;
- u8 low_utilization;/* 0: high; 1: low */
- u32 low_utilization_start; /* starting time of low utilization detection*/
u32 epoch_start; /* beginning of an epoch */
#define ACK_RATIO_SHIFT 4
u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
- ca->delay_min = 0;
- ca->delay_max = 0;
- ca->last_delay = 0;
- ca->low_utilization = 0;
- ca->low_utilization_start = 0;
ca->epoch_start = 0;
ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
}
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
/* if in slow start or link utilization is very low */
- if ( ca->loss_cwnd == 0 ||
- (cwnd > ca->loss_cwnd && ca->low_utilization)) {
+ if (ca->loss_cwnd == 0) {
if (ca->cnt > 20) /* increase cwnd 5% per RTT */
ca->cnt = 20;
}
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-
-/* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct sock *sk, int flag)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- struct bictcp *ca = inet_csk_ca(sk);
- u32 dist, delay;
-
- /* No time stamp */
- if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
- /* Discard delay samples right after fast recovery */
- tcp_time_stamp < ca->epoch_start + HZ ||
- /* this delay samples may not be accurate */
- flag == 0) {
- ca->last_delay = 0;
- goto notlow;
- }
-
- delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
- ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- if (delay == 0) /* no previous delay sample */
- goto notlow;
-
- /* first time call or link delay decreases */
- if (ca->delay_min == 0 || ca->delay_min > delay) {
- ca->delay_min = ca->delay_max = delay;
- goto notlow;
- }
-
- if (ca->delay_max < delay)
- ca->delay_max = delay;
-
- /* utilization is low, if avg delay < dist*threshold
- for checking_period time */
- dist = ca->delay_max - ca->delay_min;
- if (dist <= ca->delay_min>>6 ||
- tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
- goto notlow;
-
- if (ca->low_utilization_start == 0) {
- ca->low_utilization = 0;
- ca->low_utilization_start = tcp_time_stamp;
- } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
- > low_utilization_period*HZ) {
- ca->low_utilization = 1;
- }
-
- return;
-
- notlow:
- ca->low_utilization = 0;
- ca->low_utilization_start = 0;
-
-}
-
static void bictcp_cong_avoid(struct sock *sk, u32 ack,
u32 seq_rtt, u32 in_flight, int data_acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- bictcp_low_utilization(sk, data_acked);
-
if (!tcp_is_cwnd_limited(sk, in_flight))
return;
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
- /* in case of wrong delay_max*/
- if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
- ca->delay_max = ca->delay_min
- + ((ca->delay_max - ca->delay_min)* 90) / 100;
-
/* Wmax and fast convergence */
if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
@@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state)
bictcp_reset(inet_csk_ca(sk));
}
-/* Track delayed acknowledgement ratio using sliding window
+/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
static void bictcp_acked(struct sock *sk, u32 cnt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c7cc62c8dc12..e688c687d62d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
+
+/*
+ * Linear increase during slow start
+ */
+void tcp_slow_start(struct tcp_sock *tp)
+{
+ if (sysctl_tcp_abc) {
+ /* RFC3465: Slow Start
+ * TCP sender SHOULD increase cwnd by the number of
+ * previously unacknowledged bytes ACKed by each incoming
+ * acknowledgment, provided the increase is not more than L
+ */
+ if (tp->bytes_acked < tp->mss_cache)
+ return;
+
+ /* We MAY increase by 2 if discovered delayed ack */
+ if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ }
+ }
+ tp->bytes_acked = 0;
+
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000000..31a4986dfbf7
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,411 @@
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
+ *
+ * This is from the implementation of CUBIC TCP in
+ * Injong Rhee, Lisong Xu.
+ * "CUBIC: A New TCP-Friendly High-Speed TCP Variant
+ * in PFLDnet 2005
+ * Available from:
+ * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <asm/div64.h>
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_B 4 /*
+ * In binary search,
+ * go to point (max+min)/N
+ */
+#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
+
+static int fast_convergence = 1;
+static int max_increment = 16;
+static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh = 100;
+static int bic_scale = 41;
+static int tcp_friendliness = 1;
+
+static u32 cube_rtt_scale;
+static u32 beta_scale;
+static u64 cube_factor;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(beta, int, 0444);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+
+#include <asm/div64.h>
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 bic_origin_point;/* origin point of bic function */
+ u32 bic_K; /* time to origin point from the beginning of the current epoch */
+ u32 delay_min; /* min delay */
+ u32 epoch_start; /* beginning of an epoch */
+ u32 ack_cnt; /* number of acks */
+ u32 tcp_cwnd; /* estimated tcp cwnd */
+#define ACK_RATIO_SHIFT 4
+ u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->loss_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->bic_origin_point = 0;
+ ca->bic_K = 0;
+ ca->delay_min = 0;
+ ca->epoch_start = 0;
+ ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+ ca->ack_cnt = 0;
+ ca->tcp_cwnd = 0;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ bictcp_reset(inet_csk_ca(sk));
+ if (initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+ u_int32_t d = divisor;
+
+ if (divisor > 0xffffffffULL) {
+ unsigned int shift = fls(divisor >> 32);
+
+ d = divisor >> shift;
+ dividend >>= shift;
+ }
+
+ /* avoid 64 bit division if possible */
+ if (dividend >> 32)
+ do_div(dividend, d);
+ else
+ dividend = (uint32_t) dividend / d;
+
+ return dividend;
+}
+
+/*
+ * calculate the cubic root of x using Newton-Raphson
+ */
+static u32 cubic_root(u64 a)
+{
+ u32 x, x1;
+
+ /* Initial estimate is based on:
+ * cbrt(x) = exp(log(x) / 3)
+ */
+ x = 1u << (fls64(a)/3);
+
+ /*
+ * Iteration based on:
+ * 2
+ * x = ( 2 * x + a / x ) / 3
+ * k+1 k k
+ */
+ do {
+ x1 = x;
+ x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
+ } while (abs(x1 - x) > 1);
+
+ return x;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+ u64 offs;
+ u32 delta, t, bic_target, min_cnt, max_cnt;
+
+ ca->ack_cnt++; /* count the number of ACKs */
+
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) {
+ ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
+ ca->ack_cnt = 1; /* start counting */
+ ca->tcp_cwnd = cwnd; /* syn with cubic */
+
+ if (ca->last_max_cwnd <= cwnd) {
+ ca->bic_K = 0;
+ ca->bic_origin_point = cwnd;
+ } else {
+ /* Compute new K based on
+ * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+ */
+ ca->bic_K = cubic_root(cube_factor
+ * (ca->last_max_cwnd - cwnd));
+ ca->bic_origin_point = ca->last_max_cwnd;
+ }
+ }
+
+ /* cubic function - calc*/
+ /* calculate c * time^3 / rtt,
+ * while considering overflow in calculation of time^3
+ * (so time^3 is done by using 64 bit)
+ * and without the support of division of 64bit numbers
+ * (so all divisions are done by using 32 bit)
+ * also NOTE the unit of those veriables
+ * time = (t - K) / 2^bictcp_HZ
+ * c = bic_scale >> 10
+ * rtt = (srtt >> 3) / HZ
+ * !!! The following code does not have overflow problems,
+ * if the cwnd < 1 million packets !!!
+ */
+
+ /* change the unit from HZ to bictcp_HZ */
+ t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
+ << BICTCP_HZ) / HZ;
+
+ if (t < ca->bic_K) /* t - K */
+ offs = ca->bic_K - t;
+ else
+ offs = t - ca->bic_K;
+
+ /* c/rtt * (t-K)^3 */
+ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+ if (t < ca->bic_K) /* below origin*/
+ bic_target = ca->bic_origin_point - delta;
+ else /* above origin*/
+ bic_target = ca->bic_origin_point + delta;
+
+ /* cubic function - calc bictcp_cnt*/
+ if (bic_target > cwnd) {
+ ca->cnt = cwnd / (bic_target - cwnd);
+ } else {
+ ca->cnt = 100 * cwnd; /* very small increment*/
+ }
+
+ if (ca->delay_min > 0) {
+ /* max increment = Smax * rtt / 0.1 */
+ min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
+ if (ca->cnt < min_cnt)
+ ca->cnt = min_cnt;
+ }
+
+ /* slow start and low utilization */
+ if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
+ ca->cnt = 50;
+
+ /* TCP Friendly */
+ if (tcp_friendliness) {
+ u32 scale = beta_scale;
+ delta = (cwnd * scale) >> 3;
+ while (ca->ack_cnt > delta) { /* update tcp cwnd */
+ ca->ack_cnt -= delta;
+ ca->tcp_cwnd++;
+ }
+
+ if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
+ delta = ca->tcp_cwnd - cwnd;
+ max_cnt = cwnd / delta;
+ if (ca->cnt > max_cnt)
+ ca->cnt = max_cnt;
+ }
+ }
+
+ ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+ if (ca->cnt == 0) /* cannot be zero */
+ ca->cnt = 1;
+}
+
+
+/* Keep track of minimum rtt */
+static inline void measure_delay(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 delay;
+
+ /* No time stamp */
+ if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
+ /* Discard delay samples right after fast recovery */
+ (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+ return;
+
+ delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ if (delay == 0)
+ delay = 1;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay)
+ ca->delay_min = delay;
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
+ u32 seq_rtt, u32 in_flight, int data_acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (data_acked)
+ measure_delay(sk);
+
+ if (!tcp_is_cwnd_limited(sk, in_flight))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
+ bictcp_update(ca, tp->snd_cwnd);
+
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+ */
+ if (tp->snd_cwnd_cnt >= ca->cnt) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+
+}
+
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
+}
+
+static u32 bictcp_min_cwnd(struct sock *sk)
+{
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss)
+ bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+ ca->delayed_ack += cnt;
+ }
+}
+
+
+static struct tcp_congestion_ops cubictcp = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .min_cwnd = bictcp_min_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "cubic",
+};
+
+static int __init cubictcp_register(void)
+{
+ BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+ /* Precompute a bunch of the scaling factors that are used per-packet
+ * based on SRTT of 100ms
+ */
+
+ beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+
+ cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */
+
+ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+ * so K = cubic_root( (wmax-cwnd)*rtt/c )
+ * the unit of K is bictcp_HZ=2^10, not HZ
+ *
+ * c = bic_scale >> 10
+ * rtt = 100ms
+ *
+ * the following code has been designed and tested for
+ * cwnd < 1 million packets
+ * RTT < 100 seconds
+ * HZ < 1,000,00 (corresponding to 10 nano-second)
+ */
+
+ /* 1/c * 2^2*bictcp_HZ * srtt */
+ cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+
+ /* divide by bic_scale and by constant Srtt (100ms) */
+ do_div(cube_factor, bic_scale * 10);
+
+ return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&cubictcp);
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bce..0a461232329f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1;
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
-static inline void tcp_measure_rcv_mss(struct sock *sk,
- const struct sk_buff *skb)
+static void tcp_measure_rcv_mss(struct sock *sk,
+ const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const unsigned int lss = icsk->icsk_ack.last_seg_size;
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
return 0;
}
-static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
{
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is an our guess about MSS used by the peer.
+ * We haven't any direct information about the MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACKing less frequently than needed.
+ * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+ hint = min(hint, tp->rcv_wnd/2);
+ hint = min(hint, TCP_MIN_RCVMSS);
+ hint = max(hint, TCP_MIN_MSS);
+
+ inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
+/* Set slow start threshold and cwnd not falling to slow start */
+void tcp_enter_cwr(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->prior_ssthresh = 0;
+ tp->bytes_acked = 0;
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ tp->undo_marker = 0;
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp) + 1U);
+ tp->snd_cwnd_cnt = 0;
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ TCP_ECN_queue_cwr(tp);
+
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ }
+}
+
/* Initialize metrics on socket. */
static void tcp_init_metrics(struct sock *sk)
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
-static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
- u32 in_flight, int good)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+ u32 in_flight, int good)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
* RFC2988 recommends to restart timer to now+rto.
*/
-static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return acked;
}
-static inline u32 tcp_usrtt(const struct sk_buff *skb)
+static u32 tcp_usrtt(const struct sk_buff *skb)
{
struct timeval tv, now;
@@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
if (nwin > tp->max_window) {
tp->max_window = nwin;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp)
+static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
+ struct tcp_sock *tp)
{
if (th->doff == sizeof(struct tcphdr)>>2) {
tp->rx_opt.saw_tstamp = 0;
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
}
}
-static __inline__ int
-tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
return 0;
}
-static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
{
if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
if (before(seq, tp->rcv_nxt))
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
}
}
-static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
{
if (!tp->rx_opt.dsack)
tcp_dsack_set(tp, seq, end_seq);
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
}
}
-static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
{
__u32 tmp;
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
+static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
{
/* If the user specified a specific send buffer setting, do
* not modify it.
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk)
sk->sk_write_space(sk);
}
-static inline void tcp_check_space(struct sock *sk)
+static void tcp_check_space(struct sock *sk)
{
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk)
}
}
-static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
+static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
{
tcp_push_pending_frames(sk, tp);
tcp_check_space(sk);
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
}
}
-static __inline__ void tcp_ack_snd_check(struct sock *sk)
+static inline void tcp_ack_snd_check(struct sock *sk)
{
if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
return result;
}
-static __inline__ int
-tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
{
return skb->ip_summed != CHECKSUM_UNNECESSARY &&
__tcp_checksum_complete_user(sk, skb);
@@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int saved_clamp = tp->rx_opt.mss_clamp;
tcp_parse_options(skb, &tp->rx_opt, 0);
if (th->ack) {
- struct inet_connection_sock *icsk;
/* rfc793:
* "If the state is SYN-SENT then
* first check the ACK bit
@@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
tp->rx_opt.sack_ok |= 2;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
/* Remember, tcp_poll() does not lock socket!
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_set_state(sk, TCP_ESTABLISHED);
/* Make sure socket is routed, for correct metrics. */
- tp->af_specific->rebuild_header(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
@@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
sk_wake_async(sk, 0, POLL_OUT);
}
- icsk = inet_csk(sk);
-
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
icsk->icsk_ack.pingpong) {
@@ -4173,7 +4210,7 @@ discard:
if (tp->ecn_flags&TCP_ECN_OK)
sock_set_flag(sk, SOCK_NO_LARGESEND);
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
if(th->syn) {
- if(tp->af_specific->conn_request(sk, skb) < 0)
+ if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
@@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Make sure socket is routed, for
* correct metrics.
*/
- tp->af_specific->rebuild_header(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
@@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc);
EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929b..e9f83e5b28ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <linux/inet.h>
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency;
/* Socket used for sending RSTs */
static struct socket *tcp_socket;
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
- struct sk_buff *skb);
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.lhash_lock = RW_LOCK_UNLOCKED,
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
- return inet_csk_get_port(&tcp_hashinfo, sk, snum);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+ inet_csk_bind_conflict);
}
static void tcp_v4_hash(struct sock *sk)
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
skb->h.th->source);
}
-/* called with local bh disabled */
-static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
- struct inet_timewait_sock **twp)
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
- struct inet_sock *inet = inet_sk(sk);
- u32 daddr = inet->rcv_saddr;
- u32 saddr = inet->daddr;
- int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
- struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* With PAWS, it is safe from the viewpoint
- of data integrity. Even without PAWS it
- is safe provided sequence spaces do not
- overlap i.e. at data rates <= 80Mbit/sec.
-
- Actually, the idea is close to VJ's one,
- only timestamp cache is held not per host,
- but per port pair and TW bucket is used
- as state holder.
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+ struct tcp_sock *tp = tcp_sk(sk);
- If TW bucket has been already destroyed we
- fall back to VJ's scheme and use initial
- timestamp retrieved from peer table.
- */
- if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
- xtime.tv_sec -
- tcptw->tw_ts_recent_stamp > 1))) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- sock_hold(sk2);
- goto unique;
- } else
- goto not_unique;
- }
- }
- tw = NULL;
+ /* With PAWS, it is safe from the viewpoint
+ of data integrity. Even without PAWS it is safe provided sequence
+ spaces do not overlap i.e. at data rates <= 80Mbit/sec.
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
- goto not_unique;
- }
+ Actually, the idea is close to VJ's one, only timestamp cache is
+ held not per host, but per port pair and TW bucket is used as state
+ holder.
-unique:
- /* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
- sk->sk_hash = hash;
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
-
- if (twp) {
- *twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw) {
- /* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, &tcp_death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
+ If TW bucket has been already destroyed we fall back to VJ's scheme
+ and use initial timestamp retrieved from peer table.
+ */
+ if (tcptw->tw_ts_recent_stamp &&
+ (twp == NULL || (sysctl_tcp_tw_reuse &&
+ xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ sock_hold(sktw);
+ return 1;
}
return 0;
-
-not_unique:
- write_unlock(&head->lock);
- return -EADDRNOTAVAIL;
}
-static inline u32 connect_port_offset(const struct sock *sk)
-{
- const struct inet_sock *inet = inet_sk(sk);
-
- return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
- inet->dport);
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static inline int tcp_v4_hash_connect(struct sock *sk)
-{
- const unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int range = high - low;
- int i;
- int port;
- static u32 hint;
- u32 offset = hint + connect_port_offset(sk);
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
- head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__tcp_v4_check_established(sk,
- port,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- }
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- hint += i;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __inet_hash(&tcp_hashinfo, sk, 0);
- }
- spin_unlock(&head->lock);
-
- if (tw) {
- inet_twsk_deschedule(tw, &tcp_death_row);;
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash(&tcp_hashinfo, sk, 0);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __tcp_v4_check_established(sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->dport = usin->sin_port;
inet->daddr = daddr;
- tp->ext_header_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
- tp->ext_header_len = inet->opt->optlen;
+ inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = 536;
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
- err = tcp_v4_hash_connect(sk);
+ err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure;
@@ -433,12 +270,10 @@ failure:
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
-static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
- u32 mtu)
+static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
* send out by Linux are always <576bytes so they should go through
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- tp->pmtu_cookie > mtu) {
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
/* Resend the TCP packet because it's
@@ -644,10 +479,10 @@ out:
}
/* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
- struct sk_buff *skb)
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
struct inet_sock *inet = inet_sk(sk);
+ struct tcphdr *th = skb->h.th;
if (skb->ip_summed == CHECKSUM_HW) {
th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-static inline void syn_flood_warning(struct sk_buff *skb)
+#ifdef CONFIG_SYN_COOKIES
+static void syn_flood_warning(struct sk_buff *skb)
{
static unsigned long warntime;
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb)
ntohs(skb->h.th->dest));
}
}
+#endif
/*
* Save and compile IPv4 options into the request_sock if needed.
*/
-static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
- struct sk_buff *skb)
+static struct ip_options *tcp_v4_save_options(struct sock *sk,
+ struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
struct ip_options *dopt = NULL;
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = {
.send_reset = tcp_v4_send_reset,
};
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+};
+
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = skb->nh.iph->ttl;
- newtp->ext_header_len = 0;
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newinet->opt)
- newtp->ext_header_len = newinet->opt->optlen;
+ inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
newinet->id = newtp->write_seq ^ jiffies;
tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1314,16 +1156,6 @@ do_time_wait:
goto discard_it;
}
-static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
- struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
-
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->daddr;
- sin->sin_port = inet->dport;
-}
-
/* VJ's idea. Save last timestamp seen from this destination
* and hold it at least for normal timewait interval to use for duplicate
* segment detection in subsequent connections, before they enter synchronized
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
return 0;
}
-struct tcp_func ipv4_specific = {
+struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = {
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
- .addr2sockaddr = v4_addr2sockaddr,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
};
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk)
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
- tp->af_specific = &ipv4_specific;
+ icsk->icsk_af_ops = &ipv4_specific;
+ icsk->icsk_sync_mss = tcp_sync_mss;
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
- .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
};
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac4321..2b9b7f6c7f7c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
int recycle_ok = 0;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
- recycle_ok = tp->af_specific->remember_stamp(sk);
+ recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
tw = inet_twsk_alloc(sk, state);
if (tw != NULL) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
- const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+ struct inet6_timewait_sock *tw6;
- ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+ tw6 = inet6_twsk((struct sock *)tw);
+ ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
struct request_sock **prev)
{
struct tcphdr *th = skb->h.th;
- struct tcp_sock *tp = tcp_sk(sk);
u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
struct tcp_options_received tmp_opt;
@@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
- child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
+ req, NULL);
if (child == NULL)
goto listen_overflow;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406a..a7623ead39a8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1;
*/
int sysctl_tcp_tso_win_divisor = 3;
-static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static void update_send_head(struct sock *sk, struct tcp_sock *tp,
+ struct sk_buff *skb)
{
sk->sk_send_head = skb->next;
if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
tp->snd_cwnd_used = 0;
}
-static inline void tcp_event_data_sent(struct tcp_sock *tp,
- struct sk_buff *skb, struct sock *sk)
+static void tcp_event_data_sent(struct tcp_sock *tp,
+ struct sk_buff *skb, struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
icsk->icsk_ack.pingpong = 1;
}
-static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
-static __inline__ u16 tcp_select_window(struct sock *sk)
+static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 cur_win = tcp_receive_window(tp);
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
return new_win;
}
+static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
+ __u32 tstamp)
+{
+ if (tp->rx_opt.tstamp_ok) {
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ *ptr++ = htonl(tstamp);
+ *ptr++ = htonl(tp->rx_opt.ts_recent);
+ }
+ if (tp->rx_opt.eff_sacks) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
+ int this_sack;
+
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+ TCPOLEN_SACK_PERBLOCK)));
+ for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+ if (tp->rx_opt.dsack) {
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.eff_sacks--;
+ }
+ }
+}
+
+/* Construct a tcp options header for a SYN or SYN_ACK packet.
+ * If this is every changed make sure to change the definition of
+ * MAX_SYN_SIZE to match the new maximum number of options that you
+ * can generate.
+ */
+static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
+ int offer_wscale, int wscale, __u32 tstamp,
+ __u32 ts_recent)
+{
+ /* We always get an MSS option.
+ * The option bytes which will be seen in normal data
+ * packets should timestamps be used, must be in the MSS
+ * advertised. But we subtract them from tp->mss_cache so
+ * that calculations in tcp_sendmsg are simpler etc.
+ * So account for this fact here if necessary. If we
+ * don't do this correctly, as a receiver we won't
+ * recognize data packets as being full sized when we
+ * should, and thus we won't abide by the delayed ACK
+ * rules correctly.
+ * SACKs don't matter, we never delay an ACK when we
+ * have any of those going out.
+ */
+ *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
+ if (ts) {
+ if(sack)
+ *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ else
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ *ptr++ = htonl(tstamp); /* TSVAL */
+ *ptr++ = htonl(ts_recent); /* TSECR */
+ } else if(sack)
+ *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
+ if (offer_wscale)
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
+}
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
@@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ECN_send(sk, tp, skb, tcp_header_size);
}
- tp->af_specific->send_check(sk, th, skb->len, skb);
+ icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (likely(tcb->flags & TCPCB_FLAG_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_INC_STATS(TCP_MIB_OUTSEGS);
- err = tp->af_specific->queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(skb, 0);
if (unlikely(err <= 0))
return err;
@@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
- tp->pmtu_cookie is last pmtu, seen by this function.
+ inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
tp->mss_cache is current effective sending mss, including
all tcp options except for SACKs. It is evaluated,
@@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
NOTE1. rfc1122 clearly states that advertised MSS
DOES NOT include either tcp or ip options.
- NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
- this function. --ANK (980731)
+ NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+ are READ ONLY outside this function. --ANK (980731)
*/
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mss_now;
-
+ struct inet_connection_sock *icsk = inet_csk(sk);
/* Calculate base mss without TCP options:
It is MMS_S - sizeof(tcphdr) of rfc1122
*/
- mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+ int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
+ sizeof(struct tcphdr));
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
/* Now subtract optional transport overhead */
- mss_now -= tp->ext_header_len;
+ mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
@@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
/* And store cached results */
- tp->pmtu_cookie = pmtu;
+ icsk->icsk_pmtu_cookie = pmtu;
tp->mss_cache = mss_now;
return mss_now;
@@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
if (dst) {
u32 mtu = dst_mtu(dst);
- if (mtu != tp->pmtu_cookie)
+ if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
@@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
xmit_size_goal = mss_now;
if (doing_tso) {
- xmit_size_goal = 65535 -
- tp->af_specific->net_header_len -
- tp->ext_header_len - tp->tcp_header_len;
+ xmit_size_goal = (65535 -
+ inet_csk(sk)->icsk_af_ops->net_header_len -
+ inet_csk(sk)->icsk_ext_hdr_len -
+ tp->tcp_header_len);
if (tp->max_window &&
(xmit_size_goal > (tp->max_window >> 1)))
@@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
/* Congestion window validation. (RFC2861) */
-static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
{
__u32 packets_out = tp->packets_out;
@@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
@@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
(sysctl_tcp_retrans_collapse != 0))
tcp_retrans_try_collapse(sk, skb, cur_mss);
- if(tp->af_specific->rebuild_header(sk))
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
/* Some Solaris stacks overoptimize and ignore the FIN on a
@@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13e7e6e8df16..3b7403495052 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
vegas->cntRTT = 0;
vegas->minRTT = 0x7fffffff;
}
+ /* Use normal slow start */
+ else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195d..223abaa72bc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
+#include <linux/igmp.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
@@ -846,20 +847,7 @@ out:
csum_copy_err:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
- /* Clear queue. */
- if (flags&MSG_PEEK) {
- int clear = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- clear = 1;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- if (clear)
- kfree_skb(skb);
- }
-
- skb_free_datagram(sk, skb);
+ skb_kill_datagram(sk, skb, flags);
if (noblock)
return -EAGAIN;
@@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
* Otherwise, csum completion requires chacksumming packet body,
* including udp header and folding it to skb->csum.
*/
-static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
unsigned short ulen, u32 saddr, u32 daddr)
{
if (uh->check == 0) {
@@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
/* Probably, we should checksum udp header (it should be in cache
* in any case) and data in tiny packets (< rx copybreak).
*/
- return 0;
}
/*
@@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb)
if (pskb_trim_rcsum(skb, ulen))
goto short_packet;
- if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
- goto csum_error;
+ udp_checksum_init(skb, uh, ulen, saddr, daddr);
if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6460eec834b7..9601fd7f9d66 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
- ip6_flowlabel.o ipv6_syms.o netfilter.o
+ ip6_flowlabel.o ipv6_syms.o netfilter.o \
+ inet6_connection_sock.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
xfrm6_output.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a60585fd85ad..704fb73e6c5f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
{
const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
- const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
+ const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
int sk_ipv6only = ipv6_only_sock(sk);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d9546380fa04..68afc53be662 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,6 +167,7 @@ lookup_protocol:
sk->sk_reuse = 1;
inet = inet_sk(sk);
+ inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
@@ -389,6 +390,8 @@ int inet6_destroy_sock(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+
/*
* This does both peername and sockname.
*/
@@ -431,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
- int err = -EINVAL;
switch(cmd)
{
@@ -450,16 +452,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCSIFDSTADDR:
return addrconf_set_dstaddr((void __user *) arg);
default:
- if (!sk->sk_prot->ioctl ||
- (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD)
- return(dev_ioctl(cmd,(void __user *) arg));
- return err;
+ if (!sk->sk_prot->ioctl)
+ return -ENOIOCTLCMD;
+ return sk->sk_prot->ioctl(sk, cmd, arg);
}
/*NOTREACHED*/
return(0);
}
-struct proto_ops inet6_stream_ops = {
+const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -480,7 +481,7 @@ struct proto_ops inet6_stream_ops = {
.sendpage = tcp_sendpage
};
-struct proto_ops inet6_dgram_ops = {
+const struct proto_ops inet6_dgram_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -508,7 +509,7 @@ static struct net_proto_family inet6_family_ops = {
};
/* Same as inet6_dgram_ops, sans udp_poll. */
-static struct proto_ops inet6_sockraw_ops = {
+static const struct proto_ops inet6_sockraw_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
@@ -609,6 +610,79 @@ inet6_unregister_protosw(struct inet_protosw *p)
}
}
+int inet6_sk_rebuild_header(struct sock *sk)
+{
+ int err;
+ struct dst_entry *dst;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.fl6_flowlabel = np->flow_label;
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet->dport;
+ fl.fl_ip_sport = inet->sport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ err = ip6_dst_lookup(sk, &dst, &fl);
+ if (err) {
+ sk->sk_route_caps = 0;
+ return err;
+ }
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_err_soft = -err;
+ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
+
+int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
+ if (np->rxopt.all) {
+ if ((opt->hop && (np->rxopt.bits.hopopts ||
+ np->rxopt.bits.ohopopts)) ||
+ ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) &&
+ np->rxopt.bits.rxflow) ||
+ (opt->srcrt && (np->rxopt.bits.srcrt ||
+ np->rxopt.bits.osrcrt)) ||
+ ((opt->dst1 || opt->dst0) &&
+ (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
+ return 1;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
+
int
snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
{
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f3629730eb15..13cc7f895583 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <net/icmp.h>
#include <net/ipv6.h>
+#include <net/protocol.h>
#include <net/xfrm.h>
#include <asm/scatterlist.h>
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8bfbe9970793..6de8ee1a5ad9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
#include <linux/random.h>
#include <net/icmp.h>
#include <net/ipv6.h>
+#include <net/protocol.h>
#include <linux/icmpv6.h>
static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index be6faf311387..113374dc342c 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
return opt;
}
+EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
+
/**********************************
Hop-by-hop options.
**********************************/
@@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
return opt2;
}
+EXPORT_SYMBOL_GPL(ipv6_dup_options);
+
static int ipv6_renew_option(void *ohdr,
struct ipv6_opt_hdr __user *newopt, int newoptlen,
int inherit,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 000000000000..792f90f0f9ec
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,199 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET6 connection oriented protocols.
+ *
+ * Authors: See the TCPv6 sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+
+#include <net/addrconf.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/inet_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/sock.h>
+
+int inet6_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
+{
+ const struct sock *sk2;
+ const struct hlist_node *node;
+
+ /* We must walk the whole port owner list in this case. -DaveM */
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+ (!sk->sk_reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ ipv6_rcv_saddr_equal(sk, sk2))
+ break;
+ }
+
+ return node != NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
+
+/*
+ * request_sock (formerly open request) hash tables.
+ */
+static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport,
+ const u32 rnd, const u16 synq_hsize)
+{
+ u32 a = raddr->s6_addr32[0];
+ u32 b = raddr->s6_addr32[1];
+ u32 c = raddr->s6_addr32[2];
+
+ a += JHASH_GOLDEN_RATIO;
+ b += JHASH_GOLDEN_RATIO;
+ c += rnd;
+ __jhash_mix(a, b, c);
+
+ a += raddr->s6_addr32[3];
+ b += (u32)rport;
+ __jhash_mix(a, b, c);
+
+ return c & (synq_hsize - 1);
+}
+
+struct request_sock *inet6_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __u16 rport,
+ const struct in6_addr *raddr,
+ const struct in6_addr *laddr,
+ const int iif)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req, **prev;
+
+ for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport,
+ lopt->hash_rnd,
+ lopt->nr_table_entries)];
+ (req = *prev) != NULL;
+ prev = &req->dl_next) {
+ const struct inet6_request_sock *treq = inet6_rsk(req);
+
+ if (inet_rsk(req)->rmt_port == rport &&
+ req->rsk_ops->family == AF_INET6 &&
+ ipv6_addr_equal(&treq->rmt_addr, raddr) &&
+ ipv6_addr_equal(&treq->loc_addr, laddr) &&
+ (!treq->iif || treq->iif == iif)) {
+ BUG_TRAP(req->sk == NULL);
+ *prevp = prev;
+ return req;
+ }
+ }
+
+ return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_search_req);
+
+void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+ struct request_sock *req,
+ const unsigned long timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
+ inet_rsk(req)->rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+
+void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
+
+ sin6->sin6_family = AF_INET6;
+ ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
+ sin6->sin6_port = inet_sk(sk)->dport;
+ /* We do not store received flowlabel for TCP */
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = 0;
+ if (sk->sk_bound_dev_if &&
+ ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ sin6->sin6_scope_id = sk->sk_bound_dev_if;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+
+int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct flowi fl;
+ struct dst_entry *dst;
+ struct in6_addr *final_p = NULL, final;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.fl6_flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_sport = inet->sport;
+ fl.fl_ip_dport = inet->dport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+ int err = ip6_dst_lookup(sk, &dst, &fl);
+
+ if (err) {
+ sk->sk_err_soft = -err;
+ return err;
+ }
+
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_route_caps = 0;
+ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+ }
+
+ skb->dst = dst_clone(dst);
+
+ /* Restore final destination back after routing done */
+ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+
+ return ip6_xmit(sk, skb, &fl, np->opt, 0);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 01d5f46d4e40..4154f3a8b6cf 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -5,7 +5,8 @@
*
* Generic INET6 transport hashtables
*
- * Authors: Lotsa people, from code originally in tcp
+ * Authors: Lotsa people, from code originally in tcp, generalised here
+ * by Arnaldo Carvalho de Melo <acme@mandriva.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,12 +15,13 @@
*/
#include <linux/config.h>
-
#include <linux/module.h>
+#include <linux/random.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
+#include <net/ip.h>
struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
const struct in6_addr *daddr,
@@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
}
EXPORT_SYMBOL_GPL(inet6_lookup);
+
+static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct in6_addr *daddr = &np->rcv_saddr;
+ const struct in6_addr *saddr = &np->daddr;
+ const int dif = sk->sk_bound_dev_if;
+ const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
+ inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
+ prefetch(head->chain.first);
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+ const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
+
+ tw = inet_twsk(sk2);
+
+ if(*((__u32 *)&(tw->tw_dport)) == ports &&
+ sk2->sk_family == PF_INET6 &&
+ ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+ goto not_unique;
+ }
+ }
+ tw = NULL;
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+unique:
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sk->sk_hash = hash;
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp != NULL) {
+ *twp = tw;
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+ } else if (tw != NULL) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw, death_row);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ inet_twsk_put(tw);
+ }
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet6_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
+ np->daddr.s6_addr32,
+ inet->dport);
+}
+
+int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ if (snum == 0) {
+ const int low = sysctl_local_port_range[0];
+ const int high = sysctl_local_port_range[1];
+ const int range = high - low;
+ int i, port;
+ static u32 hint;
+ const u32 offset = hint + inet6_sk_port_offset(sk);
+ struct hlist_node *node;
+ struct inet_timewait_sock *tw = NULL;
+
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+ head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (tb->port == port) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__inet6_check_established(death_row,
+ sk, port,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->sport = htons(port);
+ __inet6_hash(hinfo, sk);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ inet_twsk_deschedule(tw, death_row);
+ inet_twsk_put(tw);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+
+ if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
+ __inet6_hash(hinfo, sk);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = __inet6_check_established(death_row, sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1cf02765fb5c..89d12b4817a9 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label)
return NULL;
}
+EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+
void fl6_free_socklist(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8523c76ebf76..b4c4beba0ede 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -775,6 +775,8 @@ out_err_release:
return err;
}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
static inline int ip6_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 55917fb17094..626dd39685f2 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -47,6 +47,7 @@
#include <linux/rtnetlink.h>
#include <net/icmp.h>
#include <net/ipv6.h>
+#include <net/protocol.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3620718defe6..c63868dd2ca2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
sk_refcnt_debug_dec(sk);
if (sk->sk_protocol == IPPROTO_TCP) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
local_bh_disable();
sock_prot_dec_use(sk->sk_prot);
sock_prot_inc_use(&tcp_prot);
local_bh_enable();
sk->sk_prot = &tcp_prot;
- tp->af_specific = &ipv4_specific;
+ icsk->icsk_af_ops = &ipv4_specific;
sk->sk_socket->ops = &inet_stream_ops;
sk->sk_family = PF_INET;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
local_bh_disable();
sock_prot_dec_use(sk->sk_prot);
@@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
}
retv = 0;
- if (sk->sk_type == SOCK_STREAM) {
+ if (inet_sk(sk)->is_icsk) {
if (opt) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!((1 << sk->sk_state) &
(TCPF_LISTEN | TCPF_CLOSE))
&& inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
- tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ icsk->icsk_ext_hdr_len =
+ opt->opt_flen + opt->opt_nflen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
opt = xchg(&np->opt, opt);
@@ -380,14 +381,15 @@ sticky_done:
goto done;
update:
retv = 0;
- if (sk->sk_type == SOCK_STREAM) {
+ if (inet_sk(sk)->is_icsk) {
if (opt) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!((1 << sk->sk_state) &
(TCPF_LISTEN | TCPF_CLOSE))
&& inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
- tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
- tcp_sync_mss(sk, tp->pmtu_cookie);
+ icsk->icsk_ext_hdr_len =
+ opt->opt_flen + opt->opt_nflen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
opt = xchg(&np->opt, opt);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f829a4ad3ccc..1cf305a9f8dd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
mc_lst->ifindex = dev->ifindex;
mc_lst->sfmode = MCAST_EXCLUDE;
- mc_lst->sflock = RW_LOCK_UNLOCKED;
+ rwlock_init(&mc_lst->sflock);
mc_lst->sflist = NULL;
/*
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 95d469271c4d..ea43ef1d94a7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -15,6 +15,7 @@
* - new extension header parser code
*/
#include <linux/config.h>
+#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/kmod.h>
#include <linux/vmalloc.h>
@@ -86,11 +87,6 @@ static DECLARE_MUTEX(ip6t_mutex);
context stops packets coming through and allows user context to read
the counters or update the rules.
- To be cache friendly on SMP, we arrange them like so:
- [ n-entries ]
- ... cache-align padding ...
- [ n-entries ]
-
Hence the start of any table is given by get_table() below. */
/* The table itself */
@@ -108,20 +104,15 @@ struct ip6t_table_info
unsigned int underflow[NF_IP6_NUMHOOKS];
/* ip6t_entry tables: one per CPU */
- char entries[0] ____cacheline_aligned;
+ void *entries[NR_CPUS];
};
static LIST_HEAD(ip6t_target);
static LIST_HEAD(ip6t_match);
static LIST_HEAD(ip6t_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
#if 0
#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -376,8 +367,7 @@ ip6t_do_table(struct sk_buff **pskb,
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private, smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -649,7 +639,8 @@ unconditional(const struct ip6t_ip6 *ipv6)
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ip6t_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -658,7 +649,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct ip6t_entry *e
- = (struct ip6t_entry *)(newinfo->entries + pos);
+ = (struct ip6t_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -708,13 +699,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
goto next;
e = (struct ip6t_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct ip6t_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -731,7 +722,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
newpos = pos + e->next_offset;
}
e = (struct ip6t_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -941,6 +932,7 @@ static int
translate_table(const char *name,
unsigned int valid_hooks,
struct ip6t_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -961,11 +953,11 @@ translate_table(const char *name,
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
if (ret != 0)
return ret;
@@ -993,27 +985,24 @@ translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks))
+ if (!mark_source_chains(newinfo, valid_hooks, entry0))
return -ELOOP;
/* Finally, each sanity check must pass */
i = 0;
- ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ IP6T_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -1029,15 +1018,12 @@ replace_table(struct ip6t_table *table,
#ifdef CONFIG_NETFILTER_DEBUG
{
- struct ip6t_entry *table_base;
- unsigned int i;
+ int cpu;
- for_each_cpu(i) {
- table_base =
- (void *)newinfo->entries
- + TABLE_OFFSET(newinfo, i);
-
- table_base->comefrom = 0xdead57ac;
+ for_each_cpu(cpu) {
+ struct ip6t_entry *table_base = newinfo->entries[cpu];
+ if (table_base)
+ table_base->comefrom = 0xdead57ac;
}
}
#endif
@@ -1072,16 +1058,44 @@ add_entry_to_counter(const struct ip6t_entry *e,
return 0;
}
+static inline int
+set_entry_to_counter(const struct ip6t_entry *e,
+ struct ip6t_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void
get_counters(const struct ip6t_table_info *t,
struct ip6t_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ IP6T_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ IP6T_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -1098,6 +1112,7 @@ copy_entries_to_user(unsigned int total_size,
struct ip6t_entry *e;
struct ip6t_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -1109,13 +1124,13 @@ copy_entries_to_user(unsigned int total_size,
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ /* choose the copy that is on ourc node/cpu */
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -1127,7 +1142,7 @@ copy_entries_to_user(unsigned int total_size,
struct ip6t_entry_match *m;
struct ip6t_entry_target *t;
- e = (struct ip6t_entry *)(table->private->entries + off);
+ e = (struct ip6t_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct ip6t_entry, counters),
&counters[num],
@@ -1196,6 +1211,46 @@ get_entries(const struct ip6t_get_entries *entries,
return ret;
}
+static void free_table_info(struct ip6t_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct ip6t_table_info *alloc_table_info(unsigned int size)
+{
+ struct ip6t_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size,
+ cpu_to_node(cpu));
+ if (newinfo->entries[cpu] == NULL) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int
do_replace(void __user *user, unsigned int len)
{
@@ -1204,6 +1259,7 @@ do_replace(void __user *user, unsigned int len)
struct ip6t_table *t;
struct ip6t_table_info *newinfo, *oldinfo;
struct ip6t_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1212,13 +1268,13 @@ do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct ip6t_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1229,10 +1285,9 @@ do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -1271,8 +1326,9 @@ do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1284,11 +1340,11 @@ do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&ip6t_mutex);
free_newinfo_counters_untrans:
- IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1321,6 +1377,7 @@ do_add_counters(void __user *user, unsigned int len)
struct ip6t_counters_info tmp, *paddc;
struct ip6t_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1350,7 +1407,9 @@ do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- IP6T_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[smp_processor_id()];
+ IP6T_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1543,28 +1602,29 @@ int ip6t_register_table(struct ip6t_table *table,
struct ip6t_table_info *newinfo;
static struct ip6t_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct ip6t_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- memcpy(newinfo->entries, repl->entries, repl->size);
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&ip6t_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1593,20 +1653,23 @@ int ip6t_register_table(struct ip6t_table *table,
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void ip6t_unregister_table(struct ip6t_table *table)
{
+ void *loc_cpu_entry;
+
down(&ip6t_mutex);
LIST_DELETE(&ip6t_tables, table);
up(&ip6t_mutex);
/* Decrease module usage counts and free resources */
- IP6T_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0cd1d1bd9033..ae4653bfd654 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/spinlock.h>
#include <linux/icmpv6.h>
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index dde37793d20b..268918d5deea 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 24bc0cde43a1..65937de1b58c 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c2c52af9e560..f3e5ffbd592f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -98,7 +98,7 @@ struct nf_ct_frag6_queue
#define FRAG6Q_HASHSZ 64
static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
-static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(nf_ct_frag6_lock);
static u32 nf_ct_frag6_hash_rnd;
static LIST_HEAD(nf_ct_frag6_lru_list);
int nf_ct_frag6_nqueues = 0;
@@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct
init_timer(&fq->timer);
fq->timer.function = nf_ct_frag6_expire;
fq->timer.data = (long) fq;
- fq->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&fq->lock);
atomic_set(&fq->refcnt, 1);
return nf_ct_frag6_intern(hash, fq);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a66900cda2af..66f1d12ea578 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -32,6 +32,7 @@
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/bug.h>
@@ -433,25 +434,14 @@ out:
return err;
csum_copy_err:
- /* Clear queue. */
- if (flags&MSG_PEEK) {
- int clear = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- clear = 1;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- if (clear)
- kfree_skb(skb);
- }
+ skb_kill_datagram(sk, skb, flags);
/* Error for blocking case is chosen to masquerade
as some normal condition.
*/
err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
/* FIXME: increment a raw6 drops counter here */
- goto out_free;
+ goto out;
}
static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8827389abaf7..2947bc56d8a0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -48,6 +48,7 @@
#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
+#include <net/inet6_connection_sock.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -59,6 +60,7 @@
#include <net/addrconf.h>
#include <net/snmp.h>
#include <net/dsfield.h>
+#include <net/timewait_sock.h>
#include <asm/uaccess.h>
@@ -67,224 +69,33 @@
static void tcp_v6_send_reset(struct sk_buff *skb);
static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
-static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
+static void tcp_v6_send_check(struct sock *sk, int len,
struct sk_buff *skb);
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
-static struct tcp_func ipv6_mapped;
-static struct tcp_func ipv6_specific;
+static struct inet_connection_sock_af_ops ipv6_mapped;
+static struct inet_connection_sock_af_ops ipv6_specific;
-static inline int tcp_v6_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
-{
- const struct sock *sk2;
- const struct hlist_node *node;
-
- /* We must walk the whole port owner list in this case. -DaveM */
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
- (!sk->sk_reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- ipv6_rcv_saddr_equal(sk, sk2))
- break;
- }
-
- return node != NULL;
-}
-
-/* Grrr, addr_type already calculated by caller, but I don't want
- * to add some silly "cookie" argument to this method just for that.
- * But it doesn't matter, the recalculation is in the rarest path
- * this function ever takes.
- */
static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
{
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- struct hlist_node *node;
- int ret;
-
- local_bh_disable();
- if (snum == 0) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover = net_random() % (high - low) + low;
-
- do {
- head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == rover)
- goto next;
- break;
- next:
- spin_unlock(&head->lock);
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (unlikely(remaining <= 0))
- goto fail;
-
- /* OK, here is the one we will use. */
- snum = rover;
- } else {
- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
-tb_found:
- if (tb && !hlist_empty(&tb->owners)) {
- if (tb->fastreuse > 0 && sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN) {
- goto success;
- } else {
- ret = 1;
- if (tcp_v6_bind_conflict(sk, tb))
- goto fail_unlock;
- }
- }
-tb_not_found:
- ret = 1;
- if (tb == NULL) {
- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
- if (tb == NULL)
- goto fail_unlock;
- }
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
-
-success:
- if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, snum);
- BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
- ret = 0;
-
-fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
- return ret;
-}
-
-static __inline__ void __tcp_v6_hash(struct sock *sk)
-{
- struct hlist_head *list;
- rwlock_t *lock;
-
- BUG_TRAP(sk_unhashed(sk));
-
- if (sk->sk_state == TCP_LISTEN) {
- list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &tcp_hashinfo.lhash_lock;
- inet_listen_wlock(&tcp_hashinfo);
- } else {
- unsigned int hash;
- sk->sk_hash = hash = inet6_sk_ehashfn(sk);
- hash &= (tcp_hashinfo.ehash_size - 1);
- list = &tcp_hashinfo.ehash[hash].chain;
- lock = &tcp_hashinfo.ehash[hash].lock;
- write_lock(lock);
- }
-
- __sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+ inet6_csk_bind_conflict);
}
-
static void tcp_v6_hash(struct sock *sk)
{
if (sk->sk_state != TCP_CLOSE) {
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tp->af_specific == &ipv6_mapped) {
+ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
tcp_prot.hash(sk);
return;
}
local_bh_disable();
- __tcp_v6_hash(sk);
+ __inet6_hash(&tcp_hashinfo, sk);
local_bh_enable();
}
}
-/*
- * Open request hash tables.
- */
-
-static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
-{
- u32 a, b, c;
-
- a = raddr->s6_addr32[0];
- b = raddr->s6_addr32[1];
- c = raddr->s6_addr32[2];
-
- a += JHASH_GOLDEN_RATIO;
- b += JHASH_GOLDEN_RATIO;
- c += rnd;
- __jhash_mix(a, b, c);
-
- a += raddr->s6_addr32[3];
- b += (u32) rport;
- __jhash_mix(a, b, c);
-
- return c & (TCP_SYNQ_HSIZE - 1);
-}
-
-static struct request_sock *tcp_v6_search_req(const struct sock *sk,
- struct request_sock ***prevp,
- __u16 rport,
- struct in6_addr *raddr,
- struct in6_addr *laddr,
- int iif)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req, **prev;
-
- for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- const struct tcp6_request_sock *treq = tcp6_rsk(req);
-
- if (inet_rsk(req)->rmt_port == rport &&
- req->rsk_ops->family == AF_INET6 &&
- ipv6_addr_equal(&treq->rmt_addr, raddr) &&
- ipv6_addr_equal(&treq->loc_addr, laddr) &&
- (!treq->iif || treq->iif == iif)) {
- BUG_TRAP(req->sk == NULL);
- *prevp = prev;
- return req;
- }
- }
-
- return NULL;
-}
-
static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len,
struct in6_addr *saddr,
struct in6_addr *daddr,
@@ -308,195 +119,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
}
}
-static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
-{
- struct inet_sock *inet = inet_sk(sk);
- const struct ipv6_pinfo *np = inet6_sk(sk);
- const struct in6_addr *daddr = &np->rcv_saddr;
- const struct in6_addr *saddr = &np->daddr;
- const int dif = sk->sk_bound_dev_if;
- const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
- struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
- struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
- const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
-
- tw = inet_twsk(sk2);
-
- if(*((__u32 *)&(tw->tw_dport)) == ports &&
- sk2->sk_family == PF_INET6 &&
- ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
- sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tcptw->tw_ts_recent_stamp &&
- (!twp ||
- (sysctl_tcp_tw_reuse &&
- xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
- /* See comment in tcp_ipv4.c */
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (!tp->write_seq)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- sock_hold(sk2);
- goto unique;
- } else
- goto not_unique;
- }
- }
- tw = NULL;
-
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
- goto not_unique;
- }
-
-unique:
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sk->sk_hash = hash;
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
-
- if (twp) {
- *twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- } else if (tw) {
- /* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw, &tcp_death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
- inet_twsk_put(tw);
- }
- return 0;
-
-not_unique:
- write_unlock(&head->lock);
- return -EADDRNOTAVAIL;
-}
-
-static inline u32 tcpv6_port_offset(const struct sock *sk)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const struct ipv6_pinfo *np = inet6_sk(sk);
-
- return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->dport);
-}
-
-static int tcp_v6_hash_connect(struct sock *sk)
-{
- unsigned short snum = inet_sk(sk)->num;
- struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
-
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int range = high - low;
- int i;
- int port;
- static u32 hint;
- u32 offset = hint + tcpv6_port_offset(sk);
- struct hlist_node *node;
- struct inet_timewait_sock *tw = NULL;
-
- local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
- head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
- goto next_port;
- if (!__tcp_v6_check_established(sk,
- port,
- &tw))
- goto ok;
- goto next_port;
- }
- }
-
- tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
- }
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-
-ok:
- hint += i;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __tcp_v6_hash(sk);
- }
- spin_unlock(&head->lock);
-
- if (tw) {
- inet_twsk_deschedule(tw, &tcp_death_row);
- inet_twsk_put(tw);
- }
-
- ret = 0;
- goto out;
- }
-
- head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
-
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __tcp_v6_hash(sk);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = __tcp_v6_check_established(sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
- }
-}
-
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p = NULL, final;
@@ -571,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
*/
if (addr_type == IPV6_ADDR_MAPPED) {
- u32 exthdrlen = tp->ext_header_len;
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -583,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
- tp->af_specific = &ipv6_mapped;
+ icsk->icsk_af_ops = &ipv6_mapped;
sk->sk_backlog_rcv = tcp_v4_do_rcv;
err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
if (err) {
- tp->ext_header_len = exthdrlen;
- tp->af_specific = &ipv6_specific;
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &ipv6_specific;
sk->sk_backlog_rcv = tcp_v6_do_rcv;
goto failure;
} else {
@@ -643,16 +271,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sk->sk_route_caps = dst->dev->features &
~(NETIF_F_IP_CSUM | NETIF_F_TSO);
- tp->ext_header_len = 0;
+ icsk->icsk_ext_hdr_len = 0;
if (np->opt)
- tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
inet->dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
- err = tcp_v6_hash_connect(sk);
+ err = inet6_hash_connect(&tcp_death_row, sk);
if (err)
goto late_failure;
@@ -758,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
} else
dst_hold(dst);
- if (tp->pmtu_cookie > dst_mtu(dst)) {
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
tcp_sync_mss(sk, dst_mtu(dst));
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
@@ -775,8 +404,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
- &hdr->saddr, inet6_iif(skb));
+ req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
+ &hdr->saddr, inet6_iif(skb));
if (!req)
goto out;
@@ -822,7 +451,7 @@ out:
static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
struct dst_entry *dst)
{
- struct tcp6_request_sock *treq = tcp6_rsk(req);
+ struct inet6_request_sock *treq = inet6_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff * skb;
struct ipv6_txoptions *opt = NULL;
@@ -888,8 +517,8 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
- if (tcp6_rsk(req)->pktopts)
- kfree_skb(tcp6_rsk(req)->pktopts);
+ if (inet6_rsk(req)->pktopts)
+ kfree_skb(inet6_rsk(req)->pktopts);
}
static struct request_sock_ops tcp6_request_sock_ops = {
@@ -901,26 +530,15 @@ static struct request_sock_ops tcp6_request_sock_ops = {
.send_reset = tcp_v6_send_reset
};
-static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_skb_parm *opt = IP6CB(skb);
-
- if (np->rxopt.all) {
- if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
- ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) ||
- (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) ||
- ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
- return 1;
- }
- return 0;
-}
-
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+};
-static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
- struct sk_buff *skb)
+static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcphdr *th = skb->h.th;
if (skb->ip_summed == CHECKSUM_HW) {
th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0);
@@ -1091,8 +709,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
struct sock *nsk;
/* Find possible connection requests. */
- req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, inet6_iif(skb));
+ req = inet6_csk_search_req(sk, &prev, th->source,
+ &skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr, inet6_iif(skb));
if (req)
return tcp_check_req(sk, skb, req, prev);
@@ -1116,23 +735,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
return sk;
}
-static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
- inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
-}
-
-
/* FIXME: this is substantially similar to the ipv4 code.
* Can some kind of merge be done? -- erics
*/
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp6_request_sock *treq;
+ struct inet6_request_sock *treq;
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
@@ -1157,7 +765,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = reqsk_alloc(&tcp6_request_sock_ops);
+ req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
if (req == NULL)
goto drop;
@@ -1170,7 +778,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
- treq = tcp6_rsk(req);
+ treq = inet6_rsk(req);
ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
TCP_ECN_create_request(req, skb->h.th);
@@ -1196,8 +804,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (tcp_v6_send_synack(sk, req, NULL))
goto drop;
- tcp_v6_synq_add(sk, req);
-
+ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
return 0;
drop:
@@ -1212,7 +819,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
- struct tcp6_request_sock *treq = tcp6_rsk(req);
+ struct inet6_request_sock *treq = inet6_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct tcp6_sock *newtcp6sk;
struct inet_sock *newinet;
@@ -1247,7 +854,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
- newtp->af_specific = &ipv6_mapped;
+ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
@@ -1261,10 +868,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
*/
/* It is tricky place. Until this moment IPv4 tcp
- worked with IPv6 af_tcp.af_specific.
+ worked with IPv6 icsk.icsk_af_ops.
Sync it now.
*/
- tcp_sync_mss(newsk, newtp->pmtu_cookie);
+ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
return newsk;
}
@@ -1371,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
sock_kfree_s(sk, opt, opt->tot_len);
}
- newtp->ext_header_len = 0;
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newnp->opt)
- newtp->ext_header_len = newnp->opt->opt_nflen +
- newnp->opt->opt_flen;
+ inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+ newnp->opt->opt_flen);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
@@ -1382,7 +989,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
- __tcp_v6_hash(newsk);
+ __inet6_hash(&tcp_hashinfo, newsk);
inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1679,139 +1286,16 @@ do_time_wait:
goto discard_it;
}
-static int tcp_v6_rebuild_header(struct sock *sk)
-{
- int err;
- struct dst_entry *dst;
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
- sk->sk_route_caps = 0;
- return err;
- }
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
- return err;
- }
-
- ip6_dst_store(sk, dst, NULL);
- sk->sk_route_caps = dst->dev->features &
- ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
- }
-
- return 0;
-}
-
-static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
-{
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi fl;
- struct dst_entry *dst;
- struct in6_addr *final_p = NULL, final;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_sport = inet->sport;
- fl.fl_ip_dport = inet->dport;
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst == NULL) {
- int err = ip6_dst_lookup(sk, &dst, &fl);
-
- if (err) {
- sk->sk_err_soft = -err;
- return err;
- }
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_route_caps = 0;
- return err;
- }
-
- ip6_dst_store(sk, dst, NULL);
- sk->sk_route_caps = dst->dev->features &
- ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
- }
-
- skb->dst = dst_clone(dst);
-
- /* Restore final destination back after routing done */
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-
- return ip6_xmit(sk, skb, &fl, np->opt, 0);
-}
-
-static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
-
- sin6->sin6_family = AF_INET6;
- ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
- sin6->sin6_port = inet_sk(sk)->dport;
- /* We do not store received flowlabel for TCP */
- sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (sk->sk_bound_dev_if &&
- ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = sk->sk_bound_dev_if;
-}
-
static int tcp_v6_remember_stamp(struct sock *sk)
{
/* Alas, not yet... */
return 0;
}
-static struct tcp_func ipv6_specific = {
- .queue_xmit = tcp_v6_xmit,
+static struct inet_connection_sock_af_ops ipv6_specific = {
+ .queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
- .rebuild_header = tcp_v6_rebuild_header,
+ .rebuild_header = inet6_sk_rebuild_header,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.remember_stamp = tcp_v6_remember_stamp,
@@ -1819,7 +1303,7 @@ static struct tcp_func ipv6_specific = {
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = v6_addr2sockaddr,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6)
};
@@ -1827,7 +1311,7 @@ static struct tcp_func ipv6_specific = {
* TCP over IPv4 via INET6 API
*/
-static struct tcp_func ipv6_mapped = {
+static struct inet_connection_sock_af_ops ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -1838,7 +1322,7 @@ static struct tcp_func ipv6_mapped = {
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = v6_addr2sockaddr,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6)
};
@@ -1877,8 +1361,9 @@ static int tcp_v6_init_sock(struct sock *sk)
sk->sk_state = TCP_CLOSE;
- tp->af_specific = &ipv6_specific;
+ icsk->icsk_af_ops = &ipv6_specific;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ icsk->icsk_sync_mss = tcp_sync_mss;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
@@ -1900,14 +1385,13 @@ static int tcp_v6_destroy_sock(struct sock *sk)
static void get_openreq6(struct seq_file *seq,
struct sock *sk, struct request_sock *req, int i, int uid)
{
- struct in6_addr *dest, *src;
int ttd = req->expires - jiffies;
+ struct in6_addr *src = &inet6_rsk(req)->loc_addr;
+ struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
if (ttd < 0)
ttd = 0;
- src = &tcp6_rsk(req)->loc_addr;
- dest = &tcp6_rsk(req)->rmt_addr;
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -1988,14 +1472,14 @@ static void get_timewait6_sock(struct seq_file *seq,
{
struct in6_addr *dest, *src;
__u16 destp, srcp;
- struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+ struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
int ttd = tw->tw_ttd - jiffies;
if (ttd < 0)
ttd = 0;
- dest = &tcp6tw->tw_v6_daddr;
- src = &tcp6tw->tw_v6_rcv_saddr;
+ dest = &tw6->tw_v6_daddr;
+ src = &tw6->tw_v6_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
@@ -2093,7 +1577,7 @@ struct proto tcpv6_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
- .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
};
@@ -2110,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = {
.ops = &inet6_stream_ops,
.capability = -1,
.no_check = 0,
- .flags = INET_PROTOSW_PERMANENT,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
};
void __init tcpv6_init(void)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5cc8731eb55b..d8538dcea813 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -36,6 +36,7 @@
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
+#include <linux/skbuff.h>
#include <asm/uaccess.h>
#include <net/sock.h>
@@ -300,20 +301,7 @@ out:
return err;
csum_copy_err:
- /* Clear queue. */
- if (flags&MSG_PEEK) {
- int clear = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- clear = 1;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- if (clear)
- kfree_skb(skb);
- }
-
- skb_free_datagram(sk, skb);
+ skb_kill_datagram(sk, skb, flags);
if (flags & MSG_DONTWAIT) {
UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 34b3bb868409..0dc519b40404 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink;
static struct datalink_proto *p8023_datalink;
static struct datalink_proto *pSNAP_datalink;
-static struct proto_ops ipx_dgram_ops;
+static const struct proto_ops ipx_dgram_ops;
LIST_HEAD(ipx_interfaces);
DEFINE_SPINLOCK(ipx_interfaces_lock);
@@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
rc = -EINVAL;
break;
default:
- rc = dev_ioctl(cmd, argp);
+ rc = -ENOIOCTLCMD;
break;
}
@@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
.family = PF_IPX,
.owner = THIS_MODULE,
.release = ipx_release,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 6f92f9c62990..fbfa96754417 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -62,12 +62,12 @@
static int irda_create(struct socket *sock, int protocol);
-static struct proto_ops irda_stream_ops;
-static struct proto_ops irda_seqpacket_ops;
-static struct proto_ops irda_dgram_ops;
+static const struct proto_ops irda_stream_ops;
+static const struct proto_ops irda_seqpacket_ops;
+static const struct proto_ops irda_dgram_ops;
#ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops irda_ultra_ops;
+static const struct proto_ops irda_ultra_ops;
#define ULTRA_MAX_DATA 382
#endif /* CONFIG_IRDA_ULTRA */
@@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
/*
* POSIX 1003.1g mandates this order.
*/
- if (sk->sk_err)
- ret = sock_error(sk);
+ ret = sock_error(sk);
+ if (ret)
+ break;
else if (sk->sk_shutdown & RCV_SHUTDOWN)
;
else if (noblock)
@@ -1821,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return -EINVAL;
default:
IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__);
- return dev_ioctl(cmd, (void __user *) arg);
+ return -ENOIOCTLCMD;
}
/*NOTREACHED*/
@@ -2463,7 +2464,7 @@ static struct net_proto_family irda_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
@@ -2484,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
@@ -2505,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
@@ -2527,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
};
#ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
.family = PF_IRDA,
.owner = THIS_MODULE,
.release = irda_release,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 39031684b65c..52efd04cbedb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void)
}
-static struct proto_ops pfkey_ops;
+static const struct proto_ops pfkey_ops;
static void pfkey_insert(struct sock *sk)
{
@@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = {
[SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
[SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
[SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address),
+ [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx),
};
/* Verify sadb_address_{len,prefixlen} against sa_family. */
@@ -383,6 +384,55 @@ static int verify_address_len(void *p)
return 0;
}
+static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
+{
+ int len = 0;
+
+ len += sizeof(struct sadb_x_sec_ctx);
+ len += sec_ctx->sadb_x_ctx_len;
+ len += sizeof(uint64_t) - 1;
+ len /= sizeof(uint64_t);
+
+ return len;
+}
+
+static inline int verify_sec_ctx_len(void *p)
+{
+ struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
+ int len;
+
+ if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
+ return -EINVAL;
+
+ len = pfkey_sec_ctx_len(sec_ctx);
+
+ if (sec_ctx->sadb_x_sec_len != len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
+{
+ struct xfrm_user_sec_ctx *uctx = NULL;
+ int ctx_size = sec_ctx->sadb_x_ctx_len;
+
+ uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL);
+
+ if (!uctx)
+ return NULL;
+
+ uctx->len = pfkey_sec_ctx_len(sec_ctx);
+ uctx->exttype = sec_ctx->sadb_x_sec_exttype;
+ uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
+ uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
+ uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
+ memcpy(uctx + 1, sec_ctx + 1,
+ uctx->ctx_len);
+
+ return uctx;
+}
+
static int present_and_same_family(struct sadb_address *src,
struct sadb_address *dst)
{
@@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
if (verify_address_len(p))
return -EINVAL;
}
+ if (ext_type == SADB_X_EXT_SEC_CTX) {
+ if (verify_sec_ctx_len(p))
+ return -EINVAL;
+ }
ext_hdrs[ext_type-1] = p;
}
p += ext_len;
@@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
struct sadb_key *key;
struct sadb_x_sa2 *sa2;
struct sockaddr_in *sin;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
+ int ctx_size = 0;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct sockaddr_in6 *sin6;
#endif
@@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
sizeof(struct sadb_address)*2 +
sockaddr_size*2 +
sizeof(struct sadb_x_sa2);
+
+ if ((xfrm_ctx = x->security)) {
+ ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+ size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+ }
+
/* identity & sensitivity */
if ((x->props.family == AF_INET &&
@@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
n_port->sadb_x_nat_t_port_reserved = 0;
}
+ /* security context */
+ if (xfrm_ctx) {
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+ sizeof(struct sadb_x_sec_ctx) + ctx_size);
+ sec_ctx->sadb_x_sec_len =
+ (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
return skb;
}
@@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
struct sadb_lifetime *lifetime;
struct sadb_sa *sa;
struct sadb_key *key;
+ struct sadb_x_sec_ctx *sec_ctx;
uint16_t proto;
int err;
@@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
}
+
+ sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+ if (!uctx)
+ goto out;
+
+ err = security_xfrm_state_alloc(x, uctx);
+ kfree(uctx);
+
+ if (err)
+ goto out;
+ }
+
key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
if (sa->sadb_sa_auth) {
int keysize = 0;
@@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
return 0;
}
+static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
+{
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+
+ if (xfrm_ctx) {
+ int len = sizeof(struct sadb_x_sec_ctx);
+ len += xfrm_ctx->ctx_len;
+ return PFKEY_ALIGN8(len);
+ }
+ return 0;
+}
+
static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
{
int sockaddr_size = pfkey_sockaddr_size(xp->family);
@@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
(sockaddr_size * 2) +
sizeof(struct sadb_x_policy) +
(xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
- (socklen * 2)));
+ (socklen * 2))) +
+ pfkey_xfrm_policy2sec_ctx_size(xp);
}
static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
@@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
struct sadb_lifetime *lifetime;
struct sadb_x_policy *pol;
struct sockaddr_in *sin;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct sockaddr_in6 *sin6;
#endif
@@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
}
}
}
+
+ /* security context */
+ if ((xfrm_ctx = xp->security)) {
+ int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
+
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
+ sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
hdr->sadb_msg_len = size / sizeof(uint64_t);
hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
}
@@ -1976,12 +2099,13 @@ out:
static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
- int err;
+ int err = 0;
struct sadb_lifetime *lifetime;
struct sadb_address *sa;
struct sadb_x_policy *pol;
struct xfrm_policy *xp;
struct km_event c;
+ struct sadb_x_sec_ctx *sec_ctx;
if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
if (xp->selector.dport)
xp->selector.dport_mask = ~0;
+ sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+ if (!uctx) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ err = security_xfrm_policy_alloc(xp, uctx);
+ kfree(uctx);
+
+ if (err)
+ goto out;
+ }
+
xp->lft.soft_byte_limit = XFRM_INF;
xp->lft.hard_byte_limit = XFRM_INF;
xp->lft.soft_packet_limit = XFRM_INF;
@@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
hdr->sadb_msg_type != SADB_X_SPDUPDATE);
- if (err) {
- kfree(xp);
- return err;
- }
+
+ if (err)
+ goto out;
if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
c.event = XFRM_MSG_UPDPOLICY;
@@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
return 0;
out:
+ security_xfrm_policy_free(xp);
kfree(xp);
return err;
}
@@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
int err;
struct sadb_address *sa;
struct sadb_x_policy *pol;
- struct xfrm_policy *xp;
+ struct xfrm_policy *xp, tmp;
struct xfrm_selector sel;
struct km_event c;
+ struct sadb_x_sec_ctx *sec_ctx;
if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
if (sel.dport)
sel.dport_mask = ~0;
- xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1);
+ sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ memset(&tmp, 0, sizeof(struct xfrm_policy));
+
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+ if (!uctx)
+ return -ENOMEM;
+
+ err = security_xfrm_policy_alloc(&tmp, uctx);
+ kfree(uctx);
+
+ if (err)
+ return err;
+ }
+
+ xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
+ security_xfrm_policy_free(&tmp);
if (xp == NULL)
return -ENOENT;
@@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
{
struct xfrm_policy *xp;
struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
+ struct sadb_x_sec_ctx *sec_ctx;
switch (family) {
case AF_INET:
@@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
(*dir = parse_ipsecrequests(xp, pol)) < 0)
goto out;
+ /* security context too */
+ if (len >= (pol->sadb_x_policy_len*8 +
+ sizeof(struct sadb_x_sec_ctx))) {
+ char *p = (char *)pol;
+ struct xfrm_user_sec_ctx *uctx;
+
+ p += pol->sadb_x_policy_len*8;
+ sec_ctx = (struct sadb_x_sec_ctx *)p;
+ if (len < pol->sadb_x_policy_len*8 +
+ sec_ctx->sadb_x_sec_len)
+ goto out;
+ if ((*dir = verify_sec_ctx_len(p)))
+ goto out;
+ uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+ *dir = security_xfrm_policy_alloc(xp, uctx);
+ kfree(uctx);
+
+ if (*dir)
+ goto out;
+ }
+
*dir = pol->sadb_x_policy_dir-1;
return xp;
out:
+ security_xfrm_policy_free(xp);
kfree(xp);
return NULL;
}
@@ -2946,7 +3127,7 @@ out:
return err;
}
-static struct proto_ops pfkey_ops = {
+static const struct proto_ops pfkey_ops = {
.family = PF_KEY,
.owner = THIS_MODULE,
/* Operations that make no sense on pfkey sockets. */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c3f0b0783453..8171c53bc0ed 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -36,7 +36,7 @@
static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
static u16 llc_ui_sap_link_no_max[256];
static struct sockaddr_llc llc_ui_addrnull;
-static struct proto_ops llc_ui_ops;
+static const struct proto_ops llc_ui_ops;
static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
@@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo)
/*
* POSIX 1003.1g mandates this order.
*/
- if (sk->sk_err) {
- rc = sock_error(sk);
+ rc = sock_error(sk);
+ if (rc)
break;
- }
rc = 0;
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
@@ -960,7 +959,7 @@ out:
static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
- return dev_ioctl(cmd, (void __user *)arg);
+ return -ENOIOCTLCMD;
}
/**
@@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops llc_ui_ops = {
+static const struct proto_ops llc_ui_ops = {
.family = PF_LLC,
.owner = THIS_MODULE,
.release = llc_ui_release,
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cba63729313d..e10512e229b6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid)
goto out_unlock;
INIT_HLIST_NODE(&inst->hlist);
- inst->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&inst->lock);
/* needs to be two, since we _put() after creation */
atomic_set(&inst->use, 2);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f28460b61e47..55afdda3d940 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid)
atomic_set(&inst->id_sequence, 0);
/* needs to be two, since we _put() after creation */
atomic_set(&inst->use, 2);
- inst->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&inst->lock);
INIT_LIST_HEAD(&inst->queue_list);
if (!try_module_get(THIS_MODULE))
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 96020d7087e8..7849cac14d3a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
return 0;
}
-static struct proto_ops netlink_ops;
+static const struct proto_ops netlink_ops;
static int netlink_insert(struct sock *sk, u32 pid)
{
@@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb)
return notifier_chain_unregister(&netlink_chain, nb);
}
-static struct proto_ops netlink_ops = {
+static const struct proto_ops netlink_ops = {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 287cfcc56951..3b1378498d50 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -441,7 +441,7 @@ errout:
}
static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
- int seq, int cmd)
+ int seq, u8 cmd)
{
struct sk_buff *skb;
int err;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index e5d82d711cae..63b0e4afeb33 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -63,7 +63,7 @@ static unsigned short circuit = 0x101;
static HLIST_HEAD(nr_list);
static DEFINE_SPINLOCK(nr_list_lock);
-static struct proto_ops nr_proto_ops;
+static const struct proto_ops nr_proto_ops;
/*
* Socket removal during an interrupt is now safe.
@@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
int ret;
- lock_sock(sk);
switch (cmd) {
case TIOCOUTQ: {
long amount;
+
+ lock_sock(sk);
amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
if (amount < 0)
amount = 0;
@@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case TIOCINQ: {
struct sk_buff *skb;
long amount = 0L;
+
+ lock_sock(sk);
/* These two are safe on a single CPU system as only user tasks fiddle here */
if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
amount = skb->len;
@@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
case SIOCGSTAMP:
+ lock_sock(sk);
ret = sock_get_timestamp(sk, argp);
release_sock(sk);
return ret;
@@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCSIFNETMASK:
case SIOCGIFMETRIC:
case SIOCSIFMETRIC:
- release_sock(sk);
return -EINVAL;
case SIOCADDRT:
case SIOCDELRT:
case SIOCNRDECOBS:
- release_sock(sk);
if (!capable(CAP_NET_ADMIN)) return -EPERM;
return nr_rt_ioctl(cmd, argp);
default:
- release_sock(sk);
- return dev_ioctl(cmd, argp);
+ return -ENOIOCTLCMD;
}
- release_sock(sk);
return 0;
}
@@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops nr_proto_ops = {
+static const struct proto_ops nr_proto_ops = {
.family = PF_NETROM,
.owner = THIS_MODULE,
.release = nr_release,
diff --git a/net/nonet.c b/net/nonet.c
index e5241dceaa57..1230f0ae832e 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -14,11 +14,6 @@
#include <linux/init.h>
#include <linux/kernel.h>
-void __init sock_init(void)
-{
- printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n");
-}
-
static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
{
return -ENXIO;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3e2462760413..f69e5ed9bd06 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk)
}
-static struct proto_ops packet_ops;
+static const struct proto_ops packet_ops;
#ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt;
+static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
@@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
#endif
default:
- return dev_ioctl(cmd, (void __user *)arg);
+ return -ENOIOCTLCMD;
}
return 0;
}
@@ -1784,7 +1784,7 @@ out:
#ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt = {
+static const struct proto_ops packet_ops_spkt = {
.family = PF_PACKET,
.owner = THIS_MODULE,
.release = packet_release,
@@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = {
};
#endif
-static struct proto_ops packet_ops = {
+static const struct proto_ops packet_ops = {
.family = PF_PACKET,
.owner = THIS_MODULE,
.release = packet_release,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 829fdbc4400b..63090be2315a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return 0;
default:
- return dev_ioctl(cmd, argp);
+ return -ENOIOCTLCMD;
}
return 0;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82fb07aa06a5..ba5283204837 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,7 +25,7 @@
#include <net/pkt_sched.h>
-#define VERSION "1.1"
+#define VERSION "1.2"
/* Network Emulation Queuing algorithm.
====================================
@@ -65,11 +65,12 @@ struct netem_sched_data {
u32 jitter;
u32 duplicate;
u32 reorder;
+ u32 corrupt;
struct crndstate {
unsigned long last;
unsigned long rho;
- } delay_cor, loss_cor, dup_cor, reorder_cor;
+ } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
struct disttable {
u32 size;
@@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->duplicate = dupsave;
}
+ /*
+ * Randomized packet corruption.
+ * Make copy if needed since we are modifying
+ * If packet is going to be hardware checksummed, then
+ * do it now in software before we mangle it.
+ */
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (!(skb = skb_unshare(skb, GFP_ATOMIC))
+ || (skb->ip_summed == CHECKSUM_HW
+ && skb_checksum_help(skb, 0))) {
+ sch->qstats.drops++;
+ return NET_XMIT_DROP;
+ }
+
+ skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+ }
+
if (q->gap == 0 /* not doing reordering */
|| q->counter < q->gap /* inside last reordering gap */
|| q->reorder < get_crandom(&q->reorder_cor)) {
@@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
return 0;
}
+static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ const struct tc_netem_corrupt *r = RTA_DATA(attr);
+
+ if (RTA_PAYLOAD(attr) != sizeof(*r))
+ return -EINVAL;
+
+ q->corrupt = r->probability;
+ init_crandom(&q->corrupt_cor, r->correlation);
+ return 0;
+}
+
+/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
if (ret)
return ret;
}
+
if (tb[TCA_NETEM_REORDER-1]) {
ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
if (ret)
return ret;
}
- }
+ if (tb[TCA_NETEM_CORRUPT-1]) {
+ ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
@@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
+ struct tc_netem_corrupt corrupt;
qopt.latency = q->latency;
qopt.jitter = q->jitter;
@@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
reorder.correlation = q->reorder_cor.rho;
RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+ corrupt.probability = q->corrupt;
+ corrupt.correlation = q->corrupt_cor.rho;
+ RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+
rta->rta_len = skb->tail - b;
return skb->len;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6cf0342706b5..c4a2a8c4c339 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -22,6 +22,7 @@
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
+#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index dec68a604773..9d05e13e92f6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000;
asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000)
* 1000;
- asoc->pmtu = 0;
asoc->frag_point = 0;
/* Set the association max_retrans and RTO values from the
@@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
asoc->overall_error_count = 0;
+ /* Initialize the association's heartbeat interval based on the
+ * sock configured value.
+ */
+ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
+
+ /* Initialize path max retrans value. */
+ asoc->pathmaxrxt = sp->pathmaxrxt;
+
+ /* Initialize default path MTU. */
+ asoc->pathmtu = sp->pathmtu;
+
+ /* Set association default SACK delay */
+ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
+
+ /* Set the association default flags controlling
+ * Heartbeat, SACK delay, and Path MTU Discovery.
+ */
+ asoc->param_flags = sp->param_flags;
+
/* Initialize the maximum mumber of new data packets that can be sent
* in a burst.
*/
@@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
= 5 * asoc->rto_max;
asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
- asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
- SCTP_DEFAULT_TIMEOUT_SACK;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
sp->autoclose * HZ;
@@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
sctp_transport_set_owner(peer, asoc);
+ /* Initialize the peer's heartbeat interval based on the
+ * association configured value.
+ */
+ peer->hbinterval = asoc->hbinterval;
+
+ /* Set the path max_retrans. */
+ peer->pathmaxrxt = asoc->pathmaxrxt;
+
+ /* Initialize the peer's SACK delay timeout based on the
+ * association configured value.
+ */
+ peer->sackdelay = asoc->sackdelay;
+
+ /* Enable/disable heartbeat, SACK delay, and path MTU discovery
+ * based on association setting.
+ */
+ peer->param_flags = asoc->param_flags;
+
/* Initialize the pmtu of the transport. */
- sctp_transport_pmtu(peer);
+ if (peer->param_flags & SPP_PMTUD_ENABLE)
+ sctp_transport_pmtu(peer);
+ else if (asoc->pathmtu)
+ peer->pathmtu = asoc->pathmtu;
+ else
+ peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
/* If this is the first transport addr on this association,
* initialize the association PMTU to the peer's PMTU.
* If not and the current association PMTU is higher than the new
* peer's PMTU, reset the association PMTU to the new peer's PMTU.
*/
- if (asoc->pmtu)
- asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu);
+ if (asoc->pathmtu)
+ asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
else
- asoc->pmtu = peer->pmtu;
+ asoc->pathmtu = peer->pathmtu;
SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to "
- "%d\n", asoc, asoc->pmtu);
+ "%d\n", asoc, asoc->pathmtu);
- asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
+ asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
/* The asoc->peer.port might not be meaningful yet, but
* initialize the packet structure anyway.
@@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
* (for example, implementations MAY use the size of the
* receiver advertised window).
*/
- peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380));
+ peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
/* At this point, we may not have the receiver's advertised window,
* so initialize ssthresh to the default value and it will be set
@@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
peer->partial_bytes_acked = 0;
peer->flight_size = 0;
- /* By default, enable heartbeat for peer address. */
- peer->hb_allowed = 1;
-
- /* Initialize the peer's heartbeat interval based on the
- * sock configured value.
- */
- peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval);
-
- /* Set the path max_retrans. */
- peer->max_retrans = sp->paddrparam.spp_pathmaxrxt;
-
/* Set the transport's RTO.initial value */
peer->rto = asoc->rto_initial;
@@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
/* Get the lowest pmtu of all the transports. */
list_for_each(pos, &asoc->peer.transport_addr_list) {
t = list_entry(pos, struct sctp_transport, transports);
- if (!pmtu || (t->pmtu < pmtu))
- pmtu = t->pmtu;
+ if (!pmtu || (t->pathmtu < pmtu))
+ pmtu = t->pathmtu;
}
if (pmtu) {
struct sctp_sock *sp = sctp_sk(asoc->base.sk);
- asoc->pmtu = pmtu;
+ asoc->pathmtu = pmtu;
asoc->frag_point = sctp_frag_point(sp, pmtu);
}
SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n",
- __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point);
+ __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point);
}
/* Should we send a SACK to update our peer? */
@@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc)
case SCTP_STATE_SHUTDOWN_SENT:
if ((asoc->rwnd > asoc->a_rwnd) &&
((asoc->rwnd - asoc->a_rwnd) >=
- min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu)))
+ min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu)))
return 1;
break;
default:
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b24ff2c1aef5..238f1bffa684 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
struct sctp_transport *t, __u32 pmtu)
{
- if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
- printk(KERN_WARNING "%s: Reported pmtu %d too low, "
- "using default minimum of %d\n", __FUNCTION__, pmtu,
- SCTP_DEFAULT_MINSEGMENT);
- pmtu = SCTP_DEFAULT_MINSEGMENT;
- }
+ if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu))
+ return;
- if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) {
- t->pmtu = pmtu;
+ if (t->param_flags & SPP_PMTUD_ENABLE) {
+ if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
+ printk(KERN_WARNING "%s: Reported pmtu %d too low, "
+ "using default minimum of %d\n",
+ __FUNCTION__, pmtu,
+ SCTP_DEFAULT_MINSEGMENT);
+ /* Use default minimum segment size and disable
+ * pmtu discovery on this transport.
+ */
+ t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
+ t->param_flags = (t->param_flags & ~SPP_HB) |
+ SPP_PMTUD_DISABLE;
+ } else {
+ t->pathmtu = pmtu;
+ }
+
+ /* Update association pmtu. */
sctp_assoc_sync_pmtu(asoc);
- sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
}
+
+ /* Retransmit with the new pmtu setting.
+ * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
+ * Needed will never be sent, but if a message was sent before
+ * PMTU discovery was disabled that was larger than the PMTU, it
+ * would not be fragmented, so it must be re-transmitted fragmented.
+ */
+ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
}
/*
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b8fb5f..15c05165c905 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
return 2;
}
-static struct proto_ops inet6_seqpacket_ops = {
+static const struct proto_ops inet6_seqpacket_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 931371633464..a40991ef72c9 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
goto finish;
pmtu = ((packet->transport->asoc) ?
- (packet->transport->asoc->pmtu) :
- (packet->transport->pmtu));
+ (packet->transport->asoc->pathmtu) :
+ (packet->transport->pathmtu));
too_big = (psize + chunk_len > pmtu);
@@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
if (!dst || (dst->obsolete > 1)) {
dst_release(dst);
sctp_transport_route(tp, NULL, sctp_sk(sk));
- sctp_assoc_sync_pmtu(asoc);
+ if (asoc->param_flags & SPP_PMTUD_ENABLE) {
+ sctp_assoc_sync_pmtu(asoc);
+ }
}
nskb->dst = dst_clone(tp->dst);
@@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet)
SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
nskb->len);
- (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
+ if (tp->param_flags & SPP_PMTUD_ENABLE)
+ (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
+ else
+ (*tp->af_specific->sctp_xmit)(nskb, tp, 1);
out:
packet->size = packet->overhead;
@@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
* if ((flightsize + Max.Burst * MTU) < cwnd)
* cwnd = flightsize + Max.Burst * MTU
*/
- max_burst_bytes = asoc->max_burst * asoc->pmtu;
+ max_burst_bytes = asoc->max_burst * asoc->pathmtu;
if ((transport->flight_size + max_burst_bytes) < transport->cwnd) {
transport->cwnd = transport->flight_size + max_burst_bytes;
SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: "
@@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
* data will fit or delay in hopes of bundling a full
* sized packet.
*/
- if (len < asoc->pmtu - packet->overhead) {
+ if (len < asoc->pathmtu - packet->overhead) {
retval = SCTP_XMIT_NAGLE_DELAY;
goto finish;
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f775d78aa59d..de693b43c8ea 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -54,6 +54,7 @@
#include <net/protocol.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/route.h>
#include <net/sctp/sctp.h>
#include <net/addrconf.h>
#include <net/inet_common.h>
@@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = {
};
/* Socket operations. */
-static struct proto_ops inet_seqpacket_ops = {
+static const struct proto_ops inet_seqpacket_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release, /* Needs to be wrapped... */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 823947170a33..2d7d8a5db2ac 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
{
__u32 ctsn, max_tsn_seen;
struct sctp_chunk *sack;
+ struct sctp_transport *trans = asoc->peer.last_data_from;
int error = 0;
- if (force)
+ if (force ||
+ (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
+ (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
asoc->peer.sack_needed = 1;
ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
if (!asoc->peer.sack_needed) {
/* We will need a SACK for the next packet. */
asoc->peer.sack_needed = 1;
- goto out;
+
+ /* Set the SACK delay timeout based on the
+ * SACK delay for the last transport
+ * data was received from, or the default
+ * for the association.
+ */
+ if (trans)
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ trans->sackdelay;
+ else
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ asoc->sackdelay;
+
+ /* Restart the SACK timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
} else {
if (asoc->a_rwnd > asoc->rwnd)
asoc->a_rwnd = asoc->rwnd;
@@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
}
-out:
+
return error;
nomem:
error = -ENOMEM;
@@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
asoc->overall_error_count++;
if (transport->state != SCTP_INACTIVE &&
- (transport->error_count++ >= transport->max_retrans)) {
+ (transport->error_count++ >= transport->pathmaxrxt)) {
SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
" transport IP: port:%d failed.\n",
asoc,
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 475bfb4972d9..557a7d90b92a 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
* HEARTBEAT is sent (see Section 8.3).
*/
- if (transport->hb_allowed) {
+ if (transport->param_flags & SPP_HB_ENABLE) {
if (SCTP_DISPOSITION_NOMEM ==
sctp_sf_heartbeat(ep, asoc, type, arg,
commands))
@@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
return SCTP_DISPOSITION_DISCARD;
}
- max_interval = link->hb_interval + link->rto;
+ max_interval = link->hbinterval + link->rto;
/* Check if the timestamp looks valid. */
if (time_after(hbinfo->sent_at, jiffies) ||
@@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
* document allow. However, an SCTP transmitter MUST NOT be
* more aggressive than the following algorithms allow.
*/
- if (chunk->end_of_packet) {
+ if (chunk->end_of_packet)
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
- /* Start the SACK timer. */
- sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
- SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
- }
-
return SCTP_DISPOSITION_CONSUME;
discard_force:
@@ -2721,13 +2716,9 @@ discard_force:
return SCTP_DISPOSITION_DISCARD;
discard_noforce:
- if (chunk->end_of_packet) {
+ if (chunk->end_of_packet)
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
- /* Start the SACK timer. */
- sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
- SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
- }
return SCTP_DISPOSITION_DISCARD;
consume:
return SCTP_DISPOSITION_CONSUME;
@@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep,
* send another.
*/
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
- /* Start the SACK timer. */
- sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
- SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
return SCTP_DISPOSITION_CONSUME;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9df888e932c5..fc04d185fa33 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1941,107 +1941,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* address's parameters:
*
* struct sctp_paddrparams {
- * sctp_assoc_t spp_assoc_id;
- * struct sockaddr_storage spp_address;
- * uint32_t spp_hbinterval;
- * uint16_t spp_pathmaxrxt;
- * };
- *
- * spp_assoc_id - (UDP style socket) This is filled in the application,
- * and identifies the association for this query.
+ * sctp_assoc_t spp_assoc_id;
+ * struct sockaddr_storage spp_address;
+ * uint32_t spp_hbinterval;
+ * uint16_t spp_pathmaxrxt;
+ * uint32_t spp_pathmtu;
+ * uint32_t spp_sackdelay;
+ * uint32_t spp_flags;
+ * };
+ *
+ * spp_assoc_id - (one-to-many style socket) This is filled in the
+ * application, and identifies the association for
+ * this query.
* spp_address - This specifies which address is of interest.
* spp_hbinterval - This contains the value of the heartbeat interval,
- * in milliseconds. A value of 0, when modifying the
- * parameter, specifies that the heartbeat on this
- * address should be disabled. A value of UINT32_MAX
- * (4294967295), when modifying the parameter,
- * specifies that a heartbeat should be sent
- * immediately to the peer address, and the current
- * interval should remain unchanged.
+ * in milliseconds. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
* spp_pathmaxrxt - This contains the maximum number of
* retransmissions before this address shall be
- * considered unreachable.
+ * considered unreachable. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmtu - When Path MTU discovery is disabled the value
+ * specified here will be the "fixed" path mtu.
+ * Note that if the spp_address field is empty
+ * then all associations on this address will
+ * have this fixed path mtu set upon them.
+ *
+ * spp_sackdelay - When delayed sack is enabled, this value specifies
+ * the number of milliseconds that sacks will be delayed
+ * for. This value will apply to all addresses of an
+ * association if the spp_address field is empty. Note
+ * also, that if delayed sack is enabled and this
+ * value is set to 0, no change is made to the last
+ * recorded delayed sack timer value.
+ *
+ * spp_flags - These flags are used to control various features
+ * on an association. The flag field may contain
+ * zero or more of the following options.
+ *
+ * SPP_HB_ENABLE - Enable heartbeats on the
+ * specified address. Note that if the address
+ * field is empty all addresses for the association
+ * have heartbeats enabled upon them.
+ *
+ * SPP_HB_DISABLE - Disable heartbeats on the
+ * speicifed address. Note that if the address
+ * field is empty all addresses for the association
+ * will have their heartbeats disabled. Note also
+ * that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ * mutually exclusive, only one of these two should
+ * be specified. Enabling both fields will have
+ * undetermined results.
+ *
+ * SPP_HB_DEMAND - Request a user initiated heartbeat
+ * to be made immediately.
+ *
+ * SPP_PMTUD_ENABLE - This field will enable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected.
+ *
+ * SPP_PMTUD_DISABLE - This field will disable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected. Not also that
+ * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ * exclusive. Enabling both will have undetermined
+ * results.
+ *
+ * SPP_SACKDELAY_ENABLE - Setting this flag turns
+ * on delayed sack. The time specified in spp_sackdelay
+ * is used to specify the sack delay for this address. Note
+ * that if spp_address is empty then all addresses will
+ * enable delayed sack and take on the sack delay
+ * value specified in spp_sackdelay.
+ * SPP_SACKDELAY_DISABLE - Setting this flag turns
+ * off delayed sack. If the spp_address field is blank then
+ * delayed sack is disabled for the entire association. Note
+ * also that this field is mutually exclusive to
+ * SPP_SACKDELAY_ENABLE, setting both will have undefined
+ * results.
*/
+int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
+ struct sctp_transport *trans,
+ struct sctp_association *asoc,
+ struct sctp_sock *sp,
+ int hb_change,
+ int pmtud_change,
+ int sackdelay_change)
+{
+ int error;
+
+ if (params->spp_flags & SPP_HB_DEMAND && trans) {
+ error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
+ if (error)
+ return error;
+ }
+
+ if (params->spp_hbinterval) {
+ if (trans) {
+ trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
+ } else if (asoc) {
+ asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
+ } else {
+ sp->hbinterval = params->spp_hbinterval;
+ }
+ }
+
+ if (hb_change) {
+ if (trans) {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_HB) | hb_change;
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_HB) | hb_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_HB) | hb_change;
+ }
+ }
+
+ if (params->spp_pathmtu) {
+ if (trans) {
+ trans->pathmtu = params->spp_pathmtu;
+ sctp_assoc_sync_pmtu(asoc);
+ } else if (asoc) {
+ asoc->pathmtu = params->spp_pathmtu;
+ sctp_frag_point(sp, params->spp_pathmtu);
+ } else {
+ sp->pathmtu = params->spp_pathmtu;
+ }
+ }
+
+ if (pmtud_change) {
+ if (trans) {
+ int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
+ (params->spp_flags & SPP_PMTUD_ENABLE);
+ trans->param_flags =
+ (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
+ if (update) {
+ sctp_transport_pmtu(trans);
+ sctp_assoc_sync_pmtu(asoc);
+ }
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_PMTUD) | pmtud_change;
+ }
+ }
+
+ if (params->spp_sackdelay) {
+ if (trans) {
+ trans->sackdelay =
+ msecs_to_jiffies(params->spp_sackdelay);
+ } else if (asoc) {
+ asoc->sackdelay =
+ msecs_to_jiffies(params->spp_sackdelay);
+ } else {
+ sp->sackdelay = params->spp_sackdelay;
+ }
+ }
+
+ if (sackdelay_change) {
+ if (trans) {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ }
+ }
+
+ if (params->spp_pathmaxrxt) {
+ if (trans) {
+ trans->pathmaxrxt = params->spp_pathmaxrxt;
+ } else if (asoc) {
+ asoc->pathmaxrxt = params->spp_pathmaxrxt;
+ } else {
+ sp->pathmaxrxt = params->spp_pathmaxrxt;
+ }
+ }
+
+ return 0;
+}
+
static int sctp_setsockopt_peer_addr_params(struct sock *sk,
char __user *optval, int optlen)
{
- struct sctp_paddrparams params;
- struct sctp_transport *trans;
+ struct sctp_paddrparams params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
int error;
+ int hb_change, pmtud_change, sackdelay_change;
if (optlen != sizeof(struct sctp_paddrparams))
- return -EINVAL;
+ return - EINVAL;
+
if (copy_from_user(&params, optval, optlen))
return -EFAULT;
- /*
- * API 7. Socket Options (setting the default value for the endpoint)
- * All options that support specific settings on an association by
- * filling in either an association id variable or a sockaddr_storage
- * SHOULD also support setting of the same value for the entire endpoint
- * (i.e. future associations). To accomplish this the following logic is
- * used when setting one of these options:
-
- * c) If neither the sockaddr_storage or association identification is
- * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and
- * the association identification is 0, the settings are a default
- * and to be applied to the endpoint (all future associations).
- */
+ /* Validate flags and value parameters. */
+ hb_change = params.spp_flags & SPP_HB;
+ pmtud_change = params.spp_flags & SPP_PMTUD;
+ sackdelay_change = params.spp_flags & SPP_SACKDELAY;
+
+ if (hb_change == SPP_HB ||
+ pmtud_change == SPP_PMTUD ||
+ sackdelay_change == SPP_SACKDELAY ||
+ params.spp_sackdelay > 500 ||
+ (params.spp_pathmtu
+ && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT))
+ return -EINVAL;
- /* update default value for endpoint (all future associations) */
- if (!params.spp_assoc_id &&
- sctp_is_any(( union sctp_addr *)&params.spp_address)) {
- /* Manual heartbeat on an endpoint is invalid. */
- if (0xffffffff == params.spp_hbinterval)
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
+ */
+ if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+ trans = sctp_addr_id2transport(sk, &params.spp_address,
+ params.spp_assoc_id);
+ if (!trans)
return -EINVAL;
- else if (params.spp_hbinterval)
- sctp_sk(sk)->paddrparam.spp_hbinterval =
- params.spp_hbinterval;
- if (params.spp_pathmaxrxt)
- sctp_sk(sk)->paddrparam.spp_pathmaxrxt =
- params.spp_pathmaxrxt;
- return 0;
}
- trans = sctp_addr_id2transport(sk, &params.spp_address,
- params.spp_assoc_id);
- if (!trans)
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+ if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
return -EINVAL;
- /* Applications can enable or disable heartbeats for any peer address
- * of an association, modify an address's heartbeat interval, force a
- * heartbeat to be sent immediately, and adjust the address's maximum
- * number of retransmissions sent before an address is considered
- * unreachable.
- *
- * The value of the heartbeat interval, in milliseconds. A value of
- * UINT32_MAX (4294967295), when modifying the parameter, specifies
- * that a heartbeat should be sent immediately to the peer address,
- * and the current interval should remain unchanged.
+ /* Heartbeat demand can only be sent on a transport or
+ * association, but not a socket.
*/
- if (0xffffffff == params.spp_hbinterval) {
- error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
- if (error)
- return error;
- } else {
- /* The value of the heartbeat interval, in milliseconds. A value of 0,
- * when modifying the parameter, specifies that the heartbeat on this
- * address should be disabled.
+ if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc)
+ return -EINVAL;
+
+ /* Process parameters. */
+ error = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+ hb_change, pmtud_change,
+ sackdelay_change);
+
+ if (error)
+ return error;
+
+ /* If changes are for association, also apply parameters to each
+ * transport.
*/
- if (params.spp_hbinterval) {
- trans->hb_allowed = 1;
- trans->hb_interval =
- msecs_to_jiffies(params.spp_hbinterval);
- } else
- trans->hb_allowed = 0;
+ if (!trans && asoc) {
+ struct list_head *pos;
+
+ list_for_each(pos, &asoc->peer.transport_addr_list) {
+ trans = list_entry(pos, struct sctp_transport,
+ transports);
+ sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+ hb_change, pmtud_change,
+ sackdelay_change);
+ }
}
- /* spp_pathmaxrxt contains the maximum number of retransmissions
- * before this address shall be considered unreachable.
- */
- if (params.spp_pathmaxrxt)
- trans->max_retrans = params.spp_pathmaxrxt;
+ return 0;
+}
+
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ * This options will get or set the delayed ack timer. The time is set
+ * in milliseconds. If the assoc_id is 0, then this sets or gets the
+ * endpoints default delayed ack timer value. If the assoc_id field is
+ * non-zero, then the set or get effects the specified association.
+ *
+ * struct sctp_assoc_value {
+ * sctp_assoc_t assoc_id;
+ * uint32_t assoc_value;
+ * };
+ *
+ * assoc_id - This parameter, indicates which association the
+ * user is preforming an action upon. Note that if
+ * this field's value is zero then the endpoints
+ * default value is changed (effecting future
+ * associations only).
+ *
+ * assoc_value - This parameter contains the number of milliseconds
+ * that the user is requesting the delayed ACK timer
+ * be set to. Note that this value is defined in
+ * the standard to be between 200 and 500 milliseconds.
+ *
+ * Note: a value of zero will leave the value alone,
+ * but disable SACK delay. A non-zero value will also
+ * enable SACK delay.
+ */
+static int sctp_setsockopt_delayed_ack_time(struct sock *sk,
+ char __user *optval, int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen != sizeof(struct sctp_assoc_value))
+ return - EINVAL;
+
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+
+ /* Validate value parameter. */
+ if (params.assoc_value > 500)
+ return -EINVAL;
+
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (params.assoc_value) {
+ if (asoc) {
+ asoc->sackdelay =
+ msecs_to_jiffies(params.assoc_value);
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_ENABLE;
+ } else {
+ sp->sackdelay = params.assoc_value;
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_ENABLE;
+ }
+ } else {
+ if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_DISABLE;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_DISABLE;
+ }
+ }
+
+ /* If change is for association, also apply to each transport. */
+ if (asoc) {
+ struct list_head *pos;
+
+ list_for_each(pos, &asoc->peer.transport_addr_list) {
+ trans = list_entry(pos, struct sctp_transport,
+ transports);
+ if (params.assoc_value) {
+ trans->sackdelay =
+ msecs_to_jiffies(params.assoc_value);
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_ENABLE;
+ } else {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ SPP_SACKDELAY_DISABLE;
+ }
+ }
+ }
+
return 0;
}
@@ -2334,7 +2606,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl
/* Update the frag_point of the existing associations. */
list_for_each(pos, &(sp->ep->asocs)) {
asoc = list_entry(pos, struct sctp_association, asocs);
- asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
+ asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
}
return 0;
@@ -2491,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
break;
+ case SCTP_DELAYED_ACK_TIME:
+ retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen);
+ break;
+
case SCTP_INITMSG:
retval = sctp_setsockopt_initmsg(sk, optval, optlen);
break;
@@ -2715,8 +2991,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
/* Default Peer Address Parameters. These defaults can
* be modified via SCTP_PEER_ADDR_PARAMS
*/
- sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval);
- sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path;
+ sp->hbinterval = jiffies_to_msecs(sctp_hb_interval);
+ sp->pathmaxrxt = sctp_max_retrans_path;
+ sp->pathmtu = 0; // allow default discovery
+ sp->sackdelay = sctp_sack_timeout;
+ sp->param_flags = SPP_HB_ENABLE |
+ SPP_PMTUD_ENABLE |
+ SPP_SACKDELAY_ENABLE;
/* If enabled no SCTP message fragmentation will be performed.
* Configure through SCTP_DISABLE_FRAGMENTS socket option.
@@ -2865,7 +3146,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
status.sstat_primary.spinfo_cwnd = transport->cwnd;
status.sstat_primary.spinfo_srtt = transport->srtt;
status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
- status.sstat_primary.spinfo_mtu = transport->pmtu;
+ status.sstat_primary.spinfo_mtu = transport->pathmtu;
if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
status.sstat_primary.spinfo_state = SCTP_ACTIVE;
@@ -2924,7 +3205,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
pinfo.spinfo_cwnd = transport->cwnd;
pinfo.spinfo_srtt = transport->srtt;
pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
- pinfo.spinfo_mtu = transport->pmtu;
+ pinfo.spinfo_mtu = transport->pathmtu;
if (pinfo.spinfo_state == SCTP_UNKNOWN)
pinfo.spinfo_state = SCTP_ACTIVE;
@@ -3086,69 +3367,227 @@ out:
* address's parameters:
*
* struct sctp_paddrparams {
- * sctp_assoc_t spp_assoc_id;
- * struct sockaddr_storage spp_address;
- * uint32_t spp_hbinterval;
- * uint16_t spp_pathmaxrxt;
- * };
- *
- * spp_assoc_id - (UDP style socket) This is filled in the application,
- * and identifies the association for this query.
+ * sctp_assoc_t spp_assoc_id;
+ * struct sockaddr_storage spp_address;
+ * uint32_t spp_hbinterval;
+ * uint16_t spp_pathmaxrxt;
+ * uint32_t spp_pathmtu;
+ * uint32_t spp_sackdelay;
+ * uint32_t spp_flags;
+ * };
+ *
+ * spp_assoc_id - (one-to-many style socket) This is filled in the
+ * application, and identifies the association for
+ * this query.
* spp_address - This specifies which address is of interest.
* spp_hbinterval - This contains the value of the heartbeat interval,
- * in milliseconds. A value of 0, when modifying the
- * parameter, specifies that the heartbeat on this
- * address should be disabled. A value of UINT32_MAX
- * (4294967295), when modifying the parameter,
- * specifies that a heartbeat should be sent
- * immediately to the peer address, and the current
- * interval should remain unchanged.
+ * in milliseconds. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
* spp_pathmaxrxt - This contains the maximum number of
* retransmissions before this address shall be
- * considered unreachable.
+ * considered unreachable. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmtu - When Path MTU discovery is disabled the value
+ * specified here will be the "fixed" path mtu.
+ * Note that if the spp_address field is empty
+ * then all associations on this address will
+ * have this fixed path mtu set upon them.
+ *
+ * spp_sackdelay - When delayed sack is enabled, this value specifies
+ * the number of milliseconds that sacks will be delayed
+ * for. This value will apply to all addresses of an
+ * association if the spp_address field is empty. Note
+ * also, that if delayed sack is enabled and this
+ * value is set to 0, no change is made to the last
+ * recorded delayed sack timer value.
+ *
+ * spp_flags - These flags are used to control various features
+ * on an association. The flag field may contain
+ * zero or more of the following options.
+ *
+ * SPP_HB_ENABLE - Enable heartbeats on the
+ * specified address. Note that if the address
+ * field is empty all addresses for the association
+ * have heartbeats enabled upon them.
+ *
+ * SPP_HB_DISABLE - Disable heartbeats on the
+ * speicifed address. Note that if the address
+ * field is empty all addresses for the association
+ * will have their heartbeats disabled. Note also
+ * that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ * mutually exclusive, only one of these two should
+ * be specified. Enabling both fields will have
+ * undetermined results.
+ *
+ * SPP_HB_DEMAND - Request a user initiated heartbeat
+ * to be made immediately.
+ *
+ * SPP_PMTUD_ENABLE - This field will enable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected.
+ *
+ * SPP_PMTUD_DISABLE - This field will disable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected. Not also that
+ * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ * exclusive. Enabling both will have undetermined
+ * results.
+ *
+ * SPP_SACKDELAY_ENABLE - Setting this flag turns
+ * on delayed sack. The time specified in spp_sackdelay
+ * is used to specify the sack delay for this address. Note
+ * that if spp_address is empty then all addresses will
+ * enable delayed sack and take on the sack delay
+ * value specified in spp_sackdelay.
+ * SPP_SACKDELAY_DISABLE - Setting this flag turns
+ * off delayed sack. If the spp_address field is blank then
+ * delayed sack is disabled for the entire association. Note
+ * also that this field is mutually exclusive to
+ * SPP_SACKDELAY_ENABLE, setting both will have undefined
+ * results.
*/
static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
+ char __user *optval, int __user *optlen)
{
- struct sctp_paddrparams params;
- struct sctp_transport *trans;
+ struct sctp_paddrparams params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
if (len != sizeof(struct sctp_paddrparams))
return -EINVAL;
+
if (copy_from_user(&params, optval, len))
return -EFAULT;
- /* If no association id is specified retrieve the default value
- * for the endpoint that will be used for all future associations
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
*/
- if (!params.spp_assoc_id &&
- sctp_is_any(( union sctp_addr *)&params.spp_address)) {
- params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval;
- params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt;
-
- goto done;
+ if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+ trans = sctp_addr_id2transport(sk, &params.spp_address,
+ params.spp_assoc_id);
+ if (!trans) {
+ SCTP_DEBUG_PRINTK("Failed no transport\n");
+ return -EINVAL;
+ }
}
- trans = sctp_addr_id2transport(sk, &params.spp_address,
- params.spp_assoc_id);
- if (!trans)
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+ if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
+ SCTP_DEBUG_PRINTK("Failed no association\n");
return -EINVAL;
+ }
- /* The value of the heartbeat interval, in milliseconds. A value of 0,
- * when modifying the parameter, specifies that the heartbeat on this
- * address should be disabled.
- */
- if (!trans->hb_allowed)
- params.spp_hbinterval = 0;
- else
- params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval);
+ if (trans) {
+ /* Fetch transport values. */
+ params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
+ params.spp_pathmtu = trans->pathmtu;
+ params.spp_pathmaxrxt = trans->pathmaxrxt;
+ params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay);
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = trans->param_flags;
+ } else if (asoc) {
+ /* Fetch association values. */
+ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
+ params.spp_pathmtu = asoc->pathmtu;
+ params.spp_pathmaxrxt = asoc->pathmaxrxt;
+ params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay);
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = asoc->param_flags;
+ } else {
+ /* Fetch socket values. */
+ params.spp_hbinterval = sp->hbinterval;
+ params.spp_pathmtu = sp->pathmtu;
+ params.spp_sackdelay = sp->sackdelay;
+ params.spp_pathmaxrxt = sp->pathmaxrxt;
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = sp->param_flags;
+ }
- /* spp_pathmaxrxt contains the maximum number of retransmissions
- * before this address shall be considered unreachable.
- */
- params.spp_pathmaxrxt = trans->max_retrans;
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ * This options will get or set the delayed ack timer. The time is set
+ * in milliseconds. If the assoc_id is 0, then this sets or gets the
+ * endpoints default delayed ack timer value. If the assoc_id field is
+ * non-zero, then the set or get effects the specified association.
+ *
+ * struct sctp_assoc_value {
+ * sctp_assoc_t assoc_id;
+ * uint32_t assoc_value;
+ * };
+ *
+ * assoc_id - This parameter, indicates which association the
+ * user is preforming an action upon. Note that if
+ * this field's value is zero then the endpoints
+ * default value is changed (effecting future
+ * associations only).
+ *
+ * assoc_value - This parameter contains the number of milliseconds
+ * that the user is requesting the delayed ACK timer
+ * be set to. Note that this value is defined in
+ * the standard to be between 200 and 500 milliseconds.
+ *
+ * Note: a value of zero will leave the value alone,
+ * but disable SACK delay. A non-zero value will also
+ * enable SACK delay.
+ */
+static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (len != sizeof(struct sctp_assoc_value))
+ return - EINVAL;
+
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc) {
+ /* Fetch association values. */
+ if (asoc->param_flags & SPP_SACKDELAY_ENABLE)
+ params.assoc_value = jiffies_to_msecs(
+ asoc->sackdelay);
+ else
+ params.assoc_value = 0;
+ } else {
+ /* Fetch socket values. */
+ if (sp->param_flags & SPP_SACKDELAY_ENABLE)
+ params.assoc_value = sp->sackdelay;
+ else
+ params.assoc_value = 0;
+ }
-done:
if (copy_to_user(optval, &params, len))
return -EFAULT;
@@ -4015,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
optlen);
break;
+ case SCTP_DELAYED_ACK_TIME:
+ retval = sctp_getsockopt_delayed_ack_time(sk, len, optval,
+ optlen);
+ break;
case SCTP_INITMSG:
retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
break;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 268ddaf2dc0f..68d73e2dd155 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
peer->init_sent_count = 0;
peer->state = SCTP_ACTIVE;
- peer->hb_allowed = 0;
+ peer->param_flags = SPP_HB_DISABLE |
+ SPP_PMTUD_ENABLE |
+ SPP_SACKDELAY_ENABLE;
+ peer->hbinterval = 0;
/* Initialize the default path max_retrans. */
- peer->max_retrans = sctp_max_retrans_path;
+ peer->pathmaxrxt = sctp_max_retrans_path;
peer->error_count = 0;
INIT_LIST_HEAD(&peer->transmitted);
@@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport)
dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL);
if (dst) {
- transport->pmtu = dst_mtu(dst);
+ transport->pathmtu = dst_mtu(dst);
dst_release(dst);
} else
- transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
+ transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
/* Caches the dst entry and source address for a transport's destination
@@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport,
af->get_saddr(asoc, dst, daddr, &transport->saddr);
transport->dst = dst;
+ if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
+ return;
+ }
if (dst) {
- transport->pmtu = dst_mtu(dst);
+ transport->pathmtu = dst_mtu(dst);
/* Initialize sk->sk_rcv_saddr, if the transport is the
* association's active path for getsockname().
@@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport,
opt->pf->af->to_sk_saddr(&transport->saddr,
asoc->base.sk);
} else
- transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
+ transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
/* Hold a reference to a transport. */
@@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
ssthresh = transport->ssthresh;
pba = transport->partial_bytes_acked;
- pmtu = transport->asoc->pmtu;
+ pmtu = transport->asoc->pathmtu;
if (cwnd <= ssthresh) {
/* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
@@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* partial_bytes_acked = 0
*/
transport->ssthresh = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
- transport->cwnd = transport->asoc->pmtu;
+ 4*transport->asoc->pathmtu);
+ transport->cwnd = transport->asoc->pathmtu;
break;
case SCTP_LOWER_CWND_FAST_RTX:
@@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* partial_bytes_acked = 0
*/
transport->ssthresh = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
+ 4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
break;
@@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
if ((jiffies - transport->last_time_ecne_reduced) >
transport->rtt) {
transport->ssthresh = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
+ 4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
transport->last_time_ecne_reduced = jiffies;
}
@@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
*/
if ((jiffies - transport->last_time_used) > transport->rto)
transport->cwnd = max(transport->cwnd/2,
- 4*transport->asoc->pmtu);
+ 4*transport->asoc->pathmtu);
break;
};
@@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
unsigned long sctp_transport_timeout(struct sctp_transport *t)
{
unsigned long timeout;
- timeout = t->hb_interval + t->rto + sctp_jitter(t->rto);
+ timeout = t->hbinterval + t->rto + sctp_jitter(t->rto);
timeout += jiffies;
return timeout;
}
diff --git a/net/socket.c b/net/socket.c
index 3145103cdf54..06fa217f58a9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb)
kfree(iocb->private);
}
-/*
- * Read data from a socket. ubuf is a user mode pointer. We make sure the user
- * area ubuf...ubuf+size-1 is writable before asking the protocol.
- */
-
-static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
- size_t size, loff_t pos)
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+ int offset, size_t size, loff_t *ppos, int more)
{
- struct sock_iocb *x, siocb;
struct socket *sock;
int flags;
- if (pos != 0)
- return -ESPIPE;
- if (size==0) /* Match SYS5 behaviour */
- return 0;
+ sock = file->private_data;
- if (is_sync_kiocb(iocb))
- x = &siocb;
- else {
- x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
- if (!x)
- return -ENOMEM;
+ flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+ if (more)
+ flags |= MSG_MORE;
+
+ return sock->ops->sendpage(sock, page, offset, size, flags);
+}
+
+static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
+ char __user *ubuf, size_t size, struct sock_iocb *siocb)
+{
+ if (!is_sync_kiocb(iocb)) {
+ siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
+ if (!siocb)
+ return NULL;
iocb->ki_dtor = sock_aio_dtor;
}
- iocb->private = x;
- x->kiocb = iocb;
- sock = iocb->ki_filp->private_data;
- x->async_msg.msg_name = NULL;
- x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
- x->async_msg.msg_control = NULL;
- x->async_msg.msg_controllen = 0;
- x->async_iov.iov_base = ubuf;
- x->async_iov.iov_len = size;
- flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+ siocb->kiocb = iocb;
+ siocb->async_iov.iov_base = ubuf;
+ siocb->async_iov.iov_len = size;
- return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags);
+ iocb->private = siocb;
+ return siocb;
}
+static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
+ struct file *file, struct iovec *iov, unsigned long nr_segs)
+{
+ struct socket *sock = file->private_data;
+ size_t size = 0;
+ int i;
-/*
- * Write data to a socket. We verify that the user area ubuf..ubuf+size-1
- * is readable by the user process.
- */
+ for (i = 0 ; i < nr_segs ; i++)
+ size += iov[i].iov_len;
-static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
- size_t size, loff_t pos)
+ msg->msg_name = NULL;
+ msg->msg_namelen = 0;
+ msg->msg_control = NULL;
+ msg->msg_controllen = 0;
+ msg->msg_iov = (struct iovec *) iov;
+ msg->msg_iovlen = nr_segs;
+ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+
+ return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
+}
+
+static ssize_t sock_readv(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
- struct sock_iocb *x, siocb;
- struct socket *sock;
-
+ struct kiocb iocb;
+ struct sock_iocb siocb;
+ struct msghdr msg;
+ int ret;
+
+ init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
+
+ ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&iocb);
+ return ret;
+}
+
+static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
+ size_t count, loff_t pos)
+{
+ struct sock_iocb siocb, *x;
+
if (pos != 0)
return -ESPIPE;
- if(size==0) /* Match SYS5 behaviour */
+ if (count == 0) /* Match SYS5 behaviour */
return 0;
- if (is_sync_kiocb(iocb))
- x = &siocb;
- else {
- x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
- if (!x)
- return -ENOMEM;
- iocb->ki_dtor = sock_aio_dtor;
- }
- iocb->private = x;
- x->kiocb = iocb;
- sock = iocb->ki_filp->private_data;
-
- x->async_msg.msg_name = NULL;
- x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
- x->async_msg.msg_control = NULL;
- x->async_msg.msg_controllen = 0;
- x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
- if (sock->type == SOCK_SEQPACKET)
- x->async_msg.msg_flags |= MSG_EOR;
- x->async_iov.iov_base = (void __user *)ubuf;
- x->async_iov.iov_len = size;
-
- return __sock_sendmsg(iocb, sock, &x->async_msg, size);
+ x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
+ if (!x)
+ return -ENOMEM;
+ return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
+ &x->async_iov, 1);
}
-static ssize_t sock_sendpage(struct file *file, struct page *page,
- int offset, size_t size, loff_t *ppos, int more)
+static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
+ struct file *file, struct iovec *iov, unsigned long nr_segs)
{
- struct socket *sock;
- int flags;
+ struct socket *sock = file->private_data;
+ size_t size = 0;
+ int i;
- sock = file->private_data;
+ for (i = 0 ; i < nr_segs ; i++)
+ size += iov[i].iov_len;
- flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
- if (more)
- flags |= MSG_MORE;
+ msg->msg_name = NULL;
+ msg->msg_namelen = 0;
+ msg->msg_control = NULL;
+ msg->msg_controllen = 0;
+ msg->msg_iov = (struct iovec *) iov;
+ msg->msg_iovlen = nr_segs;
+ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+ if (sock->type == SOCK_SEQPACKET)
+ msg->msg_flags |= MSG_EOR;
- return sock->ops->sendpage(sock, page, offset, size, flags);
+ return __sock_sendmsg(iocb, sock, msg, size);
}
-static int sock_readv_writev(int type,
- struct file * file, const struct iovec * iov,
- long count, size_t size)
+static ssize_t sock_writev(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
struct msghdr msg;
- struct socket *sock;
+ struct kiocb iocb;
+ struct sock_iocb siocb;
+ int ret;
- sock = file->private_data;
+ init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_iov = (struct iovec *) iov;
- msg.msg_iovlen = count;
- msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+ ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&iocb);
+ return ret;
+}
- /* read() does a VERIFY_WRITE */
- if (type == VERIFY_WRITE)
- return sock_recvmsg(sock, &msg, size, msg.msg_flags);
+static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
+ size_t count, loff_t pos)
+{
+ struct sock_iocb siocb, *x;
- if (sock->type == SOCK_SEQPACKET)
- msg.msg_flags |= MSG_EOR;
+ if (pos != 0)
+ return -ESPIPE;
+ if (count == 0) /* Match SYS5 behaviour */
+ return 0;
- return sock_sendmsg(sock, &msg, size);
-}
+ x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
+ if (!x)
+ return -ENOMEM;
-static ssize_t sock_readv(struct file *file, const struct iovec *vector,
- unsigned long count, loff_t *ppos)
-{
- size_t tot_len = 0;
- int i;
- for (i = 0 ; i < count ; i++)
- tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_WRITE,
- file, vector, count, tot_len);
-}
-
-static ssize_t sock_writev(struct file *file, const struct iovec *vector,
- unsigned long count, loff_t *ppos)
-{
- size_t tot_len = 0;
- int i;
- for (i = 0 ; i < count ; i++)
- tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_READ,
- file, vector, count, tot_len);
+ return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
+ &x->async_iov, 1);
}
@@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
default:
err = sock->ops->ioctl(sock, cmd, arg);
+
+ /*
+ * If this ioctl is unknown try to hand it down
+ * to the NIC driver.
+ */
+ if (err == -ENOIOCTLCMD)
+ err = dev_ioctl(cmd, argp);
break;
}
return err;
@@ -2036,7 +2039,7 @@ int sock_unregister(int family)
return 0;
}
-void __init sock_init(void)
+static int __init sock_init(void)
{
/*
* Initialize sock SLAB cache.
@@ -2044,12 +2047,10 @@ void __init sock_init(void)
sk_init();
-#ifdef SLAB_SKB
/*
* Initialize skbuff SLAB cache
*/
skb_init();
-#endif
/*
* Initialize the protocols module.
@@ -2058,15 +2059,19 @@ void __init sock_init(void)
init_inodecache();
register_filesystem(&sock_fs_type);
sock_mnt = kern_mount(&sock_fs_type);
- /* The real protocol initialization is performed when
- * do_initcalls is run.
+
+ /* The real protocol initialization is performed in later initcalls.
*/
#ifdef CONFIG_NETFILTER
netfilter_init();
#endif
+
+ return 0;
}
+core_initcall(sock_init); /* early initcall */
+
#ifdef CONFIG_PROC_FS
void socket_seq_show(struct seq_file *seq)
{
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c6a51911e71e..d68eba481291 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk)
struct svc_serv *serv = svsk->sk_server;
struct socket *sock = svsk->sk_sock;
struct socket *newsock;
- struct proto_ops *ops;
+ const struct proto_ops *ops;
struct svc_sock *newsvsk;
int err, slen;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index acc73ba8bade..5f6ae79b8b16 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -121,7 +121,7 @@
int sysctl_unix_max_dgram_qlen = 10;
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-DEFINE_RWLOCK(unix_table_lock);
+DEFINE_SPINLOCK(unix_table_lock);
static atomic_t unix_nr_socks = ATOMIC_INIT(0);
#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
@@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
/*
* SMP locking strategy:
- * hash table is protected with rwlock unix_table_lock
+ * hash table is protected with spinlock unix_table_lock
* each socket state is protected by separate rwlock.
*/
@@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
static inline void unix_remove_socket(struct sock *sk)
{
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
__unix_remove_socket(sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
__unix_insert_socket(list, sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
@@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
{
struct sock *s;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
s = __unix_find_socket_byname(sunname, len, type, hash);
if (s)
sock_hold(s);
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
return s;
}
@@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
struct sock *s;
struct hlist_node *node;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
sk_for_each(s, node,
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->dentry;
@@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
}
s = NULL;
found:
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
return s;
}
@@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *,
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
struct msghdr *, size_t);
-static struct proto_ops unix_stream_ops = {
+static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
@@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops unix_dgram_ops = {
+static const struct proto_ops unix_dgram_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
@@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = {
.sendpage = sock_no_sendpage,
};
-static struct proto_ops unix_seqpacket_ops = {
+static const struct proto_ops unix_seqpacket_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
@@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock)
u = unix_sk(sk);
u->dentry = NULL;
u->mnt = NULL;
- rwlock_init(&u->lock);
+ spin_lock_init(&u->lock);
atomic_set(&u->inflight, sock ? 0 : -1);
init_MUTEX(&u->readsem); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
@@ -642,12 +642,12 @@ retry:
addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
ordernum = (ordernum+1)&0xFFFFF;
if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
addr->hash)) {
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
/* Sanity yield. It is unusual case, but yet... */
if (!(ordernum&0xFF))
yield();
@@ -658,7 +658,7 @@ retry:
__unix_remove_socket(sk);
u->addr = addr;
__unix_insert_socket(&unix_socket_table[addr->hash], sk);
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
err = 0;
out: up(&u->readsem);
@@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
addr->hash = UNIX_HASH_SIZE;
}
- write_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
if (!sunaddr->sun_path[0]) {
err = -EADDRINUSE;
@@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
__unix_insert_socket(list, sk);
out_unlock:
- write_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
out_up:
up(&u->readsem);
out:
@@ -1063,10 +1063,12 @@ restart:
/* Set credentials */
sk->sk_peercred = other->sk_peercred;
- sock_hold(newsk);
- unix_peer(sk) = newsk;
sock->state = SS_CONNECTED;
sk->sk_state = TCP_ESTABLISHED;
+ sock_hold(newsk);
+
+ smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */
+ unix_peer(sk) = newsk;
unix_state_wunlock(sk);
@@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
} else {
sunaddr = NULL;
err = -ENOTCONN;
- other = unix_peer_get(sk);
+ other = unix_peer(sk);
if (!other)
goto out_err;
}
@@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
other->sk_data_ready(other, size);
sent+=size;
}
- sock_put(other);
scm_destroy(siocb->scm);
siocb->scm = NULL;
@@ -1491,8 +1492,6 @@ pipe_err:
send_sig(SIGPIPE,current,0);
err = -EPIPE;
out_err:
- if (other)
- sock_put(other);
scm_destroy(siocb->scm);
siocb->scm = NULL;
return sent ? : err;
@@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
default:
- err = dev_ioctl(cmd, (void __user *)arg);
+ err = -ENOIOCTLCMD;
break;
}
return err;
@@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos)
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
}
@@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void unix_seq_stop(struct seq_file *seq, void *v)
{
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6ffc64e1712d..411802bd4d37 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -182,7 +182,7 @@ void unix_gc(void)
if (down_trylock(&unix_gc_sem))
return;
- read_lock(&unix_table_lock);
+ spin_lock(&unix_table_lock);
forall_unix_sockets(i, s)
{
@@ -301,7 +301,7 @@ void unix_gc(void)
}
u->gc_tree = GC_ORPHAN;
}
- read_unlock(&unix_table_lock);
+ spin_unlock(&unix_table_lock);
/*
* Here we are. Hitlist is filled. Die.
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 59fec59b2132..7a43ae4721ed 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -181,7 +181,7 @@ struct wanpipe_opt
#endif
static int sk_count;
-extern struct proto_ops wanpipe_ops;
+extern const struct proto_ops wanpipe_ops;
static unsigned long find_free_critical;
static void wanpipe_unlink_driver(struct sock *sk);
@@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar
#endif
default:
- return dev_ioctl(cmd,(void __user *) arg);
+ return -ENOIOCTLCMD;
}
/*NOTREACHED*/
}
@@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr
return 0;
}
-struct proto_ops wanpipe_ops = {
+const struct proto_ops wanpipe_ops = {
.family = PF_WANPIPE,
.owner = THIS_MODULE,
.release = wanpipe_release,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 020d73cc8414..16459c7f54b2 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2;
HLIST_HEAD(x25_list);
DEFINE_RWLOCK(x25_list_lock);
-static struct proto_ops x25_proto_ops;
+static const struct proto_ops x25_proto_ops;
static struct x25_address null_x25_address = {" "};
@@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
default:
- rc = dev_ioctl(cmd, argp);
+ rc = -ENOIOCTLCMD;
break;
}
@@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = {
.owner = THIS_MODULE,
};
-static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
.family = AF_X25,
.owner = THIS_MODULE,
.release = x25_release,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d19e274b9c4a..64a447375fdb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -10,7 +10,7 @@
* YOSHIFUJI Hideaki
* Split up af-specific portion
* Derek Atkins <derek@ihtfp.com> Add the post_input processor
- *
+ *
*/
#include <asm/bug.h>
@@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy)
if (del_timer(&policy->timer))
BUG();
+ security_xfrm_policy_free(policy);
kfree(policy);
}
EXPORT_SYMBOL(__xfrm_policy_destroy);
@@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
write_lock_bh(&xfrm_policy_lock);
for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
- if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
+ if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
+ xfrm_sec_ctx_match(pol->security, policy->security)) {
if (excl) {
write_unlock_bh(&xfrm_policy_lock);
return -EEXIST;
@@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
- int delete)
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx, int delete)
{
struct xfrm_policy *pol, **p;
write_lock_bh(&xfrm_policy_lock);
for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
- if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
+ if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
+ (xfrm_sec_ctx_match(ctx, pol->security))) {
xfrm_pol_hold(pol);
if (delete)
*p = pol->next;
@@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
}
return pol;
}
-EXPORT_SYMBOL(xfrm_policy_bysel);
+EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
{
@@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
/* Find policy to apply to this flow. */
-static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
+static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
void **objp, atomic_t **obj_refp)
{
struct xfrm_policy *pol;
@@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
continue;
match = xfrm_selector_match(sel, fl, family);
+
if (match) {
- xfrm_pol_hold(pol);
- break;
+ if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
+ xfrm_pol_hold(pol);
+ break;
+ }
}
}
read_unlock_bh(&xfrm_policy_lock);
@@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
*obj_refp = &pol->refcnt;
}
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
+static inline int policy_to_flow_dir(int dir)
+{
+ if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ return dir;
+ switch (dir) {
+ default:
+ case XFRM_POLICY_IN:
+ return FLOW_DIR_IN;
+ case XFRM_POLICY_OUT:
+ return FLOW_DIR_OUT;
+ case XFRM_POLICY_FWD:
+ return FLOW_DIR_FWD;
+ };
+}
+
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
{
struct xfrm_policy *pol;
read_lock_bh(&xfrm_policy_lock);
if ((pol = sk->sk_policy[dir]) != NULL) {
- int match = xfrm_selector_match(&pol->selector, fl,
+ int match = xfrm_selector_match(&pol->selector, fl,
sk->sk_family);
+ int err = 0;
+
if (match)
+ err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
+
+ if (match && !err)
xfrm_pol_hold(pol);
else
pol = NULL;
@@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
if (newp) {
newp->selector = old->selector;
+ if (security_xfrm_policy_clone(old, newp)) {
+ kfree(newp);
+ return NULL; /* ENOMEM */
+ }
newp->lft = old->lft;
newp->curlft = old->curlft;
newp->action = old->action;
@@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
return err;
}
-static inline int policy_to_flow_dir(int dir)
-{
- if (XFRM_POLICY_IN == FLOW_DIR_IN &&
- XFRM_POLICY_OUT == FLOW_DIR_OUT &&
- XFRM_POLICY_FWD == FLOW_DIR_FWD)
- return dir;
- switch (dir) {
- default:
- case XFRM_POLICY_IN:
- return FLOW_DIR_IN;
- case XFRM_POLICY_OUT:
- return FLOW_DIR_OUT;
- case XFRM_POLICY_FWD:
- return FLOW_DIR_FWD;
- };
-}
static int stale_bundle(struct dst_entry *dst);
@@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
int err;
u32 genid;
u16 family = dst_orig->ops->family;
+ u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+ u32 sk_sid = security_sk_sid(sk, fl, dir);
restart:
genid = atomic_read(&flow_cache_genid);
policy = NULL;
if (sk && sk->sk_policy[1])
- policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+ policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
if (!policy) {
/* To accelerate a bit... */
if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
return 0;
- policy = flow_cache_lookup(fl, family,
- policy_to_flow_dir(XFRM_POLICY_OUT),
+ policy = flow_cache_lookup(fl, sk_sid, family, dir,
xfrm_policy_lookup);
}
@@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
{
struct xfrm_policy *pol;
struct flowi fl;
+ u8 fl_dir = policy_to_flow_dir(dir);
+ u32 sk_sid;
if (_decode_session(skb, &fl, family) < 0)
return 0;
+ sk_sid = security_sk_sid(sk, &fl, fl_dir);
+
/* First, check used SA against their selectors. */
if (skb->sp) {
int i;
for (i=skb->sp->len-1; i>=0; i--) {
- struct sec_decap_state *xvec = &(skb->sp->x[i]);
+ struct sec_decap_state *xvec = &(skb->sp->x[i]);
if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
return 0;
@@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
pol = NULL;
if (sk && sk->sk_policy[dir])
- pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+ pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
if (!pol)
- pol = flow_cache_lookup(&fl, family,
- policy_to_flow_dir(dir),
+ pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
xfrm_policy_lookup);
if (!pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 479effc97666..e12d0be5f976 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -10,7 +10,7 @@
* Split up af-specific functions
* Derek Atkins <derek@ihtfp.com>
* Add UDP Encapsulation
- *
+ *
*/
#include <linux/workqueue.h>
@@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
x->type->destructor(x);
xfrm_put_type(x->type);
}
+ security_xfrm_state_free(x);
kfree(x);
}
@@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
selector.
*/
if (x->km.state == XFRM_STATE_VALID) {
- if (!xfrm_selector_match(&x->sel, fl, family))
+ if (!xfrm_selector_match(&x->sel, fl, family) ||
+ !xfrm_sec_ctx_match(pol->security, x->security))
continue;
if (!best ||
best->km.dying > x->km.dying ||
@@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
acquire_in_progress = 1;
} else if (x->km.state == XFRM_STATE_ERROR ||
x->km.state == XFRM_STATE_EXPIRED) {
- if (xfrm_selector_match(&x->sel, fl, family))
+ if (xfrm_selector_match(&x->sel, fl, family) &&
+ xfrm_sec_ctx_match(pol->security, x->security))
error = -ESRCH;
}
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0cdd9a07e043..92e2b804c606 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -7,7 +7,7 @@
* Kazunori MIYAZAWA @USAGI
* Kunihiro Ishiguro <kunihiro@ipinfusion.com>
* IPv6 support
- *
+ *
*/
#include <linux/module.h>
@@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
return 0;
}
+
+static inline int verify_sec_ctx_len(struct rtattr **xfrma)
+{
+ struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1];
+ struct xfrm_user_sec_ctx *uctx;
+ int len = 0;
+
+ if (!rt)
+ return 0;
+
+ if (rt->rta_len < sizeof(*uctx))
+ return -EINVAL;
+
+ uctx = RTA_DATA(rt);
+
+ if (uctx->ctx_len > PAGE_SIZE)
+ return -EINVAL;
+
+ len += sizeof(struct xfrm_user_sec_ctx);
+ len += uctx->ctx_len;
+
+ if (uctx->len != len)
+ return -EINVAL;
+
+ return 0;
+}
+
+
static int verify_newsa_info(struct xfrm_usersa_info *p,
struct rtattr **xfrma)
{
@@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
goto out;
if ((err = verify_encap_tmpl(xfrma)))
goto out;
+ if ((err = verify_sec_ctx_len(xfrma)))
+ goto out;
err = -EINVAL;
switch (p->mode) {
@@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a
return 0;
}
+
+static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp)
+{
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+ int len = 0;
+
+ if (xfrm_ctx) {
+ len += sizeof(struct xfrm_user_sec_ctx);
+ len += xfrm_ctx->ctx_len;
+ }
+ return len;
+}
+
+static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
+{
+ struct xfrm_user_sec_ctx *uctx;
+
+ if (!u_arg)
+ return 0;
+
+ uctx = RTA_DATA(u_arg);
+ return security_xfrm_state_alloc(x, uctx);
+}
+
static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
{
memcpy(&x->id, &p->id, sizeof(x->id));
@@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
if (err)
goto error;
+ if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1])))
+ goto error;
+
x->km.seq = p->seq;
return x;
@@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
int err;
struct km_event c;
- err = verify_newsa_info(p, (struct rtattr **) xfrma);
+ err = verify_newsa_info(p, (struct rtattr **)xfrma);
if (err)
return err;
- x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err);
+ x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err);
if (!x)
return err;
@@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
if (x->encap)
RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+ if (x->security) {
+ int ctx_size = sizeof(struct xfrm_sec_ctx) +
+ x->security->ctx_len;
+ struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+ struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+ uctx->exttype = XFRMA_SEC_CTX;
+ uctx->len = ctx_size;
+ uctx->ctx_doi = x->security->ctx_doi;
+ uctx->ctx_alg = x->security->ctx_alg;
+ uctx->ctx_len = x->security->ctx_len;
+ memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
+ }
nlh->nlmsg_len = skb->tail - b;
out:
sp->this_idx++;
@@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
return verify_policy_dir(p->dir);
}
+static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma)
+{
+ struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1];
+ struct xfrm_user_sec_ctx *uctx;
+
+ if (!rt)
+ return 0;
+
+ uctx = RTA_DATA(rt);
+ return security_xfrm_policy_alloc(pol, uctx);
+}
+
static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
int nr)
{
@@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
}
copy_from_user_policy(xp, p);
- err = copy_from_user_tmpl(xp, xfrma);
+
+ if (!(err = copy_from_user_tmpl(xp, xfrma)))
+ err = copy_from_user_sec_ctx(xp, xfrma);
+
if (err) {
*errp = err;
kfree(xp);
@@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
err = verify_newpolicy_info(p);
if (err)
return err;
+ err = verify_sec_ctx_len((struct rtattr **)xfrma);
+ if (err)
+ return err;
- xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err);
+ xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err);
if (!xp)
return err;
@@ -761,6 +849,27 @@ rtattr_failure:
return -1;
}
+static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+ if (xp->security) {
+ int ctx_size = sizeof(struct xfrm_sec_ctx) +
+ xp->security->ctx_len;
+ struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+ struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+ uctx->exttype = XFRMA_SEC_CTX;
+ uctx->len = ctx_size;
+ uctx->ctx_doi = xp->security->ctx_doi;
+ uctx->ctx_alg = xp->security->ctx_alg;
+ uctx->ctx_len = xp->security->ctx_len;
+ memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len);
+ }
+ return 0;
+
+ rtattr_failure:
+ return -1;
+}
+
static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
{
struct xfrm_dump_info *sp = ptr;
@@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
copy_to_user_policy(xp, p, dir);
if (copy_to_user_tmpl(xp, skb) < 0)
goto nlmsg_failure;
+ if (copy_to_user_sec_ctx(xp, skb))
+ goto nlmsg_failure;
nlh->nlmsg_len = skb->tail - b;
out:
@@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
if (p->index)
xp = xfrm_policy_byid(p->dir, p->index, delete);
- else
- xp = xfrm_policy_bysel(p->dir, &p->sel, delete);
+ else {
+ struct rtattr **rtattrs = (struct rtattr **)xfrma;
+ struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
+ struct xfrm_policy tmp;
+
+ err = verify_sec_ctx_len(rtattrs);
+ if (err)
+ return err;
+
+ memset(&tmp, 0, sizeof(struct xfrm_policy));
+ if (rt) {
+ struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+ if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
+ return err;
+ }
+ xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
+ security_xfrm_policy_free(&tmp);
+ }
if (xp == NULL)
return -ENOENT;
@@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
if (copy_to_user_tmpl(xp, skb) < 0)
goto nlmsg_failure;
+ if (copy_to_user_sec_ctx(xp, skb))
+ goto nlmsg_failure;
nlh->nlmsg_len = skb->tail - b;
return skb->len;
@@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
+ len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
skb = alloc_skb(len, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
copy_to_user_policy(xp, &upe->pol, dir);
if (copy_to_user_tmpl(xp, skb) < 0)
goto nlmsg_failure;
+ if (copy_to_user_sec_ctx(xp, skb))
+ goto nlmsg_failure;
upe->hard = !!hard;
nlh->nlmsg_len = skb->tail - b;
@@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
+ len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
skb = alloc_skb(len, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;