aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/garp.c4
-rw-r--r--net/8021q/vlan.c14
-rw-r--r--net/9p/trans_virtio.c2
-rw-r--r--net/Kconfig1
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/lec.h2
-rw-r--r--net/ax25/af_ax25.c1
-rw-r--r--net/batman-adv/Kconfig14
-rw-r--r--net/batman-adv/Makefile3
-rw-r--r--net/batman-adv/bat_iv_ogm.c11
-rw-r--r--net/batman-adv/debugfs.c18
-rw-r--r--net/batman-adv/distributed-arp-table.c22
-rw-r--r--net/batman-adv/gateway_client.c2
-rw-r--r--net/batman-adv/hard-interface.c66
-rw-r--r--net/batman-adv/hard-interface.h13
-rw-r--r--net/batman-adv/main.c10
-rw-r--r--net/batman-adv/main.h15
-rw-r--r--net/batman-adv/network-coding.c1821
-rw-r--r--net/batman-adv/network-coding.h123
-rw-r--r--net/batman-adv/originator.c10
-rw-r--r--net/batman-adv/packet.h33
-rw-r--r--net/batman-adv/routing.c49
-rw-r--r--net/batman-adv/send.c5
-rw-r--r--net/batman-adv/soft-interface.c281
-rw-r--r--net/batman-adv/soft-interface.h3
-rw-r--r--net/batman-adv/sysfs.c22
-rw-r--r--net/batman-adv/translation-table.c29
-rw-r--r--net/batman-adv/types.h136
-rw-r--r--net/batman-adv/unicast.c6
-rw-r--r--net/batman-adv/vis.c4
-rw-r--r--net/bluetooth/af_bluetooth.c7
-rw-r--r--net/bluetooth/bnep/netdev.c2
-rw-r--r--net/bluetooth/rfcomm/sock.c1
-rw-r--r--net/bluetooth/sco.c1
-rw-r--r--net/bridge/br_device.c2
-rw-r--r--net/bridge/br_fdb.c16
-rw-r--r--net/bridge/br_if.c1
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_mdb.c8
-rw-r--r--net/bridge/br_multicast.c5
-rw-r--r--net/bridge/br_netlink.c23
-rw-r--r--net/bridge/br_private.h4
-rw-r--r--net/bridge/netfilter/ebt_log.c44
-rw-r--r--net/bridge/netfilter/ebt_nflog.c5
-rw-r--r--net/bridge/netfilter/ebt_ulog.c132
-rw-r--r--net/bridge/netfilter/ebtable_broute.c4
-rw-r--r--net/bridge/netfilter/ebtables.c2
-rw-r--r--net/caif/caif_dev.c9
-rw-r--r--net/caif/caif_socket.c24
-rw-r--r--net/caif/caif_usb.c4
-rw-r--r--net/caif/cfcnfg.c19
-rw-r--r--net/caif/cfctrl.c14
-rw-r--r--net/caif/cffrml.c4
-rw-r--r--net/caif/cfmuxl.c4
-rw-r--r--net/caif/cfpkt_skbuff.c8
-rw-r--r--net/caif/cfrfml.c4
-rw-r--r--net/caif/cfserl.c4
-rw-r--r--net/caif/cfsrvl.c13
-rw-r--r--net/caif/chnl_net.c6
-rw-r--r--net/can/af_can.c30
-rw-r--r--net/can/gw.c5
-rw-r--r--net/ceph/osdmap.c42
-rw-r--r--net/core/datagram.c4
-rw-r--r--net/core/dev.c92
-rw-r--r--net/core/dev_addr_lists.c6
-rw-r--r--net/core/dst.c9
-rw-r--r--net/core/ethtool.c1
-rw-r--r--net/core/fib_rules.c4
-rw-r--r--net/core/filter.c5
-rw-r--r--net/core/flow.c44
-rw-r--r--net/core/flow_dissector.c70
-rw-r--r--net/core/neighbour.c6
-rw-r--r--net/core/net-procfs.c2
-rw-r--r--net/core/netpoll.c3
-rw-r--r--net/core/rtnetlink.c181
-rw-r--r--net/core/scm.c24
-rw-r--r--net/core/skbuff.c50
-rw-r--r--net/core/sock.c22
-rw-r--r--net/core/utils.c5
-rw-r--r--net/dcb/dcbevent.c1
-rw-r--r--net/dcb/dcbnl.c10
-rw-r--r--net/dccp/ipv4.c5
-rw-r--r--net/dccp/ipv6.c5
-rw-r--r--net/decnet/dn_dev.c4
-rw-r--r--net/decnet/dn_fib.c203
-rw-r--r--net/decnet/dn_route.c43
-rw-r--r--net/decnet/dn_table.c45
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c12
-rw-r--r--net/dsa/dsa.c233
-rw-r--r--net/ethernet/eth.c2
-rw-r--r--net/ieee802154/6lowpan.c142
-rw-r--r--net/ieee802154/6lowpan.h9
-rw-r--r--net/ieee802154/dgram.c10
-rw-r--r--net/ieee802154/netlink.c8
-rw-r--r--net/ieee802154/nl-mac.c25
-rw-r--r--net/ipv4/Kconfig7
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c13
-rw-r--r--net/ipv4/arp.c27
-rw-r--r--net/ipv4/devinet.c89
-rw-r--r--net/ipv4/fib_frontend.c10
-rw-r--r--net/ipv4/gre.c5
-rw-r--r--net/ipv4/inet_connection_sock.c3
-rw-r--r--net/ipv4/inet_diag.c4
-rw-r--r--net/ipv4/inet_fragment.c104
-rw-r--r--net/ipv4/inet_lro.c5
-rw-r--r--net/ipv4/ip_fragment.c42
-rw-r--r--net/ipv4/ip_gre.c1520
-rw-r--r--net/ipv4/ip_options.c5
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ip_tunnel.c1035
-rw-r--r--net/ipv4/ip_vti.c42
-rw-r--r--net/ipv4/ipconfig.c16
-rw-r--r--net/ipv4/ipip.c748
-rw-r--r--net/ipv4/ipmr.c12
-rw-r--r--net/ipv4/netfilter/Kconfig13
-rw-r--r--net/ipv4/netfilter/arptable_filter.c4
-rw-r--r--net/ipv4/netfilter/ip_tables.c9
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c133
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c8
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/syncookies.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c18
-rw-r--r--net/ipv4/tcp.c270
-rw-r--r--net/ipv4/tcp_input.c613
-rw-r--r--net/ipv4/tcp_ipv4.c122
-rw-r--r--net/ipv4/tcp_memcontrol.c2
-rw-r--r--net/ipv4/tcp_minisocks.c44
-rw-r--r--net/ipv4/tcp_output.c375
-rw-r--r--net/ipv4/tcp_timer.c21
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/udp.c122
-rw-r--r--net/ipv4/udp_diag.c6
-rw-r--r--net/ipv6/Kconfig2
-rw-r--r--net/ipv6/addrconf.c261
-rw-r--r--net/ipv6/addrlabel.c12
-rw-r--r--net/ipv6/af_inet6.c7
-rw-r--r--net/ipv6/datagram.c20
-rw-r--r--net/ipv6/icmp.c2
-rw-r--r--net/ipv6/inet6_connection_sock.c10
-rw-r--r--net/ipv6/ip6_flowlabel.c11
-rw-r--r--net/ipv6/ip6_gre.c62
-rw-r--r--net/ipv6/ip6_input.c15
-rw-r--r--net/ipv6/ip6_offload.c4
-rw-r--r--net/ipv6/ip6_tunnel.c16
-rw-r--r--net/ipv6/ip6mr.c10
-rw-r--r--net/ipv6/ndisc.c2
-rw-r--r--net/ipv6/netfilter/ip6_tables.c3
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c15
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c7
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c34
-rw-r--r--net/ipv6/raw.c9
-rw-r--r--net/ipv6/reassembly.c31
-rw-r--r--net/ipv6/route.c6
-rw-r--r--net/ipv6/sit.c41
-rw-r--r--net/ipv6/syncookies.c3
-rw-r--r--net/ipv6/tcp_ipv6.c64
-rw-r--r--net/ipv6/udp.c21
-rw-r--r--net/ipv6/udp_offload.c8
-rw-r--r--net/irda/af_irda.c13
-rw-r--r--net/irda/ircomm/ircomm_core.c2
-rw-r--r--net/iucv/af_iucv.c5
-rw-r--r--net/key/af_key.c9
-rw-r--r--net/l2tp/l2tp_core.c206
-rw-r--r--net/l2tp/l2tp_core.h22
-rw-r--r--net/l2tp/l2tp_debugfs.c28
-rw-r--r--net/l2tp/l2tp_ip.c6
-rw-r--r--net/l2tp/l2tp_ip6.c8
-rw-r--r--net/l2tp/l2tp_netlink.c72
-rw-r--r--net/l2tp/l2tp_ppp.c111
-rw-r--r--net/llc/af_llc.c2
-rw-r--r--net/mac80211/tx.c2
-rw-r--r--net/mac802154/mac802154.h3
-rw-r--r--net/mac802154/mac_cmd.c1
-rw-r--r--net/mac802154/mib.c21
-rw-r--r--net/mac802154/tx.c29
-rw-r--r--net/mac802154/wpan.c4
-rw-r--r--net/netfilter/core.c29
-rw-r--r--net/netfilter/ipset/ip_set_core.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c31
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c306
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c95
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c656
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c86
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c115
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c190
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c55
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c1
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c52
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c40
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c33
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c64
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c63
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c86
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c35
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c176
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1050
-rw-r--r--net/netfilter/nf_conntrack_core.c47
-rw-r--r--net/netfilter/nf_conntrack_helper.c13
-rw-r--r--net/netfilter/nf_conntrack_netlink.c100
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c21
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c12
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c12
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c18
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c6
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c20
-rw-r--r--net/netfilter/nf_conntrack_standalone.c17
-rw-r--r--net/netfilter/nf_log.c206
-rw-r--r--net/netfilter/nfnetlink.c14
-rw-r--r--net/netfilter/nfnetlink_acct.c2
-rw-r--r--net/netfilter/nfnetlink_log.c186
-rw-r--r--net/netfilter/nfnetlink_queue_core.c273
-rw-r--r--net/netfilter/xt_AUDIT.c3
-rw-r--r--net/netfilter/xt_LOG.c52
-rw-r--r--net/netfilter/xt_NFQUEUE.c63
-rw-r--r--net/netfilter/xt_osf.c6
-rw-r--r--net/netlabel/netlabel_unlabeled.c27
-rw-r--r--net/netlink/Kconfig10
-rw-r--r--net/netlink/Makefile3
-rw-r--r--net/netlink/af_netlink.c63
-rw-r--r--net/netlink/af_netlink.h62
-rw-r--r--net/netlink/diag.c188
-rw-r--r--net/netlink/genetlink.c1
-rw-r--r--net/netrom/af_netrom.c1
-rw-r--r--net/nfc/llcp/sock.c6
-rw-r--r--net/openvswitch/actions.c4
-rw-r--r--net/openvswitch/datapath.c18
-rw-r--r--net/openvswitch/datapath.h2
-rw-r--r--net/openvswitch/flow.c10
-rw-r--r--net/openvswitch/vport-internal_dev.c13
-rw-r--r--net/openvswitch/vport-netdev.c3
-rw-r--r--net/openvswitch/vport.c3
-rw-r--r--net/openvswitch/vport.h4
-rw-r--r--net/packet/af_packet.c114
-rw-r--r--net/packet/internal.h3
-rw-r--r--net/phonet/pn_netlink.c4
-rw-r--r--net/rds/stats.c1
-rw-r--r--net/rose/af_rose.c1
-rw-r--r--net/sched/act_api.c2
-rw-r--r--net/sched/cls_api.c14
-rw-r--r--net/sched/sch_api.c44
-rw-r--r--net/sched/sch_cbq.c5
-rw-r--r--net/sched/sch_fq_codel.c2
-rw-r--r--net/sched/sch_generic.c2
-rw-r--r--net/sched/sch_htb.c31
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/sm_statefuns.c2
-rw-r--r--net/sctp/socket.c8
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c12
-rw-r--r--net/sunrpc/rpc_pipe.c5
-rw-r--r--net/sunrpc/sched.c9
-rw-r--r--net/sunrpc/xprtsock.c15
-rw-r--r--net/tipc/netlink.c6
-rw-r--r--net/tipc/socket.c7
-rw-r--r--net/unix/af_unix.c29
-rw-r--r--net/vmw_vsock/af_vsock.c8
-rw-r--r--net/vmw_vsock/vmci_transport.c50
-rw-r--r--net/vmw_vsock/vmci_transport.h3
-rw-r--r--net/vmw_vsock/vsock_addr.c10
-rw-r--r--net/vmw_vsock/vsock_addr.h2
-rw-r--r--net/wireless/util.c2
-rw-r--r--net/xfrm/xfrm_policy.c23
-rw-r--r--net/xfrm/xfrm_replay.c66
271 files changed, 9571 insertions, 6734 deletions
diff --git a/net/802/garp.c b/net/802/garp.c
index 8456f5d98b85..5d9630a0eb93 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -609,8 +609,12 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl
/* Delete timer and generate a final TRANSMIT_PDU event to flush out
* all pending messages before the applicant is gone. */
del_timer_sync(&app->join_timer);
+
+ spin_lock_bh(&app->lock);
garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
garp_pdu_queue(app);
+ spin_unlock_bh(&app->lock);
+
garp_queue_xmit(app);
dev_mc_del(dev, appl->proto.group_address);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a18714469bf7..85addcd9372b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -86,13 +86,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
grp = &vlan_info->grp;
- /* Take it out of our own structures, but be sure to interlock with
- * HW accelerating devices or SW vlan input packet processing if
- * VLAN is not 0 (leave it there for 802.1p).
- */
- if (vlan_id)
- vlan_vid_del(real_dev, vlan_id);
-
grp->nr_vlan_devs--;
if (vlan->flags & VLAN_FLAG_MVRP)
@@ -114,6 +107,13 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
vlan_gvrp_uninit_applicant(real_dev);
}
+ /* Take it out of our own structures, but be sure to interlock with
+ * HW accelerating devices or SW vlan input packet processing if
+ * VLAN is not 0 (leave it there for 802.1p).
+ */
+ if (vlan_id)
+ vlan_vid_del(real_dev, vlan_id);
+
/* Get rid of the vlan's reference to real_dev */
dev_put(real_dev);
}
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 74dea377fe5b..de2e950a0a7a 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -655,7 +655,7 @@ static struct p9_trans_module p9_virtio_trans = {
.create = p9_virtio_create,
.close = p9_virtio_close,
.request = p9_virtio_request,
- //.zc_request = p9_virtio_zc_request,
+ .zc_request = p9_virtio_zc_request,
.cancel = p9_virtio_cancel,
/*
* We leave one entry for input and one entry for response
diff --git a/net/Kconfig b/net/Kconfig
index 6f676ab885be..2ddc9046868e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -217,6 +217,7 @@ source "net/dns_resolver/Kconfig"
source "net/batman-adv/Kconfig"
source "net/openvswitch/Kconfig"
source "net/vmw_vsock/Kconfig"
+source "net/netlink/Kconfig"
config RPS
boolean
diff --git a/net/atm/common.c b/net/atm/common.c
index 7b491006eaf4..737bef59ce89 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -531,6 +531,8 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
struct sk_buff *skb;
int copied, error = -EINVAL;
+ msg->msg_namelen = 0;
+
if (sock->state != SS_CONNECTED)
return -ENOTCONN;
diff --git a/net/atm/lec.h b/net/atm/lec.h
index a86aff9a3c04..4149db1b7885 100644
--- a/net/atm/lec.h
+++ b/net/atm/lec.h
@@ -58,7 +58,7 @@ struct lane2_ops {
* field in h_type field. Data follows immediately after header.
* 2. LLC Data frames whose total length, including LLC field and data,
* but not padding required to meet the minimum data frame length,
- * is less than 1536(0x0600) MUST be encoded by placing that length
+ * is less than ETH_P_802_3_MIN MUST be encoded by placing that length
* in the h_type field. The LLC field follows header immediately.
* 3. LLC data frames longer than this maximum MUST be encoded by placing
* the value 0 in the h_type field.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 7b11f8bc5071..e277e38f736b 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1642,6 +1642,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
ax25_address src;
const unsigned char *mac = skb_mac_header(skb);
+ memset(sax, 0, sizeof(struct full_sockaddr_ax25));
ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL,
&digi, NULL, NULL);
sax->sax25_family = AF_AX25;
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 8d8afb134b3a..fa780b76630e 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -36,6 +36,20 @@ config BATMAN_ADV_DAT
mesh networks. If you think that your network does not need
this option you can safely remove it and save some space.
+config BATMAN_ADV_NC
+ bool "Network Coding"
+ depends on BATMAN_ADV
+ default n
+ help
+ This option enables network coding, a mechanism that aims to
+ increase the overall network throughput by fusing multiple
+ packets in one transmission.
+ Note that interfaces controlled by batman-adv must be manually
+ configured to have promiscuous mode enabled in order to make
+ network coding work.
+ If you think that your network does not need this feature you
+ can safely disable it and save some space.
+
config BATMAN_ADV_DEBUG
bool "B.A.T.M.A.N. debugging"
depends on BATMAN_ADV
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index e45e3b4e32e3..acbac2a9c62f 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2007-2012 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2013 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
#
@@ -30,6 +30,7 @@ batman-adv-y += hard-interface.o
batman-adv-y += hash.o
batman-adv-y += icmp_socket.o
batman-adv-y += main.o
+batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o
batman-adv-y += originator.o
batman-adv-y += ring_buffer.o
batman-adv-y += routing.o
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index a0b253ecadaf..071f288b77a8 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -27,6 +27,7 @@
#include "hard-interface.h"
#include "send.h"
#include "bat_algo.h"
+#include "network-coding.h"
static struct batadv_neigh_node *
batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
@@ -1185,6 +1186,10 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr,
if (!orig_neigh_node)
goto out;
+ /* Update nc_nodes of the originator */
+ batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node,
+ batadv_ogm_packet, is_single_hop_neigh);
+
orig_neigh_router = batadv_orig_node_get_router(orig_neigh_node);
/* drop packet if sender is not a direct neighbor and if we
@@ -1288,7 +1293,8 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff;
/* unpack the aggregated packets and process them one by one */
- do {
+ while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len,
+ batadv_ogm_packet->tt_num_changes)) {
tt_buff = packet_buff + buff_pos + BATADV_OGM_HLEN;
batadv_iv_ogm_process(ethhdr, batadv_ogm_packet, tt_buff,
@@ -1299,8 +1305,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
packet_pos = packet_buff + buff_pos;
batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos;
- } while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len,
- batadv_ogm_packet->tt_num_changes));
+ }
kfree_skb(skb);
return NET_RX_SUCCESS;
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 6ae86516db4d..f186a55b23c3 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -32,6 +32,7 @@
#include "icmp_socket.h"
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
+#include "network-coding.h"
static struct dentry *batadv_debugfs;
@@ -310,6 +311,14 @@ struct batadv_debuginfo {
const struct file_operations fops;
};
+#ifdef CONFIG_BATMAN_ADV_NC
+static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+ return single_open(file, batadv_nc_nodes_seq_print_text, net_dev);
+}
+#endif
+
#define BATADV_DEBUGINFO(_name, _mode, _open) \
struct batadv_debuginfo batadv_debuginfo_##_name = { \
.attr = { .name = __stringify(_name), \
@@ -348,6 +357,9 @@ static BATADV_DEBUGINFO(dat_cache, S_IRUGO, batadv_dat_cache_open);
static BATADV_DEBUGINFO(transtable_local, S_IRUGO,
batadv_transtable_local_open);
static BATADV_DEBUGINFO(vis_data, S_IRUGO, batadv_vis_data_open);
+#ifdef CONFIG_BATMAN_ADV_NC
+static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open);
+#endif
static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
&batadv_debuginfo_originators,
@@ -362,6 +374,9 @@ static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
#endif
&batadv_debuginfo_transtable_local,
&batadv_debuginfo_vis_data,
+#ifdef CONFIG_BATMAN_ADV_NC
+ &batadv_debuginfo_nc_nodes,
+#endif
NULL,
};
@@ -431,6 +446,9 @@ int batadv_debugfs_add_meshif(struct net_device *dev)
}
}
+ if (batadv_nc_init_debugfs(bat_priv) < 0)
+ goto rem_attr;
+
return 0;
rem_attr:
debugfs_remove_recursive(bat_priv->debug_dir);
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index d54188a112ea..8e15d966d9b0 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -816,7 +816,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
bool ret = false;
struct batadv_dat_entry *dat_entry = NULL;
struct sk_buff *skb_new;
- struct batadv_hard_iface *primary_if = NULL;
if (!atomic_read(&bat_priv->distributed_arp_table))
goto out;
@@ -838,22 +837,18 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst);
if (dat_entry) {
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
-
skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
- primary_if->soft_iface, ip_dst, hw_src,
+ bat_priv->soft_iface, ip_dst, hw_src,
dat_entry->mac_addr, hw_src);
if (!skb_new)
goto out;
skb_reset_mac_header(skb_new);
skb_new->protocol = eth_type_trans(skb_new,
- primary_if->soft_iface);
+ bat_priv->soft_iface);
bat_priv->stats.rx_packets++;
bat_priv->stats.rx_bytes += skb->len + ETH_HLEN;
- primary_if->soft_iface->last_rx = jiffies;
+ bat_priv->soft_iface->last_rx = jiffies;
netif_rx(skb_new);
batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
@@ -866,8 +861,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
out:
if (dat_entry)
batadv_dat_entry_free_ref(dat_entry);
- if (primary_if)
- batadv_hardif_free_ref(primary_if);
return ret;
}
@@ -887,7 +880,6 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
__be32 ip_src, ip_dst;
uint8_t *hw_src;
struct sk_buff *skb_new;
- struct batadv_hard_iface *primary_if = NULL;
struct batadv_dat_entry *dat_entry = NULL;
bool ret = false;
int err;
@@ -912,12 +904,8 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
if (!dat_entry)
goto out;
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
-
skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
- primary_if->soft_iface, ip_dst, hw_src,
+ bat_priv->soft_iface, ip_dst, hw_src,
dat_entry->mac_addr, hw_src);
if (!skb_new)
@@ -941,8 +929,6 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
out:
if (dat_entry)
batadv_dat_entry_free_ref(dat_entry);
- if (primary_if)
- batadv_hardif_free_ref(primary_if);
if (ret)
kfree_skb(skb);
return ret;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 34f99a46ec1d..f105219f4a4b 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -500,7 +500,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
rcu_read_unlock();
if (gw_count == 0)
- seq_printf(seq, "No gateways in range ...\n");
+ seq_puts(seq, "No gateways in range ...\n");
out:
if (primary_if)
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 368219e026a9..522243aff2f3 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -307,11 +307,35 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
batadv_update_min_mtu(hard_iface->soft_iface);
}
+/**
+ * batadv_master_del_slave - remove hard_iface from the current master interface
+ * @slave: the interface enslaved in another master
+ * @master: the master from which slave has to be removed
+ *
+ * Invoke ndo_del_slave on master passing slave as argument. In this way slave
+ * is free'd and master can correctly change its internal state.
+ * Return 0 on success, a negative value representing the error otherwise
+ */
+static int batadv_master_del_slave(struct batadv_hard_iface *slave,
+ struct net_device *master)
+{
+ int ret;
+
+ if (!master)
+ return 0;
+
+ ret = -EBUSY;
+ if (master->netdev_ops->ndo_del_slave)
+ ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev);
+
+ return ret;
+}
+
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
const char *iface_name)
{
struct batadv_priv *bat_priv;
- struct net_device *soft_iface;
+ struct net_device *soft_iface, *master;
__be16 ethertype = __constant_htons(ETH_P_BATMAN);
int ret;
@@ -321,11 +345,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
if (!atomic_inc_not_zero(&hard_iface->refcount))
goto out;
- /* hard-interface is part of a bridge */
- if (hard_iface->net_dev->priv_flags & IFF_BRIDGE_PORT)
- pr_err("You are about to enable batman-adv on '%s' which already is part of a bridge. Unless you know exactly what you are doing this is probably wrong and won't work the way you think it would.\n",
- hard_iface->net_dev->name);
-
soft_iface = dev_get_by_name(&init_net, iface_name);
if (!soft_iface) {
@@ -347,12 +366,24 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
goto err_dev;
}
+ /* check if the interface is enslaved in another virtual one and
+ * in that case unlink it first
+ */
+ master = netdev_master_upper_dev_get(hard_iface->net_dev);
+ ret = batadv_master_del_slave(hard_iface, master);
+ if (ret)
+ goto err_dev;
+
hard_iface->soft_iface = soft_iface;
bat_priv = netdev_priv(hard_iface->soft_iface);
+ ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface);
+ if (ret)
+ goto err_dev;
+
ret = bat_priv->bat_algo_ops->bat_iface_enable(hard_iface);
if (ret < 0)
- goto err_dev;
+ goto err_upper;
hard_iface->if_num = bat_priv->num_ifaces;
bat_priv->num_ifaces++;
@@ -362,7 +393,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
bat_priv->num_ifaces--;
hard_iface->if_status = BATADV_IF_NOT_IN_USE;
- goto err_dev;
+ goto err_upper;
}
hard_iface->batman_adv_ptype.type = ethertype;
@@ -401,14 +432,18 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
out:
return 0;
+err_upper:
+ netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface);
err_dev:
+ hard_iface->soft_iface = NULL;
dev_put(soft_iface);
err:
batadv_hardif_free_ref(hard_iface);
return ret;
}
-void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface)
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
+ enum batadv_hard_if_cleanup autodel)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct batadv_hard_iface *primary_if = NULL;
@@ -446,9 +481,10 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface)
dev_put(hard_iface->soft_iface);
/* nobody uses this interface anymore */
- if (!bat_priv->num_ifaces)
- batadv_softif_destroy(hard_iface->soft_iface);
+ if (!bat_priv->num_ifaces && autodel == BATADV_IF_CLEANUP_AUTO)
+ batadv_softif_destroy_sysfs(hard_iface->soft_iface);
+ netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
hard_iface->soft_iface = NULL;
batadv_hardif_free_ref(hard_iface);
@@ -533,7 +569,8 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
/* first deactivate interface */
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
- batadv_hardif_disable_interface(hard_iface);
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_AUTO);
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
return;
@@ -563,6 +600,11 @@ static int batadv_hard_if_event(struct notifier_block *this,
struct batadv_hard_iface *primary_if = NULL;
struct batadv_priv *bat_priv;
+ if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) {
+ batadv_sysfs_add_meshif(net_dev);
+ return NOTIFY_DONE;
+ }
+
hard_iface = batadv_hardif_get_by_netdev(net_dev);
if (!hard_iface && event == NETDEV_REGISTER)
hard_iface = batadv_hardif_add_interface(net_dev);
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 308437d52e22..49892881a7c5 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -29,13 +29,24 @@ enum batadv_hard_if_state {
BATADV_IF_I_WANT_YOU,
};
+/**
+ * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
+ * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
+ * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed
+ */
+enum batadv_hard_if_cleanup {
+ BATADV_IF_CLEANUP_KEEP,
+ BATADV_IF_CLEANUP_AUTO,
+};
+
extern struct notifier_block batadv_hard_if_notifier;
struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
const char *iface_name);
-void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface);
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
+ enum batadv_hard_if_cleanup autodel);
void batadv_hardif_remove_interfaces(void);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 0488d70c8c35..6277735cd89e 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -35,6 +35,7 @@
#include "vis.h"
#include "hash.h"
#include "bat_algo.h"
+#include "network-coding.h"
/* List manipulations on hardif_list have to be rtnl_lock()'ed,
@@ -70,6 +71,7 @@ static int __init batadv_init(void)
batadv_debugfs_init();
register_netdevice_notifier(&batadv_hard_if_notifier);
+ rtnl_link_register(&batadv_link_ops);
pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n",
BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION);
@@ -80,6 +82,7 @@ static int __init batadv_init(void)
static void __exit batadv_exit(void)
{
batadv_debugfs_destroy();
+ rtnl_link_unregister(&batadv_link_ops);
unregister_netdevice_notifier(&batadv_hard_if_notifier);
batadv_hardif_remove_interfaces();
@@ -135,6 +138,10 @@ int batadv_mesh_init(struct net_device *soft_iface)
if (ret < 0)
goto err;
+ ret = batadv_nc_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
atomic_set(&bat_priv->gw.reselect, 0);
atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE);
@@ -157,6 +164,7 @@ void batadv_mesh_free(struct net_device *soft_iface)
batadv_gw_node_purge(bat_priv);
batadv_originator_free(bat_priv);
+ batadv_nc_free(bat_priv);
batadv_tt_free(bat_priv);
@@ -411,7 +419,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
{
struct batadv_algo_ops *bat_algo_ops;
- seq_printf(seq, "Available routing algorithms:\n");
+ seq_puts(seq, "Available routing algorithms:\n");
hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
seq_printf(seq, "%s\n", bat_algo_ops->name);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index ced08b936a96..f90f5bc8e426 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -26,7 +26,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2013.1.0"
+#define BATADV_SOURCE_VERSION "2013.2.0"
#endif
/* B.A.T.M.A.N. parameters */
@@ -105,6 +105,8 @@
#define BATADV_RESET_PROTECTION_MS 30000
#define BATADV_EXPECTED_SEQNO_RANGE 65536
+#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */
+
enum batadv_mesh_state {
BATADV_MESH_INACTIVE,
BATADV_MESH_ACTIVE,
@@ -150,6 +152,7 @@ enum batadv_uev_type {
#include <linux/percpu.h>
#include <linux/slab.h>
#include <net/sock.h> /* struct sock */
+#include <net/rtnetlink.h>
#include <linux/jiffies.h>
#include <linux/seq_file.h>
#include "types.h"
@@ -185,6 +188,7 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr);
* @BATADV_DBG_TT: translation table messages
* @BATADV_DBG_BLA: bridge loop avoidance messages
* @BATADV_DBG_DAT: ARP snooping and DAT related messages
+ * @BATADV_DBG_NC: network coding related messages
* @BATADV_DBG_ALL: the union of all the above log levels
*/
enum batadv_dbg_level {
@@ -193,7 +197,8 @@ enum batadv_dbg_level {
BATADV_DBG_TT = BIT(2),
BATADV_DBG_BLA = BIT(3),
BATADV_DBG_DAT = BIT(4),
- BATADV_DBG_ALL = 31,
+ BATADV_DBG_NC = BIT(5),
+ BATADV_DBG_ALL = 63,
};
#ifdef CONFIG_BATMAN_ADV_DEBUG
@@ -298,4 +303,10 @@ static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv,
return sum;
}
+/* Define a macro to reach the control buffer of the skb. The members of the
+ * control buffer are defined in struct batadv_skb_cb in types.h.
+ * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
+ */
+#define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0]))
+
#endif /* _NET_BATMAN_ADV_MAIN_H_ */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
new file mode 100644
index 000000000000..6b9a54485314
--- /dev/null
+++ b/net/batman-adv/network-coding.c
@@ -0,0 +1,1821 @@
+/* Copyright (C) 2012-2013 B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll, Jeppe Ledet-Pedersen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/debugfs.h>
+
+#include "main.h"
+#include "hash.h"
+#include "network-coding.h"
+#include "send.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "routing.h"
+
+static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
+static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;
+
+static void batadv_nc_worker(struct work_struct *work);
+static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+
+/**
+ * batadv_nc_start_timer - initialise the nc periodic worker
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
+{
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work,
+ msecs_to_jiffies(10));
+}
+
+/**
+ * batadv_nc_init - initialise coding hash table and start house keeping
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+int batadv_nc_init(struct batadv_priv *bat_priv)
+{
+ bat_priv->nc.timestamp_fwd_flush = jiffies;
+ bat_priv->nc.timestamp_sniffed_purge = jiffies;
+
+ if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash)
+ return 0;
+
+ bat_priv->nc.coding_hash = batadv_hash_new(128);
+ if (!bat_priv->nc.coding_hash)
+ goto err;
+
+ batadv_hash_set_lock_class(bat_priv->nc.coding_hash,
+ &batadv_nc_coding_hash_lock_class_key);
+
+ bat_priv->nc.decoding_hash = batadv_hash_new(128);
+ if (!bat_priv->nc.decoding_hash)
+ goto err;
+
+ batadv_hash_set_lock_class(bat_priv->nc.coding_hash,
+ &batadv_nc_decoding_hash_lock_class_key);
+
+ /* Register our packet type */
+ if (batadv_recv_handler_register(BATADV_CODED,
+ batadv_nc_recv_coded_packet) < 0)
+ goto err;
+
+ INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker);
+ batadv_nc_start_timer(bat_priv);
+
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_nc_init_bat_priv - initialise the nc specific bat_priv variables
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
+{
+ atomic_set(&bat_priv->network_coding, 1);
+ bat_priv->nc.min_tq = 200;
+ bat_priv->nc.max_fwd_delay = 10;
+ bat_priv->nc.max_buffer_time = 200;
+}
+
+/**
+ * batadv_nc_init_orig - initialise the nc fields of an orig_node
+ * @orig_node: the orig_node which is going to be initialised
+ */
+void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
+{
+ INIT_LIST_HEAD(&orig_node->in_coding_list);
+ INIT_LIST_HEAD(&orig_node->out_coding_list);
+ spin_lock_init(&orig_node->in_coding_list_lock);
+ spin_lock_init(&orig_node->out_coding_list_lock);
+}
+
+/**
+ * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove
+ * its refcount on the orig_node
+ * @rcu: rcu pointer of the nc node
+ */
+static void batadv_nc_node_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_nc_node *nc_node;
+
+ nc_node = container_of(rcu, struct batadv_nc_node, rcu);
+ batadv_orig_node_free_ref(nc_node->orig_node);
+ kfree(nc_node);
+}
+
+/**
+ * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly
+ * frees it
+ * @nc_node: the nc node to free
+ */
+static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
+{
+ if (atomic_dec_and_test(&nc_node->refcount))
+ call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu);
+}
+
+/**
+ * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly
+ * frees it
+ * @nc_path: the nc node to free
+ */
+static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
+{
+ if (atomic_dec_and_test(&nc_path->refcount))
+ kfree_rcu(nc_path, rcu);
+}
+
+/**
+ * batadv_nc_packet_free - frees nc packet
+ * @nc_packet: the nc packet to free
+ */
+static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
+{
+ if (nc_packet->skb)
+ kfree_skb(nc_packet->skb);
+
+ batadv_nc_path_free_ref(nc_packet->nc_path);
+ kfree(nc_packet);
+}
+
+/**
+ * batadv_nc_to_purge_nc_node - checks whether an nc node has to be purged
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_node: the nc node to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise
+ */
+static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_nc_node *nc_node)
+{
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return true;
+
+ return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT);
+}
+
+/**
+ * batadv_nc_to_purge_nc_path_coding - checks whether an nc path has timed out
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise
+ */
+static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path)
+{
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return true;
+
+ /* purge the path when no packets has been added for 10 times the
+ * max_fwd_delay time
+ */
+ return batadv_has_timed_out(nc_path->last_valid,
+ bat_priv->nc.max_fwd_delay * 10);
+}
+
+/**
+ * batadv_nc_to_purge_nc_path_decoding - checks whether an nc path has timed out
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise
+ */
+static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path)
+{
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return true;
+
+ /* purge the path when no packets has been added for 10 times the
+ * max_buffer time
+ */
+ return batadv_has_timed_out(nc_path->last_valid,
+ bat_priv->nc.max_buffer_time*10);
+}
+
+/**
+ * batadv_nc_purge_orig_nc_nodes - go through list of nc nodes and purge stale
+ * entries
+ * @bat_priv: the bat priv with all the soft interface information
+ * @list: list of nc nodes
+ * @lock: nc node list lock
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the nc node as argument and has to return
+ * a boolean value: true if the entry has to be deleted, false
+ * otherwise
+ */
+static void
+batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
+ struct list_head *list,
+ spinlock_t *lock,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *))
+{
+ struct batadv_nc_node *nc_node, *nc_node_tmp;
+
+ /* For each nc_node in list */
+ spin_lock_bh(lock);
+ list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) {
+ /* if an helper function has been passed as parameter,
+ * ask it if the entry has to be purged or not
+ */
+ if (to_purge && !to_purge(bat_priv, nc_node))
+ continue;
+
+ batadv_dbg(BATADV_DBG_NC, bat_priv,
+ "Removing nc_node %pM -> %pM\n",
+ nc_node->addr, nc_node->orig_node->orig);
+ list_del_rcu(&nc_node->list);
+ batadv_nc_node_free_ref(nc_node);
+ }
+ spin_unlock_bh(lock);
+}
+
+/**
+ * batadv_nc_purge_orig - purges all nc node data attached of the given
+ * originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig_node with the nc node entries to be purged
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the nc node as argument and has to return
+ * a boolean value: true is the entry has to be deleted, false
+ * otherwise
+ */
+void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *))
+{
+ /* Check ingoing nc_node's of this orig_node */
+ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list,
+ &orig_node->in_coding_list_lock,
+ to_purge);
+
+ /* Check outgoing nc_node's of this orig_node */
+ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list,
+ &orig_node->out_coding_list_lock,
+ to_purge);
+}
+
+/**
+ * batadv_nc_purge_orig_hash - traverse entire originator hash to check if they
+ * have timed out nc nodes
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ uint32_t i;
+
+ if (!hash)
+ return;
+
+ /* For each orig_node */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry)
+ batadv_nc_purge_orig(bat_priv, orig_node,
+ batadv_nc_to_purge_nc_node);
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_nc_purge_paths - traverse all nc paths part of the hash and remove
+ * unused ones
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: hash table containing the nc paths to check
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the nc node as argument and has to return
+ * a boolean value: true is the entry has to be deleted, false
+ * otherwise
+ */
+static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_path *))
+{
+ struct hlist_head *head;
+ struct hlist_node *node_tmp;
+ struct batadv_nc_path *nc_path;
+ spinlock_t *lock; /* Protects lists in hash */
+ uint32_t i;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ lock = &hash->list_locks[i];
+
+ /* For each nc_path in this bin */
+ spin_lock_bh(lock);
+ hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) {
+ /* if an helper function has been passed as parameter,
+ * ask it if the entry has to be purged or not
+ */
+ if (to_purge && !to_purge(bat_priv, nc_path))
+ continue;
+
+ /* purging an non-empty nc_path should never happen, but
+ * is observed under high CPU load. Delay the purging
+ * until next iteration to allow the packet_list to be
+ * emptied first.
+ */
+ if (!unlikely(list_empty(&nc_path->packet_list))) {
+ net_ratelimited_function(printk,
+ KERN_WARNING
+ "Skipping free of non-empty nc_path (%pM -> %pM)!\n",
+ nc_path->prev_hop,
+ nc_path->next_hop);
+ continue;
+ }
+
+ /* nc_path is unused, so remove it */
+ batadv_dbg(BATADV_DBG_NC, bat_priv,
+ "Remove nc_path %pM -> %pM\n",
+ nc_path->prev_hop, nc_path->next_hop);
+ hlist_del_rcu(&nc_path->hash_entry);
+ batadv_nc_path_free_ref(nc_path);
+ }
+ spin_unlock_bh(lock);
+ }
+}
+
+/**
+ * batadv_nc_hash_key_gen - computes the nc_path hash key
+ * @key: buffer to hold the final hash key
+ * @src: source ethernet mac address going into the hash key
+ * @dst: destination ethernet mac address going into the hash key
+ */
+static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
+ const char *dst)
+{
+ memcpy(key->prev_hop, src, sizeof(key->prev_hop));
+ memcpy(key->next_hop, dst, sizeof(key->next_hop));
+}
+
+/**
+ * batadv_nc_hash_choose - compute the hash value for an nc path
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Returns the selected index in the hash table for the given data.
+ */
+static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size)
+{
+ const struct batadv_nc_path *nc_path = data;
+ uint32_t hash = 0;
+
+ hash = batadv_hash_bytes(hash, &nc_path->prev_hop,
+ sizeof(nc_path->prev_hop));
+ hash = batadv_hash_bytes(hash, &nc_path->next_hop,
+ sizeof(nc_path->next_hop));
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/**
+ * batadv_nc_hash_compare - comparing function used in the network coding hash
+ * tables
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Returns 1 if the two entry are the same, 0 otherwise
+ */
+static int batadv_nc_hash_compare(const struct hlist_node *node,
+ const void *data2)
+{
+ const struct batadv_nc_path *nc_path1, *nc_path2;
+
+ nc_path1 = container_of(node, struct batadv_nc_path, hash_entry);
+ nc_path2 = data2;
+
+ /* Return 1 if the two keys are identical */
+ if (memcmp(nc_path1->prev_hop, nc_path2->prev_hop,
+ sizeof(nc_path1->prev_hop)) != 0)
+ return 0;
+
+ if (memcmp(nc_path1->next_hop, nc_path2->next_hop,
+ sizeof(nc_path1->next_hop)) != 0)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * batadv_nc_hash_find - search for an existing nc path and return it
+ * @hash: hash table containing the nc path
+ * @data: search key
+ *
+ * Returns the nc_path if found, NULL otherwise.
+ */
+static struct batadv_nc_path *
+batadv_nc_hash_find(struct batadv_hashtable *hash,
+ void *data)
+{
+ struct hlist_head *head;
+ struct batadv_nc_path *nc_path, *nc_path_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = batadv_nc_hash_choose(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
+ if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
+ continue;
+
+ if (!atomic_inc_not_zero(&nc_path->refcount))
+ continue;
+
+ nc_path_tmp = nc_path;
+ break;
+ }
+ rcu_read_unlock();
+
+ return nc_path_tmp;
+}
+
+/**
+ * batadv_nc_send_packet - send non-coded packet and free nc_packet struct
+ * @nc_packet: the nc packet to send
+ */
+static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
+{
+ batadv_send_skb_packet(nc_packet->skb,
+ nc_packet->neigh_node->if_incoming,
+ nc_packet->nc_path->next_hop);
+ nc_packet->skb = NULL;
+ batadv_nc_packet_free(nc_packet);
+}
+
+/**
+ * batadv_nc_sniffed_purge - Checks timestamp of given sniffed nc_packet.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path the packet belongs to
+ * @nc_packet: the nc packet to be checked
+ *
+ * Checks whether the given sniffed (overheard) nc_packet has hit its buffering
+ * timeout. If so, the packet is no longer kept and the entry deleted from the
+ * queue. Has to be called with the appropriate locks.
+ *
+ * Returns false as soon as the entry in the fifo queue has not been timed out
+ * yet and true otherwise.
+ */
+static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path,
+ struct batadv_nc_packet *nc_packet)
+{
+ unsigned long timeout = bat_priv->nc.max_buffer_time;
+ bool res = false;
+
+ /* Packets are added to tail, so the remaining packets did not time
+ * out and we can stop processing the current queue
+ */
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
+ !batadv_has_timed_out(nc_packet->timestamp, timeout))
+ goto out;
+
+ /* purge nc packet */
+ list_del(&nc_packet->list);
+ batadv_nc_packet_free(nc_packet);
+
+ res = true;
+
+out:
+ return res;
+}
+
+/**
+ * batadv_nc_fwd_flush - Checks the timestamp of the given nc packet.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path the packet belongs to
+ * @nc_packet: the nc packet to be checked
+ *
+ * Checks whether the given nc packet has hit its forward timeout. If so, the
+ * packet is no longer delayed, immediately sent and the entry deleted from the
+ * queue. Has to be called with the appropriate locks.
+ *
+ * Returns false as soon as the entry in the fifo queue has not been timed out
+ * yet and true otherwise.
+ */
+static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path,
+ struct batadv_nc_packet *nc_packet)
+{
+ unsigned long timeout = bat_priv->nc.max_fwd_delay;
+
+ /* Packets are added to tail, so the remaining packets did not time
+ * out and we can stop processing the current queue
+ */
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
+ !batadv_has_timed_out(nc_packet->timestamp, timeout))
+ return false;
+
+ /* Send packet */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
+ nc_packet->skb->len + ETH_HLEN);
+ list_del(&nc_packet->list);
+ batadv_nc_send_packet(nc_packet);
+
+ return true;
+}
+
+/**
+ * batadv_nc_process_nc_paths - traverse given nc packet pool and free timed out
+ * nc packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: to be processed hash table
+ * @process_fn: Function called to process given nc packet. Should return true
+ * to encourage this function to proceed with the next packet.
+ * Otherwise the rest of the current queue is skipped.
+ */
+static void
+batadv_nc_process_nc_paths(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ bool (*process_fn)(struct batadv_priv *,
+ struct batadv_nc_path *,
+ struct batadv_nc_packet *))
+{
+ struct hlist_head *head;
+ struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
+ struct batadv_nc_path *nc_path;
+ bool ret;
+ int i;
+
+ if (!hash)
+ return;
+
+ /* Loop hash table bins */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ /* Loop coding paths */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
+ /* Loop packets */
+ spin_lock_bh(&nc_path->packet_list_lock);
+ list_for_each_entry_safe(nc_packet, nc_packet_tmp,
+ &nc_path->packet_list, list) {
+ ret = process_fn(bat_priv, nc_path, nc_packet);
+ if (!ret)
+ break;
+ }
+ spin_unlock_bh(&nc_path->packet_list_lock);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_nc_worker - periodic task for house keeping related to network coding
+ * @work: kernel work struct
+ */
+static void batadv_nc_worker(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_nc *priv_nc;
+ struct batadv_priv *bat_priv;
+ unsigned long timeout;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ priv_nc = container_of(delayed_work, struct batadv_priv_nc, work);
+ bat_priv = container_of(priv_nc, struct batadv_priv, nc);
+
+ batadv_nc_purge_orig_hash(bat_priv);
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash,
+ batadv_nc_to_purge_nc_path_coding);
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash,
+ batadv_nc_to_purge_nc_path_decoding);
+
+ timeout = bat_priv->nc.max_fwd_delay;
+
+ if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) {
+ batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash,
+ batadv_nc_fwd_flush);
+ bat_priv->nc.timestamp_fwd_flush = jiffies;
+ }
+
+ if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge,
+ bat_priv->nc.max_buffer_time)) {
+ batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash,
+ batadv_nc_sniffed_purge);
+ bat_priv->nc.timestamp_sniffed_purge = jiffies;
+ }
+
+ /* Schedule a new check */
+ batadv_nc_start_timer(bat_priv);
+}
+
+/**
+ * batadv_can_nc_with_orig - checks whether the given orig node is suitable for
+ * coding or not
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: neighboring orig node which may be used as nc candidate
+ * @ogm_packet: incoming ogm packet also used for the checks
+ *
+ * Returns true if:
+ * 1) The OGM must have the most recent sequence number.
+ * 2) The TTL must be decremented by one and only one.
+ * 3) The OGM must be received from the first hop from orig_node.
+ * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq.
+ */
+static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_ogm_packet *ogm_packet)
+{
+ if (orig_node->last_real_seqno != ntohl(ogm_packet->seqno))
+ return false;
+ if (orig_node->last_ttl != ogm_packet->header.ttl + 1)
+ return false;
+ if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender))
+ return false;
+ if (ogm_packet->tq < bat_priv->nc.min_tq)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_nc_find_nc_node - search for an existing nc node and return it
+ * @orig_node: orig node originating the ogm packet
+ * @orig_neigh_node: neighboring orig node from which we received the ogm packet
+ * (can be equal to orig_node)
+ * @in_coding: traverse incoming or outgoing network coding list
+ *
+ * Returns the nc_node if found, NULL otherwise.
+ */
+static struct batadv_nc_node
+*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ bool in_coding)
+{
+ struct batadv_nc_node *nc_node, *nc_node_out = NULL;
+ struct list_head *list;
+
+ if (in_coding)
+ list = &orig_neigh_node->in_coding_list;
+ else
+ list = &orig_neigh_node->out_coding_list;
+
+ /* Traverse list of nc_nodes to orig_node */
+ rcu_read_lock();
+ list_for_each_entry_rcu(nc_node, list, list) {
+ if (!batadv_compare_eth(nc_node->addr, orig_node->orig))
+ continue;
+
+ if (!atomic_inc_not_zero(&nc_node->refcount))
+ continue;
+
+ /* Found a match */
+ nc_node_out = nc_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return nc_node_out;
+}
+
+/**
+ * batadv_nc_get_nc_node - retrieves an nc node or creates the entry if it was
+ * not found
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node originating the ogm packet
+ * @orig_neigh_node: neighboring orig node from which we received the ogm packet
+ * (can be equal to orig_node)
+ * @in_coding: traverse incoming or outgoing network coding list
+ *
+ * Returns the nc_node if found or created, NULL in case of an error.
+ */
+static struct batadv_nc_node
+*batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ bool in_coding)
+{
+ struct batadv_nc_node *nc_node;
+ spinlock_t *lock; /* Used to lock list selected by "int in_coding" */
+ struct list_head *list;
+
+ /* Check if nc_node is already added */
+ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding);
+
+ /* Node found */
+ if (nc_node)
+ return nc_node;
+
+ nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC);
+ if (!nc_node)
+ return NULL;
+
+ if (!atomic_inc_not_zero(&orig_neigh_node->refcount))
+ goto free;
+
+ /* Initialize nc_node */
+ INIT_LIST_HEAD(&nc_node->list);
+ memcpy(nc_node->addr, orig_node->orig, ETH_ALEN);
+ nc_node->orig_node = orig_neigh_node;
+ atomic_set(&nc_node->refcount, 2);
+
+ /* Select ingoing or outgoing coding node */
+ if (in_coding) {
+ lock = &orig_neigh_node->in_coding_list_lock;
+ list = &orig_neigh_node->in_coding_list;
+ } else {
+ lock = &orig_neigh_node->out_coding_list_lock;
+ list = &orig_neigh_node->out_coding_list;
+ }
+
+ batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n",
+ nc_node->addr, nc_node->orig_node->orig);
+
+ /* Add nc_node to orig_node */
+ spin_lock_bh(lock);
+ list_add_tail_rcu(&nc_node->list, list);
+ spin_unlock_bh(lock);
+
+ return nc_node;
+
+free:
+ kfree(nc_node);
+ return NULL;
+}
+
+/**
+ * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node structs
+ * (best called on incoming OGMs)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node originating the ogm packet
+ * @orig_neigh_node: neighboring orig node from which we received the ogm packet
+ * (can be equal to orig_node)
+ * @ogm_packet: incoming ogm packet
+ * @is_single_hop_neigh: orig_node is a single hop neighbor
+ */
+void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *ogm_packet,
+ int is_single_hop_neigh)
+{
+ struct batadv_nc_node *in_nc_node = NULL, *out_nc_node = NULL;
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ goto out;
+
+ /* accept ogms from 'good' neighbors and single hop neighbors */
+ if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) &&
+ !is_single_hop_neigh)
+ goto out;
+
+ /* Add orig_node as in_nc_node on hop */
+ in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node,
+ orig_neigh_node, true);
+ if (!in_nc_node)
+ goto out;
+
+ in_nc_node->last_seen = jiffies;
+
+ /* Add hop as out_nc_node on orig_node */
+ out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node,
+ orig_node, false);
+ if (!out_nc_node)
+ goto out;
+
+ out_nc_node->last_seen = jiffies;
+
+out:
+ if (in_nc_node)
+ batadv_nc_node_free_ref(in_nc_node);
+ if (out_nc_node)
+ batadv_nc_node_free_ref(out_nc_node);
+}
+
+/**
+ * batadv_nc_get_path - get existing nc_path or allocate a new one
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: hash table containing the nc path
+ * @src: ethernet source address - first half of the nc path search key
+ * @dst: ethernet destination address - second half of the nc path search key
+ *
+ * Returns pointer to nc_path if the path was found or created, returns NULL
+ * on error.
+ */
+static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ uint8_t *src,
+ uint8_t *dst)
+{
+ int hash_added;
+ struct batadv_nc_path *nc_path, nc_path_key;
+
+ batadv_nc_hash_key_gen(&nc_path_key, src, dst);
+
+ /* Search for existing nc_path */
+ nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key);
+
+ if (nc_path) {
+ /* Set timestamp to delay removal of nc_path */
+ nc_path->last_valid = jiffies;
+ return nc_path;
+ }
+
+ /* No existing nc_path was found; create a new */
+ nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC);
+
+ if (!nc_path)
+ return NULL;
+
+ /* Initialize nc_path */
+ INIT_LIST_HEAD(&nc_path->packet_list);
+ spin_lock_init(&nc_path->packet_list_lock);
+ atomic_set(&nc_path->refcount, 2);
+ nc_path->last_valid = jiffies;
+ memcpy(nc_path->next_hop, dst, ETH_ALEN);
+ memcpy(nc_path->prev_hop, src, ETH_ALEN);
+
+ batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n",
+ nc_path->prev_hop,
+ nc_path->next_hop);
+
+ /* Add nc_path to hash table */
+ hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
+ batadv_nc_hash_choose, &nc_path_key,
+ &nc_path->hash_entry);
+
+ if (hash_added < 0) {
+ kfree(nc_path);
+ return NULL;
+ }
+
+ return nc_path;
+}
+
+/**
+ * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair
+ * selection of a receiver with slightly lower TQ than the other
+ * @tq: to be weighted tq value
+ */
+static uint8_t batadv_nc_random_weight_tq(uint8_t tq)
+{
+ uint8_t rand_val, rand_tq;
+
+ get_random_bytes(&rand_val, sizeof(rand_val));
+
+ /* randomize the estimated packet loss (max TQ - estimated TQ) */
+ rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq);
+
+ /* normalize the randomized packet loss */
+ rand_tq /= BATADV_TQ_MAX_VALUE;
+
+ /* convert to (randomized) estimated tq again */
+ return BATADV_TQ_MAX_VALUE - rand_tq;
+}
+
+/**
+ * batadv_nc_memxor - XOR destination with source
+ * @dst: byte array to XOR into
+ * @src: byte array to XOR from
+ * @len: length of destination array
+ */
+static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; ++i)
+ dst[i] ^= src[i];
+}
+
+/**
+ * batadv_nc_code_packets - code a received unicast_packet with an nc packet
+ * into a coded_packet and send it
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to forward
+ * @ethhdr: pointer to the ethernet header inside the skb
+ * @nc_packet: structure containing the packet to the skb can be coded with
+ * @neigh_node: next hop to forward packet to
+ *
+ * Returns true if both packets are consumed, false otherwise.
+ */
+static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ struct ethhdr *ethhdr,
+ struct batadv_nc_packet *nc_packet,
+ struct batadv_neigh_node *neigh_node)
+{
+ uint8_t tq_weighted_neigh, tq_weighted_coding;
+ struct sk_buff *skb_dest, *skb_src;
+ struct batadv_unicast_packet *packet1;
+ struct batadv_unicast_packet *packet2;
+ struct batadv_coded_packet *coded_packet;
+ struct batadv_neigh_node *neigh_tmp, *router_neigh;
+ struct batadv_neigh_node *router_coding = NULL;
+ uint8_t *first_source, *first_dest, *second_source, *second_dest;
+ __be32 packet_id1, packet_id2;
+ size_t count;
+ bool res = false;
+ int coding_len;
+ int unicast_size = sizeof(*packet1);
+ int coded_size = sizeof(*coded_packet);
+ int header_add = coded_size - unicast_size;
+
+ router_neigh = batadv_orig_node_get_router(neigh_node->orig_node);
+ if (!router_neigh)
+ goto out;
+
+ neigh_tmp = nc_packet->neigh_node;
+ router_coding = batadv_orig_node_get_router(neigh_tmp->orig_node);
+ if (!router_coding)
+ goto out;
+
+ tq_weighted_neigh = batadv_nc_random_weight_tq(router_neigh->tq_avg);
+ tq_weighted_coding = batadv_nc_random_weight_tq(router_coding->tq_avg);
+
+ /* Select one destination for the MAC-header dst-field based on
+ * weighted TQ-values.
+ */
+ if (tq_weighted_neigh >= tq_weighted_coding) {
+ /* Destination from nc_packet is selected for MAC-header */
+ first_dest = nc_packet->nc_path->next_hop;
+ first_source = nc_packet->nc_path->prev_hop;
+ second_dest = neigh_node->addr;
+ second_source = ethhdr->h_source;
+ packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data;
+ packet2 = (struct batadv_unicast_packet *)skb->data;
+ packet_id1 = nc_packet->packet_id;
+ packet_id2 = batadv_skb_crc32(skb,
+ skb->data + sizeof(*packet2));
+ } else {
+ /* Destination for skb is selected for MAC-header */
+ first_dest = neigh_node->addr;
+ first_source = ethhdr->h_source;
+ second_dest = nc_packet->nc_path->next_hop;
+ second_source = nc_packet->nc_path->prev_hop;
+ packet1 = (struct batadv_unicast_packet *)skb->data;
+ packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data;
+ packet_id1 = batadv_skb_crc32(skb,
+ skb->data + sizeof(*packet1));
+ packet_id2 = nc_packet->packet_id;
+ }
+
+ /* Instead of zero padding the smallest data buffer, we
+ * code into the largest.
+ */
+ if (skb->len <= nc_packet->skb->len) {
+ skb_dest = nc_packet->skb;
+ skb_src = skb;
+ } else {
+ skb_dest = skb;
+ skb_src = nc_packet->skb;
+ }
+
+ /* coding_len is used when decoding the packet shorter packet */
+ coding_len = skb_src->len - unicast_size;
+
+ if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0)
+ goto out;
+
+ skb_push(skb_dest, header_add);
+
+ coded_packet = (struct batadv_coded_packet *)skb_dest->data;
+ skb_reset_mac_header(skb_dest);
+
+ coded_packet->header.packet_type = BATADV_CODED;
+ coded_packet->header.version = BATADV_COMPAT_VERSION;
+ coded_packet->header.ttl = packet1->header.ttl;
+
+ /* Info about first unicast packet */
+ memcpy(coded_packet->first_source, first_source, ETH_ALEN);
+ memcpy(coded_packet->first_orig_dest, packet1->dest, ETH_ALEN);
+ coded_packet->first_crc = packet_id1;
+ coded_packet->first_ttvn = packet1->ttvn;
+
+ /* Info about second unicast packet */
+ memcpy(coded_packet->second_dest, second_dest, ETH_ALEN);
+ memcpy(coded_packet->second_source, second_source, ETH_ALEN);
+ memcpy(coded_packet->second_orig_dest, packet2->dest, ETH_ALEN);
+ coded_packet->second_crc = packet_id2;
+ coded_packet->second_ttl = packet2->header.ttl;
+ coded_packet->second_ttvn = packet2->ttvn;
+ coded_packet->coded_len = htons(coding_len);
+
+ /* This is where the magic happens: Code skb_src into skb_dest */
+ batadv_nc_memxor(skb_dest->data + coded_size,
+ skb_src->data + unicast_size, coding_len);
+
+ /* Update counters accordingly */
+ if (BATADV_SKB_CB(skb_src)->decoded &&
+ BATADV_SKB_CB(skb_dest)->decoded) {
+ /* Both packets are recoded */
+ count = skb_src->len + ETH_HLEN;
+ count += skb_dest->len + ETH_HLEN;
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count);
+ } else if (!BATADV_SKB_CB(skb_src)->decoded &&
+ !BATADV_SKB_CB(skb_dest)->decoded) {
+ /* Both packets are newly coded */
+ count = skb_src->len + ETH_HLEN;
+ count += skb_dest->len + ETH_HLEN;
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count);
+ } else if (BATADV_SKB_CB(skb_src)->decoded &&
+ !BATADV_SKB_CB(skb_dest)->decoded) {
+ /* skb_src recoded and skb_dest is newly coded */
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
+ skb_src->len + ETH_HLEN);
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
+ skb_dest->len + ETH_HLEN);
+ } else if (!BATADV_SKB_CB(skb_src)->decoded &&
+ BATADV_SKB_CB(skb_dest)->decoded) {
+ /* skb_src is newly coded and skb_dest is recoded */
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
+ skb_src->len + ETH_HLEN);
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
+ skb_dest->len + ETH_HLEN);
+ }
+
+ /* skb_src is now coded into skb_dest, so free it */
+ kfree_skb(skb_src);
+
+ /* avoid duplicate free of skb from nc_packet */
+ nc_packet->skb = NULL;
+ batadv_nc_packet_free(nc_packet);
+
+ /* Send the coded packet and return true */
+ batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest);
+ res = true;
+out:
+ if (router_neigh)
+ batadv_neigh_node_free_ref(router_neigh);
+ if (router_coding)
+ batadv_neigh_node_free_ref(router_coding);
+ return res;
+}
+
+/**
+ * batadv_nc_skb_coding_possible - true if a decoded skb is available at dst.
+ * @skb: data skb to forward
+ * @dst: destination mac address of the other skb to code with
+ * @src: source mac address of skb
+ *
+ * Whenever we network code a packet we have to check whether we received it in
+ * a network coded form. If so, we may not be able to use it for coding because
+ * some neighbors may also have received (overheard) the packet in the network
+ * coded form without being able to decode it. It is hard to know which of the
+ * neighboring nodes was able to decode the packet, therefore we can only
+ * re-code the packet if the source of the previous encoded packet is involved.
+ * Since the source encoded the packet we can be certain it has all necessary
+ * decode information.
+ *
+ * Returns true if coding of a decoded packet is allowed.
+ */
+static bool batadv_nc_skb_coding_possible(struct sk_buff *skb,
+ uint8_t *dst, uint8_t *src)
+{
+ if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src))
+ return false;
+ else
+ return true;
+}
+
+/**
+ * batadv_nc_path_search - Find the coding path matching in_nc_node and
+ * out_nc_node to retrieve a buffered packet that can be used for coding.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @in_nc_node: pointer to skb next hop's neighbor nc node
+ * @out_nc_node: pointer to skb source's neighbor nc node
+ * @skb: data skb to forward
+ * @eth_dst: next hop mac address of skb
+ *
+ * Returns true if coding of a decoded skb is allowed.
+ */
+static struct batadv_nc_packet *
+batadv_nc_path_search(struct batadv_priv *bat_priv,
+ struct batadv_nc_node *in_nc_node,
+ struct batadv_nc_node *out_nc_node,
+ struct sk_buff *skb,
+ uint8_t *eth_dst)
+{
+ struct batadv_nc_path *nc_path, nc_path_key;
+ struct batadv_nc_packet *nc_packet_out = NULL;
+ struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
+ struct batadv_hashtable *hash = bat_priv->nc.coding_hash;
+ int idx;
+
+ if (!hash)
+ return NULL;
+
+ /* Create almost path key */
+ batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr,
+ out_nc_node->addr);
+ idx = batadv_nc_hash_choose(&nc_path_key, hash->size);
+
+ /* Check for coding opportunities in this nc_path */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) {
+ if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr))
+ continue;
+
+ if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr))
+ continue;
+
+ spin_lock_bh(&nc_path->packet_list_lock);
+ if (list_empty(&nc_path->packet_list)) {
+ spin_unlock_bh(&nc_path->packet_list_lock);
+ continue;
+ }
+
+ list_for_each_entry_safe(nc_packet, nc_packet_tmp,
+ &nc_path->packet_list, list) {
+ if (!batadv_nc_skb_coding_possible(nc_packet->skb,
+ eth_dst,
+ in_nc_node->addr))
+ continue;
+
+ /* Coding opportunity is found! */
+ list_del(&nc_packet->list);
+ nc_packet_out = nc_packet;
+ break;
+ }
+
+ spin_unlock_bh(&nc_path->packet_list_lock);
+ break;
+ }
+ rcu_read_unlock();
+
+ return nc_packet_out;
+}
+
+/**
+ * batadv_nc_skb_src_search - Loops through the list of neighoring nodes of the
+ * skb's sender (may be equal to the originator).
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to forward
+ * @eth_dst: next hop mac address of skb
+ * @eth_src: source mac address of skb
+ * @in_nc_node: pointer to skb next hop's neighbor nc node
+ *
+ * Returns an nc packet if a suitable coding packet was found, NULL otherwise.
+ */
+static struct batadv_nc_packet *
+batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ uint8_t *eth_dst,
+ uint8_t *eth_src,
+ struct batadv_nc_node *in_nc_node)
+{
+ struct batadv_orig_node *orig_node;
+ struct batadv_nc_node *out_nc_node;
+ struct batadv_nc_packet *nc_packet = NULL;
+
+ orig_node = batadv_orig_hash_find(bat_priv, eth_src);
+ if (!orig_node)
+ return NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(out_nc_node,
+ &orig_node->out_coding_list, list) {
+ /* Check if the skb is decoded and if recoding is possible */
+ if (!batadv_nc_skb_coding_possible(skb,
+ out_nc_node->addr, eth_src))
+ continue;
+
+ /* Search for an opportunity in this nc_path */
+ nc_packet = batadv_nc_path_search(bat_priv, in_nc_node,
+ out_nc_node, skb, eth_dst);
+ if (nc_packet)
+ break;
+ }
+ rcu_read_unlock();
+
+ batadv_orig_node_free_ref(orig_node);
+ return nc_packet;
+}
+
+/**
+ * batadv_nc_skb_store_before_coding - set the ethernet src and dst of the
+ * unicast skb before it is stored for use in later decoding
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to store
+ * @eth_dst_new: new destination mac address of skb
+ */
+static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ uint8_t *eth_dst_new)
+{
+ struct ethhdr *ethhdr;
+
+ /* Copy skb header to change the mac header */
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Set the mac header as if we actually sent the packet uncoded */
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ memcpy(ethhdr->h_source, ethhdr->h_dest, ETH_ALEN);
+ memcpy(ethhdr->h_dest, eth_dst_new, ETH_ALEN);
+
+ /* Set data pointer to MAC header to mimic packets from our tx path */
+ skb_push(skb, ETH_HLEN);
+
+ /* Add the packet to the decoding packet pool */
+ batadv_nc_skb_store_for_decoding(bat_priv, skb);
+
+ /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
+ * our ref
+ */
+ kfree_skb(skb);
+}
+
+/**
+ * batadv_nc_skb_dst_search - Loops through list of neighboring nodes to dst.
+ * @skb: data skb to forward
+ * @neigh_node: next hop to forward packet to
+ * @ethhdr: pointer to the ethernet header inside the skb
+ *
+ * Loops through list of neighboring nodes the next hop has a good connection to
+ * (receives OGMs with a sufficient quality). We need to find a neighbor of our
+ * next hop that potentially sent a packet which our next hop also received
+ * (overheard) and has stored for later decoding.
+ *
+ * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ */
+static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node,
+ struct ethhdr *ethhdr)
+{
+ struct net_device *netdev = neigh_node->if_incoming->soft_iface;
+ struct batadv_priv *bat_priv = netdev_priv(netdev);
+ struct batadv_orig_node *orig_node = neigh_node->orig_node;
+ struct batadv_nc_node *nc_node;
+ struct batadv_nc_packet *nc_packet = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) {
+ /* Search for coding opportunity with this in_nc_node */
+ nc_packet = batadv_nc_skb_src_search(bat_priv, skb,
+ neigh_node->addr,
+ ethhdr->h_source, nc_node);
+
+ /* Opportunity was found, so stop searching */
+ if (nc_packet)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!nc_packet)
+ return false;
+
+ /* Save packets for later decoding */
+ batadv_nc_skb_store_before_coding(bat_priv, skb,
+ neigh_node->addr);
+ batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb,
+ nc_packet->neigh_node->addr);
+
+ /* Code and send packets */
+ if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet,
+ neigh_node))
+ return true;
+
+ /* out of mem ? Coding failed - we have to free the buffered packet
+ * to avoid memleaks. The skb passed as argument will be dealt with
+ * by the calling function.
+ */
+ batadv_nc_send_packet(nc_packet);
+ return false;
+}
+
+/**
+ * batadv_nc_skb_add_to_path - buffer skb for later encoding / decoding
+ * @skb: skb to add to path
+ * @nc_path: path to add skb to
+ * @neigh_node: next hop to forward packet to
+ * @packet_id: checksum to identify packet
+ *
+ * Returns true if the packet was buffered or false in case of an error.
+ */
+static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
+ struct batadv_nc_path *nc_path,
+ struct batadv_neigh_node *neigh_node,
+ __be32 packet_id)
+{
+ struct batadv_nc_packet *nc_packet;
+
+ nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC);
+ if (!nc_packet)
+ return false;
+
+ /* Initialize nc_packet */
+ nc_packet->timestamp = jiffies;
+ nc_packet->packet_id = packet_id;
+ nc_packet->skb = skb;
+ nc_packet->neigh_node = neigh_node;
+ nc_packet->nc_path = nc_path;
+
+ /* Add coding packet to list */
+ spin_lock_bh(&nc_path->packet_list_lock);
+ list_add_tail(&nc_packet->list, &nc_path->packet_list);
+ spin_unlock_bh(&nc_path->packet_list_lock);
+
+ return true;
+}
+
+/**
+ * batadv_nc_skb_forward - try to code a packet or add it to the coding packet
+ * buffer
+ * @skb: data skb to forward
+ * @neigh_node: next hop to forward packet to
+ * @ethhdr: pointer to the ethernet header inside the skb
+ *
+ * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ */
+bool batadv_nc_skb_forward(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node,
+ struct ethhdr *ethhdr)
+{
+ const struct net_device *netdev = neigh_node->if_incoming->soft_iface;
+ struct batadv_priv *bat_priv = netdev_priv(netdev);
+ struct batadv_unicast_packet *packet;
+ struct batadv_nc_path *nc_path;
+ __be32 packet_id;
+ u8 *payload;
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ goto out;
+
+ /* We only handle unicast packets */
+ payload = skb_network_header(skb);
+ packet = (struct batadv_unicast_packet *)payload;
+ if (packet->header.packet_type != BATADV_UNICAST)
+ goto out;
+
+ /* Try to find a coding opportunity and send the skb if one is found */
+ if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr))
+ return true;
+
+ /* Find or create a nc_path for this src-dst pair */
+ nc_path = batadv_nc_get_path(bat_priv,
+ bat_priv->nc.coding_hash,
+ ethhdr->h_source,
+ neigh_node->addr);
+
+ if (!nc_path)
+ goto out;
+
+ /* Add skb to nc_path */
+ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
+ if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id))
+ goto free_nc_path;
+
+ /* Packet is consumed */
+ return true;
+
+free_nc_path:
+ batadv_nc_path_free_ref(nc_path);
+out:
+ /* Packet is not consumed */
+ return false;
+}
+
+/**
+ * batadv_nc_skb_store_for_decoding - save a clone of the skb which can be used
+ * when decoding coded packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to store
+ */
+void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_unicast_packet *packet;
+ struct batadv_nc_path *nc_path;
+ struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ __be32 packet_id;
+ u8 *payload;
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ goto out;
+
+ /* Check for supported packet type */
+ payload = skb_network_header(skb);
+ packet = (struct batadv_unicast_packet *)payload;
+ if (packet->header.packet_type != BATADV_UNICAST)
+ goto out;
+
+ /* Find existing nc_path or create a new */
+ nc_path = batadv_nc_get_path(bat_priv,
+ bat_priv->nc.decoding_hash,
+ ethhdr->h_source,
+ ethhdr->h_dest);
+
+ if (!nc_path)
+ goto out;
+
+ /* Clone skb and adjust skb->data to point at batman header */
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto free_nc_path;
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ goto free_skb;
+
+ if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN)))
+ goto free_skb;
+
+ /* Add skb to nc_path */
+ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
+ if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id))
+ goto free_skb;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER);
+ return;
+
+free_skb:
+ kfree_skb(skb);
+free_nc_path:
+ batadv_nc_path_free_ref(nc_path);
+out:
+ return;
+}
+
+/**
+ * batadv_nc_skb_store_sniffed_unicast - check if a received unicast packet
+ * should be saved in the decoding buffer and, if so, store it there
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: unicast skb to store
+ */
+void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+ if (batadv_is_my_mac(ethhdr->h_dest))
+ return;
+
+ /* Set data pointer to MAC header to mimic packets from our tx path */
+ skb_push(skb, ETH_HLEN);
+
+ batadv_nc_skb_store_for_decoding(bat_priv, skb);
+}
+
+/**
+ * batadv_nc_skb_decode_packet - decode given skb using the decode data stored
+ * in nc_packet
+ * @skb: unicast skb to decode
+ * @nc_packet: decode data needed to decode the skb
+ *
+ * Returns pointer to decoded unicast packet if the packet was decoded or NULL
+ * in case of an error.
+ */
+static struct batadv_unicast_packet *
+batadv_nc_skb_decode_packet(struct sk_buff *skb,
+ struct batadv_nc_packet *nc_packet)
+{
+ const int h_size = sizeof(struct batadv_unicast_packet);
+ const int h_diff = sizeof(struct batadv_coded_packet) - h_size;
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_coded_packet coded_packet_tmp;
+ struct ethhdr *ethhdr, ethhdr_tmp;
+ uint8_t *orig_dest, ttl, ttvn;
+ unsigned int coding_len;
+
+ /* Save headers temporarily */
+ memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
+ memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp));
+
+ if (skb_cow(skb, 0) < 0)
+ return NULL;
+
+ if (unlikely(!skb_pull_rcsum(skb, h_diff)))
+ return NULL;
+
+ /* Data points to batman header, so set mac header 14 bytes before
+ * and network to data
+ */
+ skb_set_mac_header(skb, -ETH_HLEN);
+ skb_reset_network_header(skb);
+
+ /* Reconstruct original mac header */
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ memcpy(ethhdr, &ethhdr_tmp, sizeof(*ethhdr));
+
+ /* Select the correct unicast header information based on the location
+ * of our mac address in the coded_packet header
+ */
+ if (batadv_is_my_mac(coded_packet_tmp.second_dest)) {
+ /* If we are the second destination the packet was overheard,
+ * so the Ethernet address must be copied to h_dest and
+ * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST
+ */
+ memcpy(ethhdr->h_dest, coded_packet_tmp.second_dest, ETH_ALEN);
+ skb->pkt_type = PACKET_HOST;
+
+ orig_dest = coded_packet_tmp.second_orig_dest;
+ ttl = coded_packet_tmp.second_ttl;
+ ttvn = coded_packet_tmp.second_ttvn;
+ } else {
+ orig_dest = coded_packet_tmp.first_orig_dest;
+ ttl = coded_packet_tmp.header.ttl;
+ ttvn = coded_packet_tmp.first_ttvn;
+ }
+
+ coding_len = ntohs(coded_packet_tmp.coded_len);
+
+ if (coding_len > skb->len)
+ return NULL;
+
+ /* Here the magic is reversed:
+ * extract the missing packet from the received coded packet
+ */
+ batadv_nc_memxor(skb->data + h_size,
+ nc_packet->skb->data + h_size,
+ coding_len);
+
+ /* Resize decoded skb if decoded with larger packet */
+ if (nc_packet->skb->len > coding_len + h_size)
+ pskb_trim_rcsum(skb, coding_len + h_size);
+
+ /* Create decoded unicast packet */
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_packet->header.packet_type = BATADV_UNICAST;
+ unicast_packet->header.version = BATADV_COMPAT_VERSION;
+ unicast_packet->header.ttl = ttl;
+ memcpy(unicast_packet->dest, orig_dest, ETH_ALEN);
+ unicast_packet->ttvn = ttvn;
+
+ batadv_nc_packet_free(nc_packet);
+ return unicast_packet;
+}
+
+/**
+ * batadv_nc_find_decoding_packet - search through buffered decoding data to
+ * find the data needed to decode the coded packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: pointer to the ethernet header inside the coded packet
+ * @coded: coded packet we try to find decode data for
+ *
+ * Returns pointer to nc packet if the needed data was found or NULL otherwise.
+ */
+static struct batadv_nc_packet *
+batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr,
+ struct batadv_coded_packet *coded)
+{
+ struct batadv_hashtable *hash = bat_priv->nc.decoding_hash;
+ struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL;
+ struct batadv_nc_path *nc_path, nc_path_key;
+ uint8_t *dest, *source;
+ __be32 packet_id;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ /* Select the correct packet id based on the location of our mac-addr */
+ dest = ethhdr->h_source;
+ if (!batadv_is_my_mac(coded->second_dest)) {
+ source = coded->second_source;
+ packet_id = coded->second_crc;
+ } else {
+ source = coded->first_source;
+ packet_id = coded->first_crc;
+ }
+
+ batadv_nc_hash_key_gen(&nc_path_key, source, dest);
+ index = batadv_nc_hash_choose(&nc_path_key, hash->size);
+
+ /* Search for matching coding path */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) {
+ /* Find matching nc_packet */
+ spin_lock_bh(&nc_path->packet_list_lock);
+ list_for_each_entry(tmp_nc_packet,
+ &nc_path->packet_list, list) {
+ if (packet_id == tmp_nc_packet->packet_id) {
+ list_del(&tmp_nc_packet->list);
+
+ nc_packet = tmp_nc_packet;
+ break;
+ }
+ }
+ spin_unlock_bh(&nc_path->packet_list_lock);
+
+ if (nc_packet)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!nc_packet)
+ batadv_dbg(BATADV_DBG_NC, bat_priv,
+ "No decoding packet found for %u\n", packet_id);
+
+ return nc_packet;
+}
+
+/**
+ * batadv_nc_recv_coded_packet - try to decode coded packet and enqueue the
+ * resulting unicast packet
+ * @skb: incoming coded packet
+ * @recv_if: pointer to interface this packet was received on
+ */
+static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_coded_packet *coded_packet;
+ struct batadv_nc_packet *nc_packet;
+ struct ethhdr *ethhdr;
+ int hdr_size = sizeof(*coded_packet);
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ return NET_RX_DROP;
+
+ /* Make sure we can access (and remove) header */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ return NET_RX_DROP;
+
+ coded_packet = (struct batadv_coded_packet *)skb->data;
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+ /* Verify frame is destined for us */
+ if (!batadv_is_my_mac(ethhdr->h_dest) &&
+ !batadv_is_my_mac(coded_packet->second_dest))
+ return NET_RX_DROP;
+
+ /* Update stat counter */
+ if (batadv_is_my_mac(coded_packet->second_dest))
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED);
+
+ nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr,
+ coded_packet);
+ if (!nc_packet) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
+ return NET_RX_DROP;
+ }
+
+ /* Make skb's linear, because decoding accesses the entire buffer */
+ if (skb_linearize(skb) < 0)
+ goto free_nc_packet;
+
+ if (skb_linearize(nc_packet->skb) < 0)
+ goto free_nc_packet;
+
+ /* Decode the packet */
+ unicast_packet = batadv_nc_skb_decode_packet(skb, nc_packet);
+ if (!unicast_packet) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
+ goto free_nc_packet;
+ }
+
+ /* Mark packet as decoded to do correct recoding when forwarding */
+ BATADV_SKB_CB(skb)->decoded = true;
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES,
+ skb->len + ETH_HLEN);
+ return batadv_recv_unicast_packet(skb, recv_if);
+
+free_nc_packet:
+ batadv_nc_packet_free(nc_packet);
+ return NET_RX_DROP;
+}
+
+/**
+ * batadv_nc_free - clean up network coding memory
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_nc_free(struct batadv_priv *bat_priv)
+{
+ batadv_recv_handler_unregister(BATADV_CODED);
+ cancel_delayed_work_sync(&bat_priv->nc.work);
+
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL);
+ batadv_hash_destroy(bat_priv->nc.coding_hash);
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL);
+ batadv_hash_destroy(bat_priv->nc.decoding_hash);
+}
+
+/**
+ * batadv_nc_nodes_seq_print_text - print the nc node information
+ * @seq: seq file to print on
+ * @offset: not used
+ */
+int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct batadv_hard_iface *primary_if;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ struct batadv_nc_node *nc_node;
+ int i;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ /* Traverse list of originators */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ /* For each orig_node in this bin */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ seq_printf(seq, "Node: %pM\n", orig_node->orig);
+
+ seq_puts(seq, " Ingoing: ");
+ /* For each in_nc_node to this orig_node */
+ list_for_each_entry_rcu(nc_node,
+ &orig_node->in_coding_list,
+ list)
+ seq_printf(seq, "%pM ",
+ nc_node->addr);
+ seq_puts(seq, "\n");
+
+ seq_puts(seq, " Outgoing: ");
+ /* For out_nc_node to this orig_node */
+ list_for_each_entry_rcu(nc_node,
+ &orig_node->out_coding_list,
+ list)
+ seq_printf(seq, "%pM ",
+ nc_node->addr);
+ seq_puts(seq, "\n\n");
+ }
+ rcu_read_unlock();
+ }
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+/**
+ * batadv_nc_init_debugfs - create nc folder and related files in debugfs
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
+{
+ struct dentry *nc_dir, *file;
+
+ nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir);
+ if (!nc_dir)
+ goto out;
+
+ file = debugfs_create_u8("min_tq", S_IRUGO | S_IWUSR, nc_dir,
+ &bat_priv->nc.min_tq);
+ if (!file)
+ goto out;
+
+ file = debugfs_create_u32("max_fwd_delay", S_IRUGO | S_IWUSR, nc_dir,
+ &bat_priv->nc.max_fwd_delay);
+ if (!file)
+ goto out;
+
+ file = debugfs_create_u32("max_buffer_time", S_IRUGO | S_IWUSR, nc_dir,
+ &bat_priv->nc.max_buffer_time);
+ if (!file)
+ goto out;
+
+ return 0;
+
+out:
+ return -ENOMEM;
+}
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
new file mode 100644
index 000000000000..4fa6d0caddbd
--- /dev/null
+++ b/net/batman-adv/network-coding.h
@@ -0,0 +1,123 @@
+/* Copyright (C) 2012-2013 B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll, Jeppe Ledet-Pedersen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_
+#define _NET_BATMAN_ADV_NETWORK_CODING_H_
+
+#ifdef CONFIG_BATMAN_ADV_NC
+
+int batadv_nc_init(struct batadv_priv *bat_priv);
+void batadv_nc_free(struct batadv_priv *bat_priv);
+void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *ogm_packet,
+ int is_single_hop_neigh);
+void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *));
+void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv);
+void batadv_nc_init_orig(struct batadv_orig_node *orig_node);
+bool batadv_nc_skb_forward(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node,
+ struct ethhdr *ethhdr);
+void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
+
+#else /* ifdef CONFIG_BATMAN_ADV_NC */
+
+static inline int batadv_nc_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_nc_free(struct batadv_priv *bat_priv)
+{
+ return;
+}
+
+static inline void
+batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *ogm_packet,
+ int is_single_hop_neigh)
+{
+ return;
+}
+
+static inline void
+batadv_nc_purge_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *))
+{
+ return;
+}
+
+static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
+{
+ return;
+}
+
+static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
+{
+ return;
+}
+
+static inline bool batadv_nc_skb_forward(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node,
+ struct ethhdr *ethhdr)
+{
+ return false;
+}
+
+static inline void
+batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return;
+}
+
+static inline void
+batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return;
+}
+
+static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq,
+ void *offset)
+{
+ return 0;
+}
+
+static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+#endif /* ifdef CONFIG_BATMAN_ADV_NC */
+
+#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 96fb80b724dc..2f3452546636 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -28,6 +28,7 @@
#include "unicast.h"
#include "soft-interface.h"
#include "bridge_loop_avoidance.h"
+#include "network-coding.h"
/* hash class keys */
static struct lock_class_key batadv_orig_hash_lock_class_key;
@@ -142,6 +143,9 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
spin_unlock_bh(&orig_node->neigh_list_lock);
+ /* Free nc_nodes */
+ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
+
batadv_frag_list_free(&orig_node->frag_list);
batadv_tt_global_del_orig(orig_node->bat_priv, orig_node,
"originator timed out");
@@ -219,6 +223,8 @@ struct batadv_orig_node *batadv_get_orig_node(struct batadv_priv *bat_priv,
spin_lock_init(&orig_node->neigh_list_lock);
spin_lock_init(&orig_node->tt_buff_lock);
+ batadv_nc_init_orig(orig_node);
+
/* extra reference for return */
atomic_set(&orig_node->refcount, 2);
@@ -459,7 +465,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
neigh_node_tmp->tq_avg);
}
- seq_printf(seq, "\n");
+ seq_puts(seq, "\n");
batman_count++;
next:
@@ -469,7 +475,7 @@ next:
}
if (batman_count == 0)
- seq_printf(seq, "No batman nodes in range ...\n");
+ seq_puts(seq, "No batman nodes in range ...\n");
out:
if (primary_if)
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index ed0aa89bbf8b..a51ccfc39da4 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -30,6 +30,7 @@ enum batadv_packettype {
BATADV_TT_QUERY = 0x07,
BATADV_ROAM_ADV = 0x08,
BATADV_UNICAST_4ADDR = 0x09,
+ BATADV_CODED = 0x0a,
};
/**
@@ -278,4 +279,36 @@ struct batadv_tt_change {
uint8_t addr[ETH_ALEN];
} __packed;
+/**
+ * struct batadv_coded_packet - network coded packet
+ * @header: common batman packet header and ttl of first included packet
+ * @reserved: Align following fields to 2-byte boundaries
+ * @first_source: original source of first included packet
+ * @first_orig_dest: original destinal of first included packet
+ * @first_crc: checksum of first included packet
+ * @first_ttvn: tt-version number of first included packet
+ * @second_ttl: ttl of second packet
+ * @second_dest: second receiver of this coded packet
+ * @second_source: original source of second included packet
+ * @second_orig_dest: original destination of second included packet
+ * @second_crc: checksum of second included packet
+ * @second_ttvn: tt version number of second included packet
+ * @coded_len: length of network coded part of the payload
+ */
+struct batadv_coded_packet {
+ struct batadv_header header;
+ uint8_t first_ttvn;
+ /* uint8_t first_dest[ETH_ALEN]; - saved in mac header destination */
+ uint8_t first_source[ETH_ALEN];
+ uint8_t first_orig_dest[ETH_ALEN];
+ __be32 first_crc;
+ uint8_t second_ttl;
+ uint8_t second_ttvn;
+ uint8_t second_dest[ETH_ALEN];
+ uint8_t second_source[ETH_ALEN];
+ uint8_t second_orig_dest[ETH_ALEN];
+ __be32 second_crc;
+ __be16 coded_len;
+};
+
#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 5ee21cebbbb0..8f88967ff14b 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -29,6 +29,7 @@
#include "unicast.h"
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
+#include "network-coding.h"
static int batadv_route_unicast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
@@ -548,27 +549,37 @@ batadv_find_ifalter_router(struct batadv_orig_node *primary_orig,
return router;
}
+/**
+ * batadv_check_unicast_packet - Check for malformed unicast packets
+ * @skb: packet to check
+ * @hdr_size: size of header to pull
+ *
+ * Check for short header and bad addresses in given packet. Returns negative
+ * value when check fails and 0 otherwise. The negative value depends on the
+ * reason: -ENODATA for bad header, -EBADR for broadcast destination or source,
+ * and -EREMOTE for non-local (other host) destination.
+ */
static int batadv_check_unicast_packet(struct sk_buff *skb, int hdr_size)
{
struct ethhdr *ethhdr;
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- return -1;
+ return -ENODATA;
ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* packet with unicast indication but broadcast recipient */
if (is_broadcast_ether_addr(ethhdr->h_dest))
- return -1;
+ return -EBADR;
/* packet with broadcast sender address */
if (is_broadcast_ether_addr(ethhdr->h_source))
- return -1;
+ return -EBADR;
/* not for me */
if (!batadv_is_my_mac(ethhdr->h_dest))
- return -1;
+ return -EREMOTE;
return 0;
}
@@ -850,15 +861,18 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
/* decrement ttl */
unicast_packet->header.ttl--;
- /* Update stats counter */
- batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
- batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
- skb->len + ETH_HLEN);
-
- /* route it */
- if (batadv_send_skb_to_orig(skb, orig_node, recv_if))
+ /* network code packet if possible */
+ if (batadv_nc_skb_forward(skb, neigh_node, ethhdr)) {
+ ret = NET_RX_SUCCESS;
+ } else if (batadv_send_skb_to_orig(skb, orig_node, recv_if)) {
ret = NET_RX_SUCCESS;
+ /* Update stats counter */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
+ skb->len + ETH_HLEN);
+ }
+
out:
if (neigh_node)
batadv_neigh_node_free_ref(neigh_node);
@@ -1033,7 +1047,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
struct batadv_unicast_4addr_packet *unicast_4addr_packet;
uint8_t *orig_addr;
struct batadv_orig_node *orig_node = NULL;
- int hdr_size = sizeof(*unicast_packet);
+ int check, hdr_size = sizeof(*unicast_packet);
bool is4addr;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -1044,7 +1058,16 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
if (is4addr)
hdr_size = sizeof(*unicast_4addr_packet);
- if (batadv_check_unicast_packet(skb, hdr_size) < 0)
+ /* function returns -EREMOTE for promiscuous packets */
+ check = batadv_check_unicast_packet(skb, hdr_size);
+
+ /* Even though the packet is not for us, we might save it to use for
+ * decoding a later received coded packet
+ */
+ if (check == -EREMOTE)
+ batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
+
+ if (check < 0)
return NET_RX_DROP;
if (!batadv_check_unicast_ttvn(bat_priv, skb))
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index a67cffde37ae..263cfd1ccee7 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -27,6 +27,7 @@
#include "vis.h"
#include "gateway_common.h"
#include "originator.h"
+#include "network-coding.h"
#include <linux/if_ether.h>
@@ -39,6 +40,7 @@ int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
const uint8_t *dst_addr)
{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct ethhdr *ethhdr;
if (hard_iface->if_status != BATADV_IF_ACTIVE)
@@ -70,6 +72,9 @@ int batadv_send_skb_packet(struct sk_buff *skb,
skb->dev = hard_iface->net_dev;
+ /* Save a clone of the skb to use when decoding coded packets */
+ batadv_nc_skb_store_for_decoding(bat_priv, skb);
+
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
* (which is > 0). This will not be treated as an error.
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 2711e870f557..403b8c46085e 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -37,6 +37,7 @@
#include <linux/if_ether.h>
#include "unicast.h"
#include "bridge_loop_avoidance.h"
+#include "network-coding.h"
static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd);
@@ -401,55 +402,6 @@ static void batadv_set_lockdep_class(struct net_device *dev)
}
/**
- * batadv_softif_init - Late stage initialization of soft interface
- * @dev: registered network device to modify
- *
- * Returns error code on failures
- */
-static int batadv_softif_init(struct net_device *dev)
-{
- batadv_set_lockdep_class(dev);
-
- return 0;
-}
-
-static const struct net_device_ops batadv_netdev_ops = {
- .ndo_init = batadv_softif_init,
- .ndo_open = batadv_interface_open,
- .ndo_stop = batadv_interface_release,
- .ndo_get_stats = batadv_interface_stats,
- .ndo_set_mac_address = batadv_interface_set_mac_addr,
- .ndo_change_mtu = batadv_interface_change_mtu,
- .ndo_start_xmit = batadv_interface_tx,
- .ndo_validate_addr = eth_validate_addr
-};
-
-static void batadv_interface_setup(struct net_device *dev)
-{
- struct batadv_priv *priv = netdev_priv(dev);
-
- ether_setup(dev);
-
- dev->netdev_ops = &batadv_netdev_ops;
- dev->destructor = free_netdev;
- dev->tx_queue_len = 0;
-
- /* can't call min_mtu, because the needed variables
- * have not been initialized yet
- */
- dev->mtu = ETH_DATA_LEN;
- /* reserve more space in the skbuff for our header */
- dev->hard_header_len = BATADV_HEADER_LEN;
-
- /* generate random address */
- eth_hw_addr_random(dev);
-
- SET_ETHTOOL_OPS(dev, &batadv_ethtool_ops);
-
- memset(priv, 0, sizeof(*priv));
-}
-
-/**
* batadv_softif_destroy_finish - cleans up the remains of a softif
* @work: work queue item
*
@@ -465,7 +417,6 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
cleanup_work);
soft_iface = bat_priv->soft_iface;
- batadv_debugfs_del_meshif(soft_iface);
batadv_sysfs_del_meshif(soft_iface);
rtnl_lock();
@@ -473,21 +424,22 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
rtnl_unlock();
}
-struct net_device *batadv_softif_create(const char *name)
+/**
+ * batadv_softif_init_late - late stage initialization of soft interface
+ * @dev: registered network device to modify
+ *
+ * Returns error code on failures
+ */
+static int batadv_softif_init_late(struct net_device *dev)
{
- struct net_device *soft_iface;
struct batadv_priv *bat_priv;
int ret;
size_t cnt_len = sizeof(uint64_t) * BATADV_CNT_NUM;
- soft_iface = alloc_netdev(sizeof(*bat_priv), name,
- batadv_interface_setup);
-
- if (!soft_iface)
- goto out;
+ batadv_set_lockdep_class(dev);
- bat_priv = netdev_priv(soft_iface);
- bat_priv->soft_iface = soft_iface;
+ bat_priv = netdev_priv(dev);
+ bat_priv->soft_iface = dev;
INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish);
/* batadv_interface_stats() needs to be available as soon as
@@ -495,14 +447,7 @@ struct net_device *batadv_softif_create(const char *name)
*/
bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(uint64_t));
if (!bat_priv->bat_counters)
- goto free_soft_iface;
-
- ret = register_netdevice(soft_iface);
- if (ret < 0) {
- pr_err("Unable to register the batman interface '%s': %i\n",
- name, ret);
- goto free_bat_counters;
- }
+ return -ENOMEM;
atomic_set(&bat_priv->aggregated_ogms, 1);
atomic_set(&bat_priv->bonding, 0);
@@ -540,49 +485,189 @@ struct net_device *batadv_softif_create(const char *name)
bat_priv->primary_if = NULL;
bat_priv->num_ifaces = 0;
- ret = batadv_algo_select(bat_priv, batadv_routing_algo);
- if (ret < 0)
- goto unreg_soft_iface;
+ batadv_nc_init_bat_priv(bat_priv);
- ret = batadv_sysfs_add_meshif(soft_iface);
+ ret = batadv_algo_select(bat_priv, batadv_routing_algo);
if (ret < 0)
- goto unreg_soft_iface;
+ goto free_bat_counters;
- ret = batadv_debugfs_add_meshif(soft_iface);
+ ret = batadv_debugfs_add_meshif(dev);
if (ret < 0)
- goto unreg_sysfs;
+ goto free_bat_counters;
- ret = batadv_mesh_init(soft_iface);
+ ret = batadv_mesh_init(dev);
if (ret < 0)
goto unreg_debugfs;
- return soft_iface;
+ return 0;
unreg_debugfs:
- batadv_debugfs_del_meshif(soft_iface);
-unreg_sysfs:
- batadv_sysfs_del_meshif(soft_iface);
-unreg_soft_iface:
- free_percpu(bat_priv->bat_counters);
- unregister_netdevice(soft_iface);
- return NULL;
-
+ batadv_debugfs_del_meshif(dev);
free_bat_counters:
free_percpu(bat_priv->bat_counters);
-free_soft_iface:
- free_netdev(soft_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface
+ * @dev: batadv_soft_interface used as master interface
+ * @slave_dev: net_device which should become the slave interface
+ *
+ * Return 0 if successful or error otherwise.
+ */
+static int batadv_softif_slave_add(struct net_device *dev,
+ struct net_device *slave_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret = -EINVAL;
+
+ hard_iface = batadv_hardif_get_by_netdev(slave_dev);
+ if (!hard_iface || hard_iface->soft_iface != NULL)
+ goto out;
+
+ ret = batadv_hardif_enable_interface(hard_iface, dev->name);
+
out:
- return NULL;
+ if (hard_iface)
+ batadv_hardif_free_ref(hard_iface);
+ return ret;
}
-void batadv_softif_destroy(struct net_device *soft_iface)
+/**
+ * batadv_softif_slave_del - Delete a slave iface from a batadv_soft_interface
+ * @dev: batadv_soft_interface used as master interface
+ * @slave_dev: net_device which should be removed from the master interface
+ *
+ * Return 0 if successful or error otherwise.
+ */
+static int batadv_softif_slave_del(struct net_device *dev,
+ struct net_device *slave_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret = -EINVAL;
+
+ hard_iface = batadv_hardif_get_by_netdev(slave_dev);
+
+ if (!hard_iface || hard_iface->soft_iface != dev)
+ goto out;
+
+ batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP);
+ ret = 0;
+
+out:
+ if (hard_iface)
+ batadv_hardif_free_ref(hard_iface);
+ return ret;
+}
+
+static const struct net_device_ops batadv_netdev_ops = {
+ .ndo_init = batadv_softif_init_late,
+ .ndo_open = batadv_interface_open,
+ .ndo_stop = batadv_interface_release,
+ .ndo_get_stats = batadv_interface_stats,
+ .ndo_set_mac_address = batadv_interface_set_mac_addr,
+ .ndo_change_mtu = batadv_interface_change_mtu,
+ .ndo_start_xmit = batadv_interface_tx,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_add_slave = batadv_softif_slave_add,
+ .ndo_del_slave = batadv_softif_slave_del,
+};
+
+/**
+ * batadv_softif_free - Deconstructor of batadv_soft_interface
+ * @dev: Device to cleanup and remove
+ */
+static void batadv_softif_free(struct net_device *dev)
+{
+ batadv_debugfs_del_meshif(dev);
+ batadv_mesh_free(dev);
+ free_netdev(dev);
+}
+
+/**
+ * batadv_softif_init_early - early stage initialization of soft interface
+ * @dev: registered network device to modify
+ */
+static void batadv_softif_init_early(struct net_device *dev)
+{
+ struct batadv_priv *priv = netdev_priv(dev);
+
+ ether_setup(dev);
+
+ dev->netdev_ops = &batadv_netdev_ops;
+ dev->destructor = batadv_softif_free;
+ dev->tx_queue_len = 0;
+
+ /* can't call min_mtu, because the needed variables
+ * have not been initialized yet
+ */
+ dev->mtu = ETH_DATA_LEN;
+ /* reserve more space in the skbuff for our header */
+ dev->hard_header_len = BATADV_HEADER_LEN;
+
+ /* generate random address */
+ eth_hw_addr_random(dev);
+
+ SET_ETHTOOL_OPS(dev, &batadv_ethtool_ops);
+
+ memset(priv, 0, sizeof(*priv));
+}
+
+struct net_device *batadv_softif_create(const char *name)
+{
+ struct net_device *soft_iface;
+ int ret;
+
+ soft_iface = alloc_netdev(sizeof(struct batadv_priv), name,
+ batadv_softif_init_early);
+ if (!soft_iface)
+ return NULL;
+
+ soft_iface->rtnl_link_ops = &batadv_link_ops;
+
+ ret = register_netdevice(soft_iface);
+ if (ret < 0) {
+ pr_err("Unable to register the batman interface '%s': %i\n",
+ name, ret);
+ free_netdev(soft_iface);
+ return NULL;
+ }
+
+ return soft_iface;
+}
+
+/**
+ * batadv_softif_destroy_sysfs - deletion of batadv_soft_interface via sysfs
+ * @soft_iface: the to-be-removed batman-adv interface
+ */
+void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
{
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
- batadv_mesh_free(soft_iface);
queue_work(batadv_event_workqueue, &bat_priv->cleanup_work);
}
+/**
+ * batadv_softif_destroy_netlink - deletion of batadv_soft_interface via netlink
+ * @soft_iface: the to-be-removed batman-adv interface
+ * @head: list pointer
+ */
+static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
+ struct list_head *head)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface == soft_iface)
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_KEEP);
+ }
+
+ batadv_sysfs_del_meshif(soft_iface);
+ unregister_netdevice_queue(soft_iface, head);
+}
+
int batadv_softif_is_valid(const struct net_device *net_dev)
{
if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
@@ -591,6 +676,13 @@ int batadv_softif_is_valid(const struct net_device *net_dev)
return 0;
}
+struct rtnl_link_ops batadv_link_ops __read_mostly = {
+ .kind = "batadv",
+ .priv_size = sizeof(struct batadv_priv),
+ .setup = batadv_softif_init_early,
+ .dellink = batadv_softif_destroy_netlink,
+};
+
/* ethtool */
static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
@@ -662,6 +754,17 @@ static const struct {
{ "dat_put_rx" },
{ "dat_cached_reply_tx" },
#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ { "nc_code" },
+ { "nc_code_bytes" },
+ { "nc_recode" },
+ { "nc_recode_bytes" },
+ { "nc_buffer" },
+ { "nc_decode" },
+ { "nc_decode_bytes" },
+ { "nc_decode_failed" },
+ { "nc_sniffed" },
+#endif
};
static void batadv_get_strings(struct net_device *dev, uint32_t stringset,
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 43182e5e603a..2f2472c2ea0d 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -25,7 +25,8 @@ void batadv_interface_rx(struct net_device *soft_iface,
struct sk_buff *skb, struct batadv_hard_iface *recv_if,
int hdr_size, struct batadv_orig_node *orig_node);
struct net_device *batadv_softif_create(const char *name);
-void batadv_softif_destroy(struct net_device *soft_iface);
+void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
int batadv_softif_is_valid(const struct net_device *net_dev);
+extern struct rtnl_link_ops batadv_link_ops;
#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index afbba319d73a..15a22efa9a67 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -442,6 +442,9 @@ static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
#ifdef CONFIG_BATMAN_ADV_DEBUG
BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL);
#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, NULL);
+#endif
static struct batadv_attribute *batadv_mesh_attrs[] = {
&batadv_attr_aggregated_ogms,
@@ -464,6 +467,9 @@ static struct batadv_attribute *batadv_mesh_attrs[] = {
#ifdef CONFIG_BATMAN_ADV_DEBUG
&batadv_attr_log_level,
#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ &batadv_attr_network_coding,
+#endif
NULL,
};
@@ -582,13 +588,15 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
}
if (status_tmp == BATADV_IF_NOT_IN_USE) {
- batadv_hardif_disable_interface(hard_iface);
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_AUTO);
goto unlock;
}
/* if the interface already is in use */
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
- batadv_hardif_disable_interface(hard_iface);
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_AUTO);
ret = batadv_hardif_enable_interface(hard_iface, buff);
@@ -688,15 +696,10 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
enum batadv_uev_action action, const char *data)
{
int ret = -ENOMEM;
- struct batadv_hard_iface *primary_if;
struct kobject *bat_kobj;
char *uevent_env[4] = { NULL, NULL, NULL, NULL };
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
-
- bat_kobj = &primary_if->soft_iface->dev.kobj;
+ bat_kobj = &bat_priv->soft_iface->dev.kobj;
uevent_env[0] = kmalloc(strlen(BATADV_UEV_TYPE_VAR) +
strlen(batadv_uev_type_str[type]) + 1,
@@ -732,9 +735,6 @@ out:
kfree(uevent_env[1]);
kfree(uevent_env[2]);
- if (primary_if)
- batadv_hardif_free_ref(primary_if);
-
if (ret)
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Impossible to send uevent for (%s,%s,%s) event (err: %d)\n",
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 98a66a021a60..932232087449 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -385,25 +385,19 @@ static void batadv_tt_prepare_packet_buff(struct batadv_priv *bat_priv,
int *packet_buff_len,
int min_packet_len)
{
- struct batadv_hard_iface *primary_if;
int req_len;
- primary_if = batadv_primary_if_get_selected(bat_priv);
-
req_len = min_packet_len;
req_len += batadv_tt_len(atomic_read(&bat_priv->tt.local_changes));
/* if we have too many changes for one packet don't send any
* and wait for the tt table request which will be fragmented
*/
- if ((!primary_if) || (req_len > primary_if->soft_iface->mtu))
+ if (req_len > bat_priv->soft_iface->mtu)
req_len = min_packet_len;
batadv_tt_realloc_packet_buff(packet_buff, packet_buff_len,
min_packet_len, req_len);
-
- if (primary_if)
- batadv_hardif_free_ref(primary_if);
}
static int batadv_tt_changes_fill_buff(struct batadv_priv *bat_priv,
@@ -908,7 +902,7 @@ out_remove:
/* remove address from local hash if present */
local_flags = batadv_tt_local_remove(bat_priv, tt_addr,
"global tt received",
- !!(flags & BATADV_TT_CLIENT_ROAM));
+ flags & BATADV_TT_CLIENT_ROAM);
tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI;
if (!(flags & BATADV_TT_CLIENT_ROAM))
@@ -1580,7 +1574,7 @@ static int batadv_tt_global_valid(const void *entry_ptr,
static struct sk_buff *
batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn,
struct batadv_hashtable *hash,
- struct batadv_hard_iface *primary_if,
+ struct batadv_priv *bat_priv,
int (*valid_cb)(const void *, const void *),
void *cb_data)
{
@@ -1594,8 +1588,8 @@ batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn,
uint32_t i;
size_t len;
- if (tt_query_size + tt_len > primary_if->soft_iface->mtu) {
- tt_len = primary_if->soft_iface->mtu - tt_query_size;
+ if (tt_query_size + tt_len > bat_priv->soft_iface->mtu) {
+ tt_len = bat_priv->soft_iface->mtu - tt_query_size;
tt_len -= tt_len % sizeof(struct batadv_tt_change);
}
tt_tot = tt_len / sizeof(struct batadv_tt_change);
@@ -1715,7 +1709,6 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv,
{
struct batadv_orig_node *req_dst_orig_node;
struct batadv_orig_node *res_dst_orig_node = NULL;
- struct batadv_hard_iface *primary_if = NULL;
uint8_t orig_ttvn, req_ttvn, ttvn;
int ret = false;
unsigned char *tt_buff;
@@ -1740,10 +1733,6 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv,
if (!res_dst_orig_node)
goto out;
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto out;
-
orig_ttvn = (uint8_t)atomic_read(&req_dst_orig_node->last_ttvn);
req_ttvn = tt_request->ttvn;
@@ -1791,7 +1780,7 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv,
skb = batadv_tt_response_fill_table(tt_len, ttvn,
bat_priv->tt.global_hash,
- primary_if,
+ bat_priv,
batadv_tt_global_valid,
req_dst_orig_node);
if (!skb)
@@ -1828,8 +1817,6 @@ out:
batadv_orig_node_free_ref(res_dst_orig_node);
if (req_dst_orig_node)
batadv_orig_node_free_ref(req_dst_orig_node);
- if (primary_if)
- batadv_hardif_free_ref(primary_if);
if (!ret)
kfree_skb(skb);
return ret;
@@ -1907,7 +1894,7 @@ batadv_send_my_tt_response(struct batadv_priv *bat_priv,
skb = batadv_tt_response_fill_table(tt_len, ttvn,
bat_priv->tt.local_hash,
- primary_if,
+ bat_priv,
batadv_tt_local_valid_entry,
NULL);
if (!skb)
@@ -2528,7 +2515,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
if (!tt_global_entry)
goto out;
- ret = !!(tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM);
+ ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM;
batadv_tt_global_entry_free_ref(tt_global_entry);
out:
return ret;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 4cd87a0b5b80..aba8364c3689 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -128,6 +128,10 @@ struct batadv_hard_iface {
* @bond_list: list of bonding candidates
* @refcount: number of contexts the object is used
* @rcu: struct used for freeing in an RCU-safe manner
+ * @in_coding_list: list of nodes this orig can hear
+ * @out_coding_list: list of nodes that can hear this orig
+ * @in_coding_list_lock: protects in_coding_list
+ * @out_coding_list_lock: protects out_coding_list
*/
struct batadv_orig_node {
uint8_t orig[ETH_ALEN];
@@ -171,6 +175,12 @@ struct batadv_orig_node {
struct list_head bond_list;
atomic_t refcount;
struct rcu_head rcu;
+#ifdef CONFIG_BATMAN_ADV_NC
+ struct list_head in_coding_list;
+ struct list_head out_coding_list;
+ spinlock_t in_coding_list_lock; /* Protects in_coding_list */
+ spinlock_t out_coding_list_lock; /* Protects out_coding_list */
+#endif
};
/**
@@ -265,6 +275,17 @@ struct batadv_bcast_duplist_entry {
* @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter
* @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic packet
* counter
+ * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter
+ * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes counter
+ * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet counter
+ * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes counter
+ * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc decoding
+ * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter
+ * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes counter
+ * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic packet
+ * counter
+ * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in promisc
+ * mode.
* @BATADV_CNT_NUM: number of traffic counters
*/
enum batadv_counters {
@@ -292,6 +313,17 @@ enum batadv_counters {
BATADV_CNT_DAT_PUT_RX,
BATADV_CNT_DAT_CACHED_REPLY_TX,
#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ BATADV_CNT_NC_CODE,
+ BATADV_CNT_NC_CODE_BYTES,
+ BATADV_CNT_NC_RECODE,
+ BATADV_CNT_NC_RECODE_BYTES,
+ BATADV_CNT_NC_BUFFER,
+ BATADV_CNT_NC_DECODE,
+ BATADV_CNT_NC_DECODE_BYTES,
+ BATADV_CNT_NC_DECODE_FAILED,
+ BATADV_CNT_NC_SNIFFED,
+#endif
BATADV_CNT_NUM,
};
@@ -428,6 +460,35 @@ struct batadv_priv_dat {
#endif
/**
+ * struct batadv_priv_nc - per mesh interface network coding private data
+ * @work: work queue callback item for cleanup
+ * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
+ * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
+ * @max_fwd_delay: maximum packet forward delay to allow coding of packets
+ * @max_buffer_time: buffer time for sniffed packets used to decoding
+ * @timestamp_fwd_flush: timestamp of last forward packet queue flush
+ * @timestamp_sniffed_purge: timestamp of last sniffed packet queue purge
+ * @coding_hash: Hash table used to buffer skbs while waiting for another
+ * incoming skb to code it with. Skbs are added to the buffer just before being
+ * forwarded in routing.c
+ * @decoding_hash: Hash table used to buffer skbs that might be needed to decode
+ * a received coded skb. The buffer is used for 1) skbs arriving on the
+ * soft-interface; 2) skbs overheard on the hard-interface; and 3) skbs
+ * forwarded by batman-adv.
+ */
+struct batadv_priv_nc {
+ struct delayed_work work;
+ struct dentry *debug_dir;
+ u8 min_tq;
+ u32 max_fwd_delay;
+ u32 max_buffer_time;
+ unsigned long timestamp_fwd_flush;
+ unsigned long timestamp_sniffed_purge;
+ struct batadv_hashtable *coding_hash;
+ struct batadv_hashtable *decoding_hash;
+};
+
+/**
* struct batadv_priv - per mesh interface data
* @mesh_state: current status of the mesh (inactive/active/deactivating)
* @soft_iface: net device which holds this struct as private data
@@ -470,6 +531,8 @@ struct batadv_priv_dat {
* @tt: translation table data
* @vis: vis data
* @dat: distributed arp table data
+ * @network_coding: bool indicating whether network coding is enabled
+ * @batadv_priv_nc: network coding data
*/
struct batadv_priv {
atomic_t mesh_state;
@@ -522,6 +585,10 @@ struct batadv_priv {
#ifdef CONFIG_BATMAN_ADV_DAT
struct batadv_priv_dat dat;
#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ atomic_t network_coding;
+ struct batadv_priv_nc nc;
+#endif /* CONFIG_BATMAN_ADV_NC */
};
/**
@@ -702,6 +769,75 @@ struct batadv_tt_roam_node {
};
/**
+ * struct batadv_nc_node - network coding node
+ * @list: next and prev pointer for the list handling
+ * @addr: the node's mac address
+ * @refcount: number of contexts the object is used by
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @orig_node: pointer to corresponding orig node struct
+ * @last_seen: timestamp of last ogm received from this node
+ */
+struct batadv_nc_node {
+ struct list_head list;
+ uint8_t addr[ETH_ALEN];
+ atomic_t refcount;
+ struct rcu_head rcu;
+ struct batadv_orig_node *orig_node;
+ unsigned long last_seen;
+};
+
+/**
+ * struct batadv_nc_path - network coding path
+ * @hash_entry: next and prev pointer for the list handling
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @refcount: number of contexts the object is used by
+ * @packet_list: list of buffered packets for this path
+ * @packet_list_lock: access lock for packet list
+ * @next_hop: next hop (destination) of path
+ * @prev_hop: previous hop (source) of path
+ * @last_valid: timestamp for last validation of path
+ */
+struct batadv_nc_path {
+ struct hlist_node hash_entry;
+ struct rcu_head rcu;
+ atomic_t refcount;
+ struct list_head packet_list;
+ spinlock_t packet_list_lock; /* Protects packet_list */
+ uint8_t next_hop[ETH_ALEN];
+ uint8_t prev_hop[ETH_ALEN];
+ unsigned long last_valid;
+};
+
+/**
+ * struct batadv_nc_packet - network coding packet used when coding and
+ * decoding packets
+ * @list: next and prev pointer for the list handling
+ * @packet_id: crc32 checksum of skb data
+ * @timestamp: field containing the info when the packet was added to path
+ * @neigh_node: pointer to original next hop neighbor of skb
+ * @skb: skb which can be encoded or used for decoding
+ * @nc_path: pointer to path this nc packet is attached to
+ */
+struct batadv_nc_packet {
+ struct list_head list;
+ __be32 packet_id;
+ unsigned long timestamp;
+ struct batadv_neigh_node *neigh_node;
+ struct sk_buff *skb;
+ struct batadv_nc_path *nc_path;
+};
+
+/**
+ * batadv_skb_cb - control buffer structure used to store private data relevant
+ * to batman-adv in the skb->cb buffer in skbs.
+ * @decoded: Marks a skb as decoded, which is checked when searching for coding
+ * opportunities in network-coding.c
+ */
+struct batadv_skb_cb {
+ bool decoded;
+};
+
+/**
* struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
* @list: list node for batadv_socket_client::queue_list
* @send_time: execution time for delayed_work (packet sending)
diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c
index 50e079f00be6..0bb3b5982f94 100644
--- a/net/batman-adv/unicast.c
+++ b/net/batman-adv/unicast.c
@@ -122,7 +122,7 @@ batadv_frag_search_packet(struct list_head *head,
{
struct batadv_frag_packet_list_entry *tfp;
struct batadv_unicast_frag_packet *tmp_up = NULL;
- int is_head_tmp, is_head;
+ bool is_head_tmp, is_head;
uint16_t search_seqno;
if (up->flags & BATADV_UNI_FRAG_HEAD)
@@ -130,7 +130,7 @@ batadv_frag_search_packet(struct list_head *head,
else
search_seqno = ntohs(up->seqno)-1;
- is_head = !!(up->flags & BATADV_UNI_FRAG_HEAD);
+ is_head = up->flags & BATADV_UNI_FRAG_HEAD;
list_for_each_entry(tfp, head, list) {
if (!tfp->skb)
@@ -142,7 +142,7 @@ batadv_frag_search_packet(struct list_head *head,
tmp_up = (struct batadv_unicast_frag_packet *)tfp->skb->data;
if (tfp->seqno == search_seqno) {
- is_head_tmp = !!(tmp_up->flags & BATADV_UNI_FRAG_HEAD);
+ is_head_tmp = tmp_up->flags & BATADV_UNI_FRAG_HEAD;
if (is_head_tmp != is_head)
return tfp;
else
diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c
index c053244b97bd..962ccf3b8382 100644
--- a/net/batman-adv/vis.c
+++ b/net/batman-adv/vis.c
@@ -149,7 +149,7 @@ static void batadv_vis_data_read_prim_sec(struct seq_file *seq,
hlist_for_each_entry(entry, if_list, list) {
if (entry->primary)
- seq_printf(seq, "PRIMARY, ");
+ seq_puts(seq, "PRIMARY, ");
else
seq_printf(seq, "SEC %pM, ", entry->addr);
}
@@ -207,7 +207,7 @@ static void batadv_vis_data_read_entries(struct seq_file *seq,
if (batadv_compare_eth(entry->addr, packet->vis_orig))
batadv_vis_data_read_prim_sec(seq, list);
- seq_printf(seq, "\n");
+ seq_puts(seq, "\n");
}
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 81598e588f7f..e5338f787d68 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -221,6 +221,8 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
if (flags & (MSG_OOB))
return -EOPNOTSUPP;
+ msg->msg_namelen = 0;
+
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb) {
if (sk->sk_shutdown & RCV_SHUTDOWN)
@@ -228,8 +230,6 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
return err;
}
- msg->msg_namelen = 0;
-
copied = skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
@@ -413,7 +413,8 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock,
return bt_accept_poll(sk);
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index e58c8b32589c..4b488ec26105 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -136,7 +136,7 @@ static u16 bnep_net_eth_proto(struct sk_buff *skb)
struct ethhdr *eh = (void *) skb->data;
u16 proto = ntohs(eh->h_proto);
- if (proto >= 1536)
+ if (proto >= ETH_P_802_3_MIN)
return proto;
if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF))
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 3786ddc45152..a8638b58c4bf 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -608,6 +608,7 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
rfcomm_dlc_accept(d);
+ msg->msg_namelen = 0;
return 0;
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index d919d1161ab4..2c8055350510 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -665,6 +665,7 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
hci_conn_accept(pi->conn->hcon, 0);
sk->sk_state = BT_CONFIG;
+ msg->msg_namelen = 0;
release_sock(sk);
return 0;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index d5f1d3fd4b28..314c73ed418f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -66,7 +66,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
goto out;
}
- mdst = br_mdb_get(br, skb);
+ mdst = br_mdb_get(br, skb, vid);
if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb))
br_multicast_deliver(mdst, skb);
else
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b0812c91c0f0..c581f1200ef7 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -161,9 +161,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
if (!pv)
return;
- for (vid = find_next_bit(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN, vid);
- vid < BR_VLAN_BITMAP_LEN;
- vid = find_next_bit(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN, vid+1)) {
+ for_each_set_bit_from(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) {
f = __br_fdb_get(br, br->dev->dev_addr, vid);
if (f && f->is_local && !f->dst)
fdb_delete(br, f);
@@ -423,7 +421,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return 0;
br_warn(br, "adding interface %s with same address "
"as a received packet\n",
- source->dev->name);
+ source ? source->dev->name : br->dev->name);
fdb_delete(br, fdb);
}
@@ -724,13 +722,10 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
* specify a VLAN. To be nice, add/update entry for every
* vlan on this port.
*/
- vid = find_first_bit(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN);
- while (vid < BR_VLAN_BITMAP_LEN) {
+ for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) {
err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
if (err)
goto out;
- vid = find_next_bit(pv->vlan_bitmap,
- BR_VLAN_BITMAP_LEN, vid+1);
}
}
@@ -815,11 +810,8 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
* vlan on this port.
*/
err = -ENOENT;
- vid = find_first_bit(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN);
- while (vid < BR_VLAN_BITMAP_LEN) {
+ for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) {
err &= __br_fdb_delete(p, addr, vid);
- vid = find_next_bit(pv->vlan_bitmap,
- BR_VLAN_BITMAP_LEN, vid+1);
}
}
out:
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ef1b91431c6b..f17fcb3097c2 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -148,7 +148,6 @@ static void del_nbp(struct net_bridge_port *p)
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
- synchronize_net();
netdev_upper_dev_unlink(dev, br->dev);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 480330151898..828e2bcc1f52 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -97,7 +97,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
if (is_broadcast_ether_addr(dest))
skb2 = skb;
else if (is_multicast_ether_addr(dest)) {
- mdst = br_mdb_get(br, skb);
+ mdst = br_mdb_get(br, skb, vid);
if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
if ((mdst && mdst->mglist) ||
br_multicast_is_router(br))
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 9f97b850fc65..19942e38fd2d 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -80,6 +80,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
port = p->port;
if (port) {
struct br_mdb_entry e;
+ memset(&e, 0, sizeof(e));
e.ifindex = port->dev->ifindex;
e.state = p->state;
if (p->addr.proto == htons(ETH_P_IP))
@@ -136,6 +137,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
break;
bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
bpm->ifindex = dev->ifindex;
if (br_mdb_fill_info(skb, cb, dev) < 0)
goto out;
@@ -171,6 +173,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
return -EMSGSIZE;
bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
bpm->family = AF_BRIDGE;
bpm->ifindex = dev->ifindex;
nest = nla_nest_start(skb, MDBA_MDB);
@@ -228,6 +231,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
{
struct br_mdb_entry entry;
+ memset(&entry, 0, sizeof(entry));
entry.ifindex = port->dev->ifindex;
entry.addr.proto = group->proto;
entry.addr.u.ip4 = group->u.ip4;
@@ -378,7 +382,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
return ret;
}
-static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct br_mdb_entry *entry;
@@ -454,7 +458,7 @@ unlock:
return err;
}
-static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net_device *dev;
struct br_mdb_entry *entry;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 10e6fce1bb62..81f2389f78eb 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -132,7 +132,7 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get(
#endif
struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
- struct sk_buff *skb)
+ struct sk_buff *skb, u16 vid)
{
struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
struct br_ip ip;
@@ -144,6 +144,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
return NULL;
ip.proto = skb->protocol;
+ ip.vid = vid;
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -1368,7 +1369,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
return -EINVAL;
if (iph->protocol != IPPROTO_IGMP) {
- if ((iph->daddr & IGMP_LOCAL_GROUP_MASK) != IGMP_LOCAL_GROUP)
+ if (!ipv4_is_local_multicast(iph->daddr))
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
return 0;
}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 27aa3ee517ce..8e3abf564798 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -29,6 +29,7 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_MODE */
+ nla_total_size(1) /* IFLA_BRPORT_GUARD */
+ nla_total_size(1) /* IFLA_BRPORT_PROTECT */
+ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */
+ 0;
}
@@ -135,10 +136,7 @@ static int br_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
pvid = br_get_pvid(pv);
- for (vid = find_first_bit(pv->vlan_bitmap, BR_VLAN_BITMAP_LEN);
- vid < BR_VLAN_BITMAP_LEN;
- vid = find_next_bit(pv->vlan_bitmap,
- BR_VLAN_BITMAP_LEN, vid+1)) {
+ for_each_set_bit(vid, pv->vlan_bitmap, BR_VLAN_BITMAP_LEN) {
vinfo.vid = vid;
vinfo.flags = 0;
if (vid == pvid)
@@ -329,6 +327,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
if (tb[IFLA_BRPORT_COST]) {
err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
@@ -353,17 +352,14 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
/* Change state and parameters on port. */
int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
{
- struct ifinfomsg *ifm;
struct nlattr *protinfo;
struct nlattr *afspec;
struct net_bridge_port *p;
struct nlattr *tb[IFLA_BRPORT_MAX + 1];
- int err;
-
- ifm = nlmsg_data(nlh);
+ int err = 0;
- protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO);
- afspec = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_AF_SPEC);
+ protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (!protinfo && !afspec)
return 0;
@@ -371,7 +367,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
/* We want to accept dev as bridge itself if the AF_SPEC
* is set to see if someone is setting vlan info on the brigde
*/
- if (!p && ((dev->priv_flags & IFF_EBRIDGE) && !afspec))
+ if (!p && !afspec)
return -EINVAL;
if (p && protinfo) {
@@ -412,14 +408,11 @@ out:
/* Delete port information */
int br_dellink(struct net_device *dev, struct nlmsghdr *nlh)
{
- struct ifinfomsg *ifm;
struct nlattr *afspec;
struct net_bridge_port *p;
int err;
- ifm = nlmsg_data(nlh);
-
- afspec = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_AF_SPEC);
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (!afspec)
return 0;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6d314c4e6bcb..3cbf5beb3d4b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -442,7 +442,7 @@ extern int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb);
extern struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
- struct sk_buff *skb);
+ struct sk_buff *skb, u16 vid);
extern void br_multicast_add_port(struct net_bridge_port *port);
extern void br_multicast_del_port(struct net_bridge_port *port);
extern void br_multicast_enable_port(struct net_bridge_port *port);
@@ -504,7 +504,7 @@ static inline int br_multicast_rcv(struct net_bridge *br,
}
static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
- struct sk_buff *skb)
+ struct sk_buff *skb, u16 vid)
{
return NULL;
}
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 92de5e5f9db2..9878eb8204c5 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,6 +78,11 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
const char *prefix)
{
unsigned int bitmask;
+ struct net *net = dev_net(in ? in : out);
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
spin_lock_bh(&ebt_log_lock);
printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
@@ -176,17 +181,18 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_log_info *info = par->targinfo;
struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = info->loglevel;
li.u.log.logflags = info->bitmask;
if (info->bitmask & EBT_LOG_NFLOG)
- nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
- par->out, &li, "%s", info->prefix);
+ nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
+ par->in, par->out, &li, "%s", info->prefix);
else
ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
- par->out, &li, info->prefix);
+ par->out, &li, info->prefix);
return EBT_CONTINUE;
}
@@ -206,19 +212,47 @@ static struct nf_logger ebt_log_logger __read_mostly = {
.me = THIS_MODULE,
};
+static int __net_init ebt_log_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_BRIDGE, &ebt_log_logger);
+ return 0;
+}
+
+static void __net_exit ebt_log_net_fini(struct net *net)
+{
+ nf_log_unset(net, &ebt_log_logger);
+}
+
+static struct pernet_operations ebt_log_net_ops = {
+ .init = ebt_log_net_init,
+ .exit = ebt_log_net_fini,
+};
+
static int __init ebt_log_init(void)
{
int ret;
+ ret = register_pernet_subsys(&ebt_log_net_ops);
+ if (ret < 0)
+ goto err_pernet;
+
ret = xt_register_target(&ebt_log_tg_reg);
if (ret < 0)
- return ret;
+ goto err_target;
+
nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger);
- return 0;
+
+ return ret;
+
+err_target:
+ unregister_pernet_subsys(&ebt_log_net_ops);
+err_pernet:
+ return ret;
}
static void __exit ebt_log_fini(void)
{
+ unregister_pernet_subsys(&ebt_log_net_ops);
nf_log_unregister(&ebt_log_logger);
xt_unregister_target(&ebt_log_tg_reg);
}
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 5be68bbcc341..59ac7952010d 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -24,14 +24,15 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nflog_info *info = par->targinfo;
struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
- nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out,
- &li, "%s", info->prefix);
+ nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
+ par->out, &li, "%s", info->prefix);
return EBT_CONTINUE;
}
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 3bf43f7bb9d4..fc1905c51417 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -35,12 +35,13 @@
#include <linux/skbuff.h>
#include <linux/kernel.h>
#include <linux/timer.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netdevice.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_ulog.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include "../br_private.h"
@@ -62,13 +63,22 @@ typedef struct {
spinlock_t lock; /* the per-queue lock */
} ebt_ulog_buff_t;
-static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
-static struct sock *ebtulognl;
+static int ebt_ulog_net_id __read_mostly;
+struct ebt_ulog_net {
+ unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS];
+ ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
+ struct sock *ebtulognl;
+};
+
+static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net)
+{
+ return net_generic(net, ebt_ulog_net_id);
+}
/* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroup)
+static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup)
{
- ebt_ulog_buff_t *ub = &ulog_buffers[nlgroup];
+ ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup];
del_timer(&ub->timer);
@@ -80,7 +90,7 @@ static void ulog_send(unsigned int nlgroup)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
- netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
+ netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -89,10 +99,15 @@ static void ulog_send(unsigned int nlgroup)
/* timer function to flush queue in flushtimeout time */
static void ulog_timer(unsigned long data)
{
- spin_lock_bh(&ulog_buffers[data].lock);
- if (ulog_buffers[data].skb)
- ulog_send(data);
- spin_unlock_bh(&ulog_buffers[data].lock);
+ struct ebt_ulog_net *ebt = container_of((void *)data,
+ struct ebt_ulog_net,
+ nlgroup[*(unsigned int *)data]);
+
+ ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data];
+ spin_lock_bh(&ub->lock);
+ if (ub->skb)
+ ulog_send(ebt, *(unsigned int *)data);
+ spin_unlock_bh(&ub->lock);
}
static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -123,8 +138,10 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
ebt_ulog_packet_msg_t *pm;
size_t size, copy_len;
struct nlmsghdr *nlh;
+ struct net *net = dev_net(in ? in : out);
+ struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
unsigned int group = uloginfo->nlgroup;
- ebt_ulog_buff_t *ub = &ulog_buffers[group];
+ ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group];
spinlock_t *lock = &ub->lock;
ktime_t kt;
@@ -134,7 +151,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
else
copy_len = uloginfo->cprange;
- size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+ size = nlmsg_total_size(sizeof(*pm) + copy_len);
if (size > nlbufsiz) {
pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz);
return;
@@ -146,7 +163,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
if (!(ub->skb = ulog_alloc_skb(size)))
goto unlock;
} else if (size > skb_tailroom(ub->skb)) {
- ulog_send(group);
+ ulog_send(ebt, group);
if (!(ub->skb = ulog_alloc_skb(size)))
goto unlock;
@@ -205,7 +222,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
ub->lastnlh = nlh;
if (ub->qlen >= uloginfo->qthreshold)
- ulog_send(group);
+ ulog_send(ebt, group);
else if (!timer_pending(&ub->timer)) {
ub->timer.expires = jiffies + flushtimeout * HZ / 100;
add_timer(&ub->timer);
@@ -277,56 +294,89 @@ static struct nf_logger ebt_ulog_logger __read_mostly = {
.me = THIS_MODULE,
};
-static int __init ebt_ulog_init(void)
+static int __net_init ebt_ulog_net_init(struct net *net)
{
- int ret;
int i;
+ struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
+
struct netlink_kernel_cfg cfg = {
.groups = EBT_ULOG_MAXNLGROUPS,
};
- if (nlbufsiz >= 128*1024) {
- pr_warning("Netlink buffer has to be <= 128kB,"
- " please try a smaller nlbufsiz parameter.\n");
- return -EINVAL;
- }
-
/* initialize ulog_buffers */
for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
- setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
- spin_lock_init(&ulog_buffers[i].lock);
+ ebt->nlgroup[i] = i;
+ setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer,
+ (unsigned long)&ebt->nlgroup[i]);
+ spin_lock_init(&ebt->ulog_buffers[i].lock);
}
- ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
- if (!ebtulognl)
- ret = -ENOMEM;
- else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0)
- netlink_kernel_release(ebtulognl);
+ ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
+ if (!ebt->ebtulognl)
+ return -ENOMEM;
- if (ret == 0)
- nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
-
- return ret;
+ nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger);
+ return 0;
}
-static void __exit ebt_ulog_fini(void)
+static void __net_exit ebt_ulog_net_fini(struct net *net)
{
- ebt_ulog_buff_t *ub;
int i;
+ struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
- nf_log_unregister(&ebt_ulog_logger);
- xt_unregister_target(&ebt_ulog_tg_reg);
+ nf_log_unset(net, &ebt_ulog_logger);
for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
- ub = &ulog_buffers[i];
+ ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i];
del_timer(&ub->timer);
- spin_lock_bh(&ub->lock);
+
if (ub->skb) {
kfree_skb(ub->skb);
ub->skb = NULL;
}
- spin_unlock_bh(&ub->lock);
}
- netlink_kernel_release(ebtulognl);
+ netlink_kernel_release(ebt->ebtulognl);
+}
+
+static struct pernet_operations ebt_ulog_net_ops = {
+ .init = ebt_ulog_net_init,
+ .exit = ebt_ulog_net_fini,
+ .id = &ebt_ulog_net_id,
+ .size = sizeof(struct ebt_ulog_net),
+};
+
+static int __init ebt_ulog_init(void)
+{
+ int ret;
+
+ if (nlbufsiz >= 128*1024) {
+ pr_warn("Netlink buffer has to be <= 128kB,"
+ "please try a smaller nlbufsiz parameter.\n");
+ return -EINVAL;
+ }
+
+ ret = register_pernet_subsys(&ebt_ulog_net_ops);
+ if (ret)
+ goto out_pernet;
+
+ ret = xt_register_target(&ebt_ulog_tg_reg);
+ if (ret)
+ goto out_target;
+
+ nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
+
+ return 0;
+
+out_target:
+ unregister_pernet_subsys(&ebt_ulog_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit ebt_ulog_fini(void)
+{
+ nf_log_unregister(&ebt_ulog_logger);
+ xt_unregister_target(&ebt_ulog_tg_reg);
+ unregister_pernet_subsys(&ebt_ulog_net_ops);
}
module_init(ebt_ulog_init);
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 40d8258bf74f..70f656ce0f4a 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -64,9 +64,7 @@ static int ebt_broute(struct sk_buff *skb)
static int __net_init broute_net_init(struct net *net)
{
net->xt.broute_table = ebt_register_table(net, &broute_table);
- if (IS_ERR(net->xt.broute_table))
- return PTR_ERR(net->xt.broute_table);
- return 0;
+ return PTR_RET(net->xt.broute_table);
}
static void __net_exit broute_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 8d493c91a562..3d110c4fc787 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -138,7 +138,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
ethproto = h->h_proto;
if (e->bitmask & EBT_802_3) {
- if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO))
+ if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO))
return 1;
} else if (!(e->bitmask & EBT_NOPROTO) &&
FWINV2(e->ethproto != ethproto, EBT_IPROTO))
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 21760f008974..df6d56d8689a 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -301,10 +301,11 @@ static void dev_flowctrl(struct net_device *dev, int on)
}
void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
- struct cflayer *link_support, int head_room,
- struct cflayer **layer, int (**rcv_func)(
- struct sk_buff *, struct net_device *,
- struct packet_type *, struct net_device *))
+ struct cflayer *link_support, int head_room,
+ struct cflayer **layer,
+ int (**rcv_func)(struct sk_buff *, struct net_device *,
+ struct packet_type *,
+ struct net_device *))
{
struct caif_device_entry *caifd;
enum cfcnfg_phy_preference pref;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 095259f83902..630b8be6e748 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -197,8 +197,8 @@ static void cfsk_put(struct cflayer *layr)
/* Packet Control Callback function called from CAIF */
static void caif_ctrl_cb(struct cflayer *layr,
- enum caif_ctrlcmd flow,
- int phyid)
+ enum caif_ctrlcmd flow,
+ int phyid)
{
struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
switch (flow) {
@@ -274,7 +274,7 @@ static void caif_check_flow_release(struct sock *sk)
* changed locking, address handling and added MSG_TRUNC.
*/
static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *m, size_t len, int flags)
+ struct msghdr *m, size_t len, int flags)
{
struct sock *sk = sock->sk;
@@ -286,6 +286,8 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
if (m->msg_flags&MSG_OOB)
goto read_error;
+ m->msg_namelen = 0;
+
skb = skb_recv_datagram(sk, flags, 0 , &ret);
if (!skb)
goto read_error;
@@ -346,8 +348,8 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
* changed locking calls, changed address handling.
*/
static int caif_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size,
- int flags)
+ struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
int copied = 0;
@@ -462,7 +464,7 @@ out:
* CAIF flow-on and sock_writable.
*/
static long caif_wait_for_flow_on(struct caifsock *cf_sk,
- int wait_writeable, long timeo, int *err)
+ int wait_writeable, long timeo, int *err)
{
struct sock *sk = &cf_sk->sk;
DEFINE_WAIT(wait);
@@ -516,7 +518,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
/* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */
static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+ struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
@@ -591,7 +593,7 @@ err:
* and other minor adaptations.
*/
static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+ struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
@@ -670,7 +672,7 @@ out_err:
}
static int setsockopt(struct socket *sock,
- int lvl, int opt, char __user *ov, unsigned int ol)
+ int lvl, int opt, char __user *ov, unsigned int ol)
{
struct sock *sk = sock->sk;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
@@ -932,7 +934,7 @@ static int caif_release(struct socket *sock)
/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
static unsigned int caif_poll(struct file *file,
- struct socket *sock, poll_table *wait)
+ struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
unsigned int mask;
@@ -1022,7 +1024,7 @@ static void caif_sock_destructor(struct sock *sk)
}
static int caif_create(struct net *net, struct socket *sock, int protocol,
- int kern)
+ int kern)
{
struct sock *sk = NULL;
struct caifsock *cf_sk = NULL;
diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
index ef8ebaa993cf..d76278d644b8 100644
--- a/net/caif/caif_usb.c
+++ b/net/caif/caif_usb.c
@@ -75,7 +75,7 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt)
}
static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid)
+ int phyid)
{
if (layr->up && layr->up->ctrlcmd)
layr->up->ctrlcmd(layr->up, ctrl, layr->id);
@@ -121,7 +121,7 @@ static struct packet_type caif_usb_type __read_mostly = {
};
static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
- void *arg)
+ void *arg)
{
struct net_device *dev = arg;
struct caif_dev_common common;
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index f1dbddb95a6c..246ac3aa8de5 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -61,11 +61,11 @@ struct cfcnfg {
};
static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id,
- enum cfctrl_srv serv, u8 phyid,
- struct cflayer *adapt_layer);
+ enum cfctrl_srv serv, u8 phyid,
+ struct cflayer *adapt_layer);
static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id);
static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
- struct cflayer *adapt_layer);
+ struct cflayer *adapt_layer);
static void cfctrl_resp_func(void);
static void cfctrl_enum_resp(void);
@@ -131,7 +131,7 @@ static void cfctrl_resp_func(void)
}
static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg,
- u8 phyid)
+ u8 phyid)
{
struct cfcnfg_phyinfo *phy;
@@ -216,8 +216,8 @@ static const int protohead[CFCTRL_SRV_MASK] = {
static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
- struct caif_connect_request *s,
- struct cfctrl_link_param *l)
+ struct caif_connect_request *s,
+ struct cfctrl_link_param *l)
{
struct dev_info *dev_info;
enum cfcnfg_phy_preference pref;
@@ -301,8 +301,7 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
int caif_connect_client(struct net *net, struct caif_connect_request *conn_req,
struct cflayer *adap_layer, int *ifindex,
- int *proto_head,
- int *proto_tail)
+ int *proto_head, int *proto_tail)
{
struct cflayer *frml;
struct cfcnfg_phyinfo *phy;
@@ -364,7 +363,7 @@ unlock:
EXPORT_SYMBOL(caif_connect_client);
static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
- struct cflayer *adapt_layer)
+ struct cflayer *adapt_layer)
{
if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL)
adapt_layer->ctrlcmd(adapt_layer,
@@ -526,7 +525,7 @@ out_err:
EXPORT_SYMBOL(cfcnfg_add_phy_layer);
int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer,
- bool up)
+ bool up)
{
struct cfcnfg_phyinfo *phyinfo;
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index a376ec1ac0a7..9cd057c59c59 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -20,12 +20,12 @@
#ifdef CAIF_NO_LOOP
static int handle_loop(struct cfctrl *ctrl,
- int cmd, struct cfpkt *pkt){
+ int cmd, struct cfpkt *pkt){
return -1;
}
#else
static int handle_loop(struct cfctrl *ctrl,
- int cmd, struct cfpkt *pkt);
+ int cmd, struct cfpkt *pkt);
#endif
static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt);
static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
@@ -72,7 +72,7 @@ void cfctrl_remove(struct cflayer *layer)
}
static bool param_eq(const struct cfctrl_link_param *p1,
- const struct cfctrl_link_param *p2)
+ const struct cfctrl_link_param *p2)
{
bool eq =
p1->linktype == p2->linktype &&
@@ -197,8 +197,8 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
}
int cfctrl_linkup_request(struct cflayer *layer,
- struct cfctrl_link_param *param,
- struct cflayer *user_layer)
+ struct cfctrl_link_param *param,
+ struct cflayer *user_layer)
{
struct cfctrl *cfctrl = container_obj(layer);
u32 tmp32;
@@ -301,7 +301,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
}
int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
- struct cflayer *client)
+ struct cflayer *client)
{
int ret;
struct cfpkt *pkt;
@@ -555,7 +555,7 @@ error:
}
static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid)
+ int phyid)
{
struct cfctrl *this = container_obj(layr);
switch (ctrl) {
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index 0a7df7ef062d..204c5e226a61 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -28,7 +28,7 @@ struct cffrml {
static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt);
static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid);
+ int phyid);
static u32 cffrml_rcv_error;
static u32 cffrml_rcv_checsum_error;
@@ -167,7 +167,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
}
static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid)
+ int phyid)
{
if (layr->up && layr->up->ctrlcmd)
layr->up->ctrlcmd(layr->up, ctrl, layr->id);
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 94b08612a4d8..154d9f8f964c 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -42,7 +42,7 @@ struct cfmuxl {
static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt);
static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid);
+ int phyid);
static struct cflayer *get_up(struct cfmuxl *muxl, u16 id);
struct cflayer *cfmuxl_create(void)
@@ -244,7 +244,7 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
}
static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid)
+ int phyid)
{
struct cfmuxl *muxl = container_obj(layr);
struct cflayer *layer;
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 863dedd91bb6..e8f9c149504d 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -266,8 +266,8 @@ inline u16 cfpkt_getlen(struct cfpkt *pkt)
}
inline u16 cfpkt_iterate(struct cfpkt *pkt,
- u16 (*iter_func)(u16, void *, u16),
- u16 data)
+ u16 (*iter_func)(u16, void *, u16),
+ u16 data)
{
/*
* Don't care about the performance hit of linearizing,
@@ -307,8 +307,8 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
}
struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
- struct cfpkt *addpkt,
- u16 expectlen)
+ struct cfpkt *addpkt,
+ u16 expectlen)
{
struct sk_buff *dst = pkt_to_skb(dstpkt);
struct sk_buff *add = pkt_to_skb(addpkt);
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index 2b563ad04597..db51830c8587 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -43,7 +43,7 @@ static void cfrfml_release(struct cflayer *layer)
}
struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
- int mtu_size)
+ int mtu_size)
{
int tmp;
struct cfrfml *this = kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
@@ -69,7 +69,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
}
static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead,
- struct cfpkt *pkt, int *err)
+ struct cfpkt *pkt, int *err)
{
struct cfpkt *tmppkt;
*err = -EPROTO;
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index 8e68b97f13ee..147c232b1285 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -29,7 +29,7 @@ struct cfserl {
static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid);
+ int phyid);
struct cflayer *cfserl_create(int instance, bool use_stx)
{
@@ -182,7 +182,7 @@ static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt)
}
static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid)
+ int phyid)
{
layr->up->ctrlcmd(layr->up, ctrl, phyid);
}
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index ba217e90765e..95f7f5ea30ef 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -25,7 +25,7 @@
#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid)
+ int phyid)
{
struct cfsrvl *service = container_obj(layr);
@@ -158,10 +158,9 @@ static void cfsrvl_release(struct cflayer *layer)
}
void cfsrvl_init(struct cfsrvl *service,
- u8 channel_id,
- struct dev_info *dev_info,
- bool supports_flowctrl
- )
+ u8 channel_id,
+ struct dev_info *dev_info,
+ bool supports_flowctrl)
{
caif_assert(offsetof(struct cfsrvl, layer) == 0);
service->open = false;
@@ -207,8 +206,8 @@ void caif_free_client(struct cflayer *adap_layer)
EXPORT_SYMBOL(caif_free_client);
void caif_client_register_refcnt(struct cflayer *adapt_layer,
- void (*hold)(struct cflayer *lyr),
- void (*put)(struct cflayer *lyr))
+ void (*hold)(struct cflayer *lyr),
+ void (*put)(struct cflayer *lyr))
{
struct cfsrvl *service;
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index e597733affb8..26a4e4e3a767 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -167,7 +167,7 @@ static void chnl_put(struct cflayer *lyr)
}
static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
- int phyid)
+ int phyid)
{
struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
pr_debug("NET flowctrl func called flow: %s\n",
@@ -443,7 +443,7 @@ nla_put_failure:
}
static void caif_netlink_parms(struct nlattr *data[],
- struct caif_connect_request *conn_req)
+ struct caif_connect_request *conn_req)
{
if (!data) {
pr_warn("no params data found\n");
@@ -488,7 +488,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
}
static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+ struct nlattr *data[])
{
struct chnl_net *caifdev;
ASSERT_RTNL();
diff --git a/net/can/af_can.c b/net/can/af_can.c
index c48e5220bbac..c4e50852c9f4 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -525,7 +525,7 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
d = find_dev_rcv_lists(dev);
if (!d) {
- printk(KERN_ERR "BUG: receive list not found for "
+ pr_err("BUG: receive list not found for "
"dev %s, id %03X, mask %03X\n",
DNAME(dev), can_id, mask);
goto out;
@@ -546,16 +546,13 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
}
/*
- * Check for bugs in CAN protocol implementations:
- * If no matching list item was found, the list cursor variable next
- * will be NULL, while r will point to the last item of the list.
+ * Check for bugs in CAN protocol implementations using af_can.c:
+ * 'r' will be NULL if no matching list item was found for removal.
*/
if (!r) {
- printk(KERN_ERR "BUG: receive list entry not found for "
- "dev %s, id %03X, mask %03X\n",
- DNAME(dev), can_id, mask);
- r = NULL;
+ WARN(1, "BUG: receive list entry not found for dev %s, "
+ "id %03X, mask %03X\n", DNAME(dev), can_id, mask);
goto out;
}
@@ -749,8 +746,7 @@ int can_proto_register(const struct can_proto *cp)
int err = 0;
if (proto < 0 || proto >= CAN_NPROTO) {
- printk(KERN_ERR "can: protocol number %d out of range\n",
- proto);
+ pr_err("can: protocol number %d out of range\n", proto);
return -EINVAL;
}
@@ -761,8 +757,7 @@ int can_proto_register(const struct can_proto *cp)
mutex_lock(&proto_tab_lock);
if (proto_tab[proto]) {
- printk(KERN_ERR "can: protocol %d already registered\n",
- proto);
+ pr_err("can: protocol %d already registered\n", proto);
err = -EBUSY;
} else
RCU_INIT_POINTER(proto_tab[proto], cp);
@@ -816,11 +811,8 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg,
/* create new dev_rcv_lists for this device */
d = kzalloc(sizeof(*d), GFP_KERNEL);
- if (!d) {
- printk(KERN_ERR
- "can: allocation of receive list failed\n");
+ if (!d)
return NOTIFY_DONE;
- }
BUG_ON(dev->ml_priv);
dev->ml_priv = d;
@@ -838,8 +830,8 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg,
dev->ml_priv = NULL;
}
} else
- printk(KERN_ERR "can: notifier: receive list not "
- "found for dev %s\n", dev->name);
+ pr_err("can: notifier: receive list not found for dev "
+ "%s\n", dev->name);
spin_unlock(&can_rcvlists_lock);
@@ -927,7 +919,7 @@ static __exit void can_exit(void)
/* remove created dev_rcv_lists from still registered CAN devices */
rcu_read_lock();
for_each_netdev_rcu(&init_net, dev) {
- if (dev->type == ARPHRD_CAN && dev->ml_priv){
+ if (dev->type == ARPHRD_CAN && dev->ml_priv) {
struct dev_rcv_lists *d = dev->ml_priv;
diff --git a/net/can/gw.c b/net/can/gw.c
index 2d117dc5ebea..2dc619db805a 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -778,8 +778,7 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
return 0;
}
-static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
- void *arg)
+static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct rtcanmsg *r;
struct cgw_job *gwj;
@@ -868,7 +867,7 @@ static void cgw_remove_all_jobs(void)
}
}
-static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct cgw_job *gwj = NULL;
struct hlist_node *nx;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 69bc4bf89e3e..4543b9aba40c 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -654,6 +654,24 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
return 0;
}
+static int __decode_pgid(void **p, void *end, struct ceph_pg *pg)
+{
+ u8 v;
+
+ ceph_decode_need(p, end, 1+8+4+4, bad);
+ v = ceph_decode_8(p);
+ if (v != 1)
+ goto bad;
+ pg->pool = ceph_decode_64(p);
+ pg->seed = ceph_decode_32(p);
+ *p += 4; /* skip preferred */
+ return 0;
+
+bad:
+ dout("error decoding pgid\n");
+ return -EINVAL;
+}
+
/*
* decode a full map.
*/
@@ -745,13 +763,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
for (i = 0; i < len; i++) {
int n, j;
struct ceph_pg pgid;
- struct ceph_pg_v1 pgid_v1;
struct ceph_pg_mapping *pg;
- ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
- ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
- pgid.pool = le32_to_cpu(pgid_v1.pool);
- pgid.seed = le16_to_cpu(pgid_v1.ps);
+ err = __decode_pgid(p, end, &pgid);
+ if (err)
+ goto bad;
+ ceph_decode_need(p, end, sizeof(u32), bad);
n = ceph_decode_32(p);
err = -EINVAL;
if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
@@ -818,8 +835,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
u16 version;
ceph_decode_16_safe(p, end, version, bad);
- if (version > 6) {
- pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6);
+ if (version != 6) {
+ pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
goto bad;
}
@@ -963,15 +980,14 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
while (len--) {
struct ceph_pg_mapping *pg;
int j;
- struct ceph_pg_v1 pgid_v1;
struct ceph_pg pgid;
u32 pglen;
- ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
- ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
- pgid.pool = le32_to_cpu(pgid_v1.pool);
- pgid.seed = le16_to_cpu(pgid_v1.ps);
- pglen = ceph_decode_32(p);
+ err = __decode_pgid(p, end, &pgid);
+ if (err)
+ goto bad;
+ ceph_decode_need(p, end, sizeof(u32), bad);
+ pglen = ceph_decode_32(p);
if (pglen) {
ceph_decode_need(p, end, pglen*sizeof(u32), bad);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 368f9c3f9dc6..ebba65d7e0da 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -749,7 +749,9 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
diff --git a/net/core/dev.c b/net/core/dev.c
index 8f152f904f70..3655ff927315 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1545,7 +1545,6 @@ void net_enable_timestamp(void)
return;
}
#endif
- WARN_ON(in_interrupt());
static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
@@ -1625,7 +1624,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
}
skb_orphan(skb);
- nf_reset(skb);
if (unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
@@ -1641,6 +1639,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
skb->mark = 0;
secpath_reset(skb);
nf_reset(skb);
+ nf_reset_trace(skb);
return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -2208,30 +2207,40 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/**
- * skb_mac_gso_segment - mac layer segmentation handler.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- */
-struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+__be16 skb_network_protocol(struct sk_buff *skb)
{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_offload *ptype;
__be16 type = skb->protocol;
+ int vlan_depth = ETH_HLEN;
while (type == htons(ETH_P_8021Q)) {
- int vlan_depth = ETH_HLEN;
struct vlan_hdr *vh;
if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
- return ERR_PTR(-EINVAL);
+ return 0;
vh = (struct vlan_hdr *)(skb->data + vlan_depth);
type = vh->h_vlan_encapsulated_proto;
vlan_depth += VLAN_HLEN;
}
+ return type;
+}
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ __be16 type = skb_network_protocol(skb);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
__skb_pull(skb, skb->mac_len);
rcu_read_lock();
@@ -2398,24 +2407,12 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
return 0;
}
-static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
-{
- return ((features & NETIF_F_GEN_CSUM) ||
- ((features & NETIF_F_V4_CSUM) &&
- protocol == htons(ETH_P_IP)) ||
- ((features & NETIF_F_V6_CSUM) &&
- protocol == htons(ETH_P_IPV6)) ||
- ((features & NETIF_F_FCOE_CRC) &&
- protocol == htons(ETH_P_FCOE)));
-}
-
static netdev_features_t harmonize_features(struct sk_buff *skb,
__be16 protocol, netdev_features_t features)
{
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, protocol)) {
features &= ~NETIF_F_ALL_CSUM;
- features &= ~NETIF_F_SG;
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
@@ -2590,6 +2587,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
*/
if (shinfo->gso_size) {
unsigned int hdr_len;
+ u16 gso_segs = shinfo->gso_segs;
/* mac layer + network layer */
hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
@@ -2599,7 +2597,12 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
hdr_len += tcp_hdrlen(skb);
else
hdr_len += sizeof(struct udphdr);
- qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
+
+ if (shinfo->gso_type & SKB_GSO_DODGY)
+ gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
@@ -3315,6 +3318,7 @@ int netdev_rx_handler_register(struct net_device *dev,
if (dev->rx_handler)
return -EBUSY;
+ /* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
@@ -3326,7 +3330,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
* netdev_rx_handler_unregister - unregister receive handler
* @dev: device to unregister a handler from
*
- * Unregister a receive hander from a device.
+ * Unregister a receive handler from a device.
*
* The caller must hold the rtnl_mutex.
*/
@@ -3335,6 +3339,11 @@ void netdev_rx_handler_unregister(struct net_device *dev)
ASSERT_RTNL();
RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
+ * section has a guarantee to see a non NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
@@ -3444,6 +3453,7 @@ ncls:
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
goto unlock;
case RX_HANDLER_ANOTHER:
goto another_round;
@@ -4057,6 +4067,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->gro_list = NULL;
napi->skb = NULL;
napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ pr_err_once("netif_napi_add() called with weight %d on device %s\n",
+ weight, dev->name);
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
@@ -4918,20 +4931,25 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
- /* Fix illegal SG+CSUM combinations. */
- if ((features & NETIF_F_SG) &&
- !(features & NETIF_F_ALL_CSUM)) {
- netdev_dbg(dev,
- "Dropping NETIF_F_SG since no checksum feature.\n");
- features &= ~NETIF_F_SG;
- }
-
/* TSO requires that SG is present as well. */
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
features &= ~NETIF_F_ALL_TSO;
}
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
+ }
+
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
+ }
+
/* TSO ECN requires that TSO is present as well. */
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
features &= ~NETIF_F_TSO_ECN;
@@ -5199,6 +5217,10 @@ int register_netdevice(struct net_device *dev)
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG;
+
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index bd2eb9d3e369..abdc9e6ef33e 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -37,7 +37,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
ha->type = addr_type;
ha->refcount = 1;
ha->global_use = global;
- ha->synced = false;
+ ha->synced = 0;
list_add_tail_rcu(&ha->list, &list->list);
list->count++;
@@ -165,7 +165,7 @@ int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
addr_len, ha->type);
if (err)
break;
- ha->synced = true;
+ ha->synced++;
ha->refcount++;
} else if (ha->refcount == 1) {
__hw_addr_del(to_list, ha->addr, addr_len, ha->type);
@@ -186,7 +186,7 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
if (ha->synced) {
__hw_addr_del(to_list, ha->addr,
addr_len, ha->type);
- ha->synced = false;
+ ha->synced--;
__hw_addr_del(from_list, ha->addr,
addr_len, ha->type);
}
diff --git a/net/core/dst.c b/net/core/dst.c
index 35fd12f1a69c..df9cc810ec8e 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -320,27 +320,28 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
/**
- * skb_dst_set_noref - sets skb dst, without a reference
+ * __skb_dst_set_noref - sets skb dst, without a reference
* @skb: buffer
* @dst: dst entry
+ * @force: if force is set, use noref version even for DST_NOCACHE entries
*
* Sets skb dst, assuming a reference was not taken on dst
* skb_dst_drop() should not dst_release() this dst
*/
-void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
{
WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
/* If dst not in cache, we must take a reference, because
* dst_release() will destroy dst as soon as its refcount becomes zero
*/
- if (unlikely(dst->flags & DST_NOCACHE)) {
+ if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
dst_hold(dst);
skb_dst_set(skb, dst);
} else {
skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
}
}
-EXPORT_SYMBOL(skb_dst_set_noref);
+EXPORT_SYMBOL(__skb_dst_set_noref);
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 3e9b2c3e30f0..adc1351e6873 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -78,6 +78,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 58a4ba27dfe3..d5a9f8ead0d8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -266,7 +266,7 @@ errout:
return err;
}
-static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
@@ -415,7 +415,7 @@ errout:
return err;
}
-static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
diff --git a/net/core/filter.c b/net/core/filter.c
index 2e20b55a7830..dad2a178f9f8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -348,6 +348,9 @@ load_b:
case BPF_S_ANC_VLAN_TAG_PRESENT:
A = !!vlan_tx_tag_present(skb);
continue;
+ case BPF_S_ANC_PAY_OFFSET:
+ A = __skb_get_poff(skb);
+ continue;
case BPF_S_ANC_NLATTR: {
struct nlattr *nla;
@@ -612,6 +615,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
ANCILLARY(ALU_XOR_X);
ANCILLARY(VLAN_TAG);
ANCILLARY(VLAN_TAG_PRESENT);
+ ANCILLARY(PAY_OFFSET);
}
/* ancillary operation unknown or unsupported */
@@ -814,6 +818,7 @@ static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
[BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
+ [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS,
[BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN,
[BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND,
[BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND,
diff --git a/net/core/flow.c b/net/core/flow.c
index c56ea6f7f6c7..7102f166482d 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -323,12 +323,30 @@ static void flow_cache_flush_tasklet(unsigned long data)
complete(&info->completion);
}
+/*
+ * Return whether a cpu needs flushing. Conservatively, we assume
+ * the presence of any entries means the core may require flushing,
+ * since the flow_cache_ops.check() function may assume it's running
+ * on the same core as the per-cpu cache component.
+ */
+static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
+{
+ struct flow_cache_percpu *fcp;
+ int i;
+
+ fcp = per_cpu_ptr(fc->percpu, cpu);
+ for (i = 0; i < flow_cache_hash_size(fc); i++)
+ if (!hlist_empty(&fcp->hash_table[i]))
+ return 0;
+ return 1;
+}
+
static void flow_cache_flush_per_cpu(void *data)
{
struct flow_flush_info *info = data;
struct tasklet_struct *tasklet;
- tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet);
+ tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
tasklet->data = (unsigned long)info;
tasklet_schedule(tasklet);
}
@@ -337,22 +355,40 @@ void flow_cache_flush(void)
{
struct flow_flush_info info;
static DEFINE_MUTEX(flow_flush_sem);
+ cpumask_var_t mask;
+ int i, self;
+
+ /* Track which cpus need flushing to avoid disturbing all cores. */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return;
+ cpumask_clear(mask);
/* Don't want cpus going down or up during this. */
get_online_cpus();
mutex_lock(&flow_flush_sem);
info.cache = &flow_cache_global;
- atomic_set(&info.cpuleft, num_online_cpus());
+ for_each_online_cpu(i)
+ if (!flow_cache_percpu_empty(info.cache, i))
+ cpumask_set_cpu(i, mask);
+ atomic_set(&info.cpuleft, cpumask_weight(mask));
+ if (atomic_read(&info.cpuleft) == 0)
+ goto done;
+
init_completion(&info.completion);
local_bh_disable();
- smp_call_function(flow_cache_flush_per_cpu, &info, 0);
- flow_cache_flush_tasklet((unsigned long)&info);
+ self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
+ on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
+ if (self)
+ flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
+
+done:
mutex_unlock(&flow_flush_sem);
put_online_cpus();
+ free_cpumask_var(mask);
}
static void flow_cache_flush_task(struct work_struct *work)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 9d4c7201400d..00ee068efc1c 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -5,6 +5,10 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <linux/igmp.h>
+#include <linux/icmp.h>
+#include <linux/sctp.h>
+#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
@@ -119,6 +123,17 @@ ipv6:
nhoff += 4;
if (hdr->flags & GRE_SEQ)
nhoff += 4;
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = skb_header_pointer(skb, nhoff,
+ sizeof(_eth), &_eth);
+ if (!eth)
+ return false;
+ proto = eth->h_proto;
+ nhoff += sizeof(*eth);
+ }
goto again;
}
break;
@@ -140,6 +155,8 @@ ipv6:
flow->ports = *ports;
}
+ flow->thoff = (u16) nhoff;
+
return true;
}
EXPORT_SYMBOL(skb_flow_dissect);
@@ -215,6 +232,59 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
}
EXPORT_SYMBOL(__skb_tx_hash);
+/* __skb_get_poff() returns the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push actual payload to the user
+ * space and can analyze headers only, instead.
+ */
+u32 __skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 poff = 0;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return 0;
+
+ poff += keys.thoff;
+ switch (keys.ip_proto) {
+ case IPPROTO_TCP: {
+ const struct tcphdr *tcph;
+ struct tcphdr _tcph;
+
+ tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return poff;
+
+ poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4);
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ poff += sizeof(struct udphdr);
+ break;
+ /* For the rest, we do not really care about header
+ * extensions at this point for now.
+ */
+ case IPPROTO_ICMP:
+ poff += sizeof(struct icmphdr);
+ break;
+ case IPPROTO_ICMPV6:
+ poff += sizeof(struct icmp6hdr);
+ break;
+ case IPPROTO_IGMP:
+ poff += sizeof(struct igmphdr);
+ break;
+ case IPPROTO_DCCP:
+ poff += sizeof(struct dccp_hdr);
+ break;
+ case IPPROTO_SCTP:
+ poff += sizeof(struct sctphdr);
+ break;
+ }
+
+ return poff;
+}
+
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
if (unlikely(queue_index >= dev->real_num_tx_queues)) {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3863b8f639c5..c72a646d9f44 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1613,7 +1613,7 @@ int neigh_table_clear(struct neigh_table *tbl)
}
EXPORT_SYMBOL(neigh_table_clear);
-static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
@@ -1677,7 +1677,7 @@ out:
return err;
}
-static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
@@ -1955,7 +1955,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
};
-static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct neigh_table *tbl;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 3174f1998ee6..569d355fec3e 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -271,7 +271,7 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
else
seq_printf(seq, "%04x", ntohs(pt->type));
- seq_printf(seq, " %-8s %pF\n",
+ seq_printf(seq, " %-8s %pf\n",
pt->dev ? pt->dev->name : "", pt->func);
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index fa32899006a2..a3a17aed3639 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -47,7 +47,7 @@ static struct sk_buff_head skb_pool;
static atomic_t trapped;
-static struct srcu_struct netpoll_srcu;
+DEFINE_STATIC_SRCU(netpoll_srcu);
#define USEC_PER_POLL 50
#define NETPOLL_RX_ENABLED 1
@@ -1212,7 +1212,6 @@ EXPORT_SYMBOL(netpoll_setup);
static int __init netpoll_init(void)
{
skb_queue_head_init(&skb_pool);
- init_srcu_struct(&netpoll_srcu);
return 0;
}
core_initcall(netpoll_init);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b376410ff259..589d0abb34a0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -496,8 +496,10 @@ static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
}
if (ops->fill_info) {
data = nla_nest_start(skb, IFLA_INFO_DATA);
- if (data == NULL)
+ if (data == NULL) {
+ err = -EMSGSIZE;
goto err_cancel_link;
+ }
err = ops->fill_info(skb, dev);
if (err < 0)
goto err_cancel_data;
@@ -515,32 +517,6 @@ out:
return err;
}
-static const int rtm_min[RTM_NR_FAMILIES] =
-{
- [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
- [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
- [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)),
- [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
- [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)),
- [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)),
- [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
- [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
-};
-
-static const int rta_max[RTM_NR_FAMILIES] =
-{
- [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX,
- [RTM_FAM(RTM_NEWADDR)] = IFA_MAX,
- [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX,
- [RTM_FAM(RTM_NEWRULE)] = FRA_MAX,
- [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
- [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
- [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
-};
-
int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
struct sock *rtnl = net->rtnl;
@@ -979,6 +955,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
* report anything.
*/
ivi.spoofchk = -1;
+ memset(ivi.mac, 0, sizeof(ivi.mac));
if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
break;
vf_mac.vf =
@@ -1536,7 +1513,7 @@ errout:
return err;
}
-static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
@@ -1577,7 +1554,7 @@ errout:
return err;
}
-static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
@@ -1708,7 +1685,7 @@ static int rtnl_group_changelink(struct net *net, int group,
return 0;
}
-static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
@@ -1863,7 +1840,7 @@ out:
}
}
-static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
@@ -1954,8 +1931,11 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (rtnl_msg_handlers[idx] == NULL ||
rtnl_msg_handlers[idx][type].dumpit == NULL)
continue;
- if (idx > s_idx)
+ if (idx > s_idx) {
memset(&cb->args[0], 0, sizeof(cb->args));
+ cb->prev_seq = 0;
+ cb->seq = 0;
+ }
if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
break;
}
@@ -2048,7 +2028,39 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
-static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+/**
+ * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
+ */
+int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 flags)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_add_excl(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_add_excl(dev, addr);
+
+ /* Only return duplicate errors if NLM_F_EXCL is set */
+ if (err == -EEXIST && !(flags & NLM_F_EXCL))
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_add);
+
+static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
@@ -2079,7 +2091,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
}
addr = nla_data(tb[NDA_LLADDR]);
- if (!is_valid_ether_addr(addr)) {
+ if (is_zero_ether_addr(addr)) {
pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ether address\n");
return -EINVAL;
}
@@ -2100,10 +2112,13 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
}
/* Embedded bridge, macvlan, and any other device support */
- if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) {
- err = dev->netdev_ops->ndo_fdb_add(ndm, tb,
- dev, addr,
- nlh->nlmsg_flags);
+ if ((ndm->ndm_flags & NTF_SELF)) {
+ if (dev->netdev_ops->ndo_fdb_add)
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ nlh->nlmsg_flags);
+ else
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+ nlh->nlmsg_flags);
if (!err) {
rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH);
@@ -2114,7 +2129,36 @@ out:
return err;
}
-static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+/**
+ * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
+ */
+int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr)
+{
+ int err = -EOPNOTSUPP;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state & NUD_PERMANENT) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return -EINVAL;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_del(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_del(dev, addr);
+ else
+ err = -EINVAL;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_del);
+
+static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
@@ -2171,8 +2215,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
}
/* Embedded bridge, macvlan, and any other device support */
- if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_del) {
- err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+ if (ndm->ndm_flags & NTF_SELF) {
+ if (dev->netdev_ops->ndo_fdb_del)
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
if (!err) {
rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
@@ -2217,7 +2264,7 @@ skip:
* @dev: netdevice
*
* Default netdevice operation to dump the existing unicast address list.
- * Returns zero on success.
+ * Returns number of addresses from list put in skb.
*/
int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
@@ -2257,6 +2304,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (dev->netdev_ops->ndo_fdb_dump)
idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx);
+ else
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, idx);
}
rcu_read_unlock();
@@ -2408,8 +2457,7 @@ errout:
return err;
}
-static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
- void *arg)
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
@@ -2479,8 +2527,7 @@ out:
return err;
}
-static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
- void *arg)
+static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
@@ -2550,10 +2597,6 @@ out:
return err;
}
-/* Protected by RTNL sempahore. */
-static struct rtattr **rta_buf;
-static int rtattr_max;
-
/* Process one rtnetlink message. */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -2561,7 +2604,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct net *net = sock_net(skb->sk);
rtnl_doit_func doit;
int sz_idx, kind;
- int min_len;
int family;
int type;
int err;
@@ -2573,10 +2615,10 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
type -= RTM_BASE;
/* All the messages must have at least 1 byte length */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
+ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
return 0;
- family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family;
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
sz_idx = type>>2;
kind = type&3;
@@ -2609,32 +2651,11 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
- memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
-
- min_len = rtm_min[sz_idx];
- if (nlh->nlmsg_len < min_len)
- return -EINVAL;
-
- if (nlh->nlmsg_len > min_len) {
- int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
- struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len);
-
- while (RTA_OK(attr, attrlen)) {
- unsigned int flavor = attr->rta_type;
- if (flavor) {
- if (flavor > rta_max[sz_idx])
- return -EINVAL;
- rta_buf[flavor-1] = attr;
- }
- attr = RTA_NEXT(attr, attrlen);
- }
- }
-
doit = rtnl_get_doit(family, type);
if (doit == NULL)
return -EOPNOTSUPP;
- return doit(skb, nlh, (void *)&rta_buf[0]);
+ return doit(skb, nlh);
}
static void rtnetlink_rcv(struct sk_buff *skb)
@@ -2704,16 +2725,6 @@ static struct pernet_operations rtnetlink_net_ops = {
void __init rtnetlink_init(void)
{
- int i;
-
- rtattr_max = 0;
- for (i = 0; i < ARRAY_SIZE(rta_max); i++)
- if (rta_max[i] > rtattr_max)
- rtattr_max = rta_max[i];
- rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL);
- if (!rta_buf)
- panic("rtnetlink_init: cannot allocate rta_buf\n");
-
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
diff --git a/net/core/scm.c b/net/core/scm.c
index 905dcc6ad1e3..03795d0147f2 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -24,6 +24,7 @@
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/security.h>
+#include <linux/pid_namespace.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
@@ -52,7 +53,8 @@ static __inline__ int scm_check_creds(struct ucred *creds)
if (!uid_valid(uid) || !gid_valid(gid))
return -EINVAL;
- if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) &&
+ if ((creds->pid == task_tgid_vnr(current) ||
+ ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) &&
((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) &&
((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
@@ -185,22 +187,6 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
p->creds.uid = uid;
p->creds.gid = gid;
-
- if (!p->cred ||
- !uid_eq(p->cred->euid, uid) ||
- !gid_eq(p->cred->egid, gid)) {
- struct cred *cred;
- err = -ENOMEM;
- cred = prepare_creds();
- if (!cred)
- goto error;
-
- cred->uid = cred->euid = uid;
- cred->gid = cred->egid = gid;
- if (p->cred)
- put_cred(p->cred);
- p->cred = cred;
- }
break;
}
default:
@@ -304,8 +290,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
/* Bump the usage count and install the file. */
sock = sock_from_file(fp[i], &err);
if (sock) {
- sock_update_netprioidx(sock->sk, current);
- sock_update_classid(sock->sk, current);
+ sock_update_netprioidx(sock->sk);
+ sock_update_classid(sock->sk);
}
fd_install(new_fd, get_file(fp[i]));
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 33245ef54c3b..ba646145cd5c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -673,6 +673,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->mac_header = old->mac_header;
new->inner_transport_header = old->inner_transport_header;
new->inner_network_header = old->inner_network_header;
+ new->inner_mac_header = old->inner_mac_header;
skb_dst_copy(new, old);
new->rxhash = old->rxhash;
new->ooo_okay = old->ooo_okay;
@@ -867,6 +868,18 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
}
EXPORT_SYMBOL(skb_clone);
+static void skb_headers_offset_update(struct sk_buff *skb, int off)
+{
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->transport_header += off;
+ skb->network_header += off;
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += off;
+ skb->inner_transport_header += off;
+ skb->inner_network_header += off;
+ skb->inner_mac_header += off;
+}
+
static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
#ifndef NET_SKBUFF_DATA_USES_OFFSET
@@ -879,13 +892,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
__copy_skb_header(new, old);
#ifndef NET_SKBUFF_DATA_USES_OFFSET
- /* {transport,network,mac}_header are relative to skb->head */
- new->transport_header += offset;
- new->network_header += offset;
- if (skb_mac_header_was_set(new))
- new->mac_header += offset;
- new->inner_transport_header += offset;
- new->inner_network_header += offset;
+ skb_headers_offset_update(new, offset);
#endif
skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
@@ -1077,14 +1084,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
#else
skb->end = skb->head + size;
#endif
- /* {transport,network,mac}_header and tail are relative to skb->head */
skb->tail += off;
- skb->transport_header += off;
- skb->network_header += off;
- if (skb_mac_header_was_set(skb))
- skb->mac_header += off;
- skb->inner_transport_header += off;
- skb->inner_network_header += off;
+ skb_headers_offset_update(skb, off);
/* Only adjust this if it actually is csum_start rather than csum */
if (skb->ip_summed == CHECKSUM_PARTIAL)
skb->csum_start += nhead;
@@ -1180,12 +1181,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
if (n->ip_summed == CHECKSUM_PARTIAL)
n->csum_start += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
- n->transport_header += off;
- n->network_header += off;
- if (skb_mac_header_was_set(skb))
- n->mac_header += off;
- n->inner_transport_header += off;
- n->inner_network_header += off;
+ skb_headers_offset_update(n, off);
#endif
return n;
@@ -2741,12 +2737,19 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
unsigned int tnl_hlen = skb_tnl_header_len(skb);
unsigned int headroom;
unsigned int len;
+ __be16 proto;
+ bool csum;
int sg = !!(features & NETIF_F_SG);
int nfrags = skb_shinfo(skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
+ proto = skb_network_protocol(skb);
+ if (unlikely(!proto))
+ return ERR_PTR(-EINVAL);
+
+ csum = !!can_checksum_protocol(features, proto);
__skb_push(skb, doffset);
headroom = skb_headroom(skb);
pos = skb_headlen(skb);
@@ -2884,6 +2887,12 @@ skip_fraglist:
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
+
+ if (!csum) {
+ nskb->csum = skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
+ nskb->ip_summed = CHECKSUM_NONE;
+ }
} while ((offset += len) < skb->len);
return segs;
@@ -3361,6 +3370,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) + start;
skb->csum_offset = off;
+ skb_set_transport_header(skb, start);
return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);
diff --git a/net/core/sock.c b/net/core/sock.c
index b261a7977746..d4f4cea726e7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -907,6 +907,10 @@ set_rcvbuf:
sock_valbool_flag(sk, SOCK_NOFCS, valbool);
break;
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1160,6 +1164,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
break;
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
+
default:
return -ENOPROTOOPT;
}
@@ -1298,13 +1306,12 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
module_put(owner);
}
-#ifdef CONFIG_CGROUPS
#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
-void sock_update_classid(struct sock *sk, struct task_struct *task)
+void sock_update_classid(struct sock *sk)
{
u32 classid;
- classid = task_cls_classid(task);
+ classid = task_cls_classid(current);
if (classid != sk->sk_classid)
sk->sk_classid = classid;
}
@@ -1312,16 +1319,15 @@ EXPORT_SYMBOL(sock_update_classid);
#endif
#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
-void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
+void sock_update_netprioidx(struct sock *sk)
{
if (in_interrupt())
return;
- sk->sk_cgrp_prioidx = task_netprioidx(task);
+ sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
-#endif
/**
* sk_alloc - All socket objects are allocated here
@@ -1347,8 +1353,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, get_net(net));
atomic_set(&sk->sk_wmem_alloc, 1);
- sock_update_classid(sk, current);
- sock_update_netprioidx(sk, current);
+ sock_update_classid(sk);
+ sock_update_netprioidx(sk);
}
return sk;
diff --git a/net/core/utils.c b/net/core/utils.c
index e3487e461939..3c7f5b51b979 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/mm.h>
#include <linux/net.h>
@@ -348,9 +349,7 @@ int mac_pton(const char *s, u8 *mac)
/* Don't dirty result unless string is valid MAC. */
for (i = 0; i < ETH_ALEN; i++) {
- if (!strchr("0123456789abcdefABCDEF", s[i * 3]))
- return 0;
- if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1]))
+ if (!isxdigit(s[i * 3]) || !isxdigit(s[i * 3 + 1]))
return 0;
if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':')
return 0;
diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c
index 1d9eb7c60a68..4f72fc40bf02 100644
--- a/net/dcb/dcbevent.c
+++ b/net/dcb/dcbevent.c
@@ -20,6 +20,7 @@
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
#include <linux/export.h>
+#include <net/dcbevent.h>
static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain);
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 1b588e23cf80..40d5829ed36a 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -284,6 +284,7 @@ static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh,
if (!netdev->dcbnl_ops->getpermhwaddr)
return -EOPNOTSUPP;
+ memset(perm_addr, 0, sizeof(perm_addr));
netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr);
return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr);
@@ -1042,6 +1043,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
if (ops->ieee_getets) {
struct ieee_ets ets;
+ memset(&ets, 0, sizeof(ets));
err = ops->ieee_getets(netdev, &ets);
if (!err &&
nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets))
@@ -1050,6 +1052,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
if (ops->ieee_getmaxrate) {
struct ieee_maxrate maxrate;
+ memset(&maxrate, 0, sizeof(maxrate));
err = ops->ieee_getmaxrate(netdev, &maxrate);
if (!err) {
err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE,
@@ -1061,6 +1064,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
if (ops->ieee_getpfc) {
struct ieee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
err = ops->ieee_getpfc(netdev, &pfc);
if (!err &&
nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc))
@@ -1094,6 +1098,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
/* get peer info if available */
if (ops->ieee_peer_getets) {
struct ieee_ets ets;
+ memset(&ets, 0, sizeof(ets));
err = ops->ieee_peer_getets(netdev, &ets);
if (!err &&
nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets))
@@ -1102,6 +1107,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
if (ops->ieee_peer_getpfc) {
struct ieee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
err = ops->ieee_peer_getpfc(netdev, &pfc);
if (!err &&
nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc))
@@ -1280,6 +1286,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
/* peer info if available */
if (ops->cee_peer_getpg) {
struct cee_pg pg;
+ memset(&pg, 0, sizeof(pg));
err = ops->cee_peer_getpg(netdev, &pg);
if (!err &&
nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg))
@@ -1288,6 +1295,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
if (ops->cee_peer_getpfc) {
struct cee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
err = ops->cee_peer_getpfc(netdev, &pfc);
if (!err &&
nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc))
@@ -1650,7 +1658,7 @@ static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = {
[DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get },
};
-static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct net_device *netdev;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 4f9f5eb478f1..ebc54fef85a5 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -500,8 +500,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
return &rt->dst;
}
-static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
- struct request_values *rv_unused)
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
{
int err = -1;
struct sk_buff *skb;
@@ -658,7 +657,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v4_send_response(sk, req, NULL))
+ if (dccp_v4_send_response(sk, req))
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6e05981f271e..9c61f9c02fdb 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -213,8 +213,7 @@ out:
}
-static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
- struct request_values *rv_unused)
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
{
struct inet6_request_sock *ireq6 = inet6_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -428,7 +427,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v6_send_response(sk, req, NULL))
+ if (dccp_v6_send_response(sk, req))
goto drop_and_free;
inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index c8da116d84a4..7d9197063ebb 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -563,7 +563,7 @@ static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = {
.len = IFNAMSIZ - 1 },
};
-static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
@@ -607,7 +607,7 @@ errout:
return err;
}
-static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index e36614eccc04..57dc159245ec 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -145,22 +145,10 @@ static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi
return NULL;
}
-__le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type)
+static int dn_fib_count_nhs(const struct nlattr *attr)
{
- while(RTA_OK(attr,attrlen)) {
- if (attr->rta_type == type)
- return *(__le16*)RTA_DATA(attr);
- attr = RTA_NEXT(attr, attrlen);
- }
-
- return 0;
-}
-
-static int dn_fib_count_nhs(struct rtattr *rta)
-{
- int nhs = 0;
- struct rtnexthop *nhp = RTA_DATA(rta);
- int nhlen = RTA_PAYLOAD(rta);
+ struct rtnexthop *nhp = nla_data(attr);
+ int nhs = 0, nhlen = nla_len(attr);
while(nhlen >= (int)sizeof(struct rtnexthop)) {
if ((nhlen -= nhp->rtnh_len) < 0)
@@ -172,10 +160,11 @@ static int dn_fib_count_nhs(struct rtattr *rta)
return nhs;
}
-static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
+ const struct rtmsg *r)
{
- struct rtnexthop *nhp = RTA_DATA(rta);
- int nhlen = RTA_PAYLOAD(rta);
+ struct rtnexthop *nhp = nla_data(attr);
+ int nhlen = nla_len(attr);
change_nexthops(fi) {
int attrlen = nhlen - sizeof(struct rtnexthop);
@@ -187,7 +176,10 @@ static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, cons
nh->nh_weight = nhp->rtnh_hops + 1;
if (attrlen) {
- nh->nh_gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+ struct nlattr *gw_attr;
+
+ gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
+ nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0;
}
nhp = RTNH_NEXT(nhp);
} endfor_nexthops(fi);
@@ -268,7 +260,8 @@ out:
}
-struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta *rta, const struct nlmsghdr *nlh, int *errp)
+struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *attrs[],
+ const struct nlmsghdr *nlh, int *errp)
{
int err;
struct dn_fib_info *fi = NULL;
@@ -281,11 +274,9 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta
if (dn_fib_props[r->rtm_type].scope > r->rtm_scope)
goto err_inval;
- if (rta->rta_mp) {
- nhs = dn_fib_count_nhs(rta->rta_mp);
- if (nhs == 0)
- goto err_inval;
- }
+ if (attrs[RTA_MULTIPATH] &&
+ (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0)
+ goto err_inval;
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL);
err = -ENOBUFS;
@@ -295,53 +286,65 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta
fi->fib_protocol = r->rtm_protocol;
fi->fib_nhs = nhs;
fi->fib_flags = r->rtm_flags;
- if (rta->rta_priority)
- fi->fib_priority = *rta->rta_priority;
- if (rta->rta_mx) {
- int attrlen = RTA_PAYLOAD(rta->rta_mx);
- struct rtattr *attr = RTA_DATA(rta->rta_mx);
- while(RTA_OK(attr, attrlen)) {
- unsigned int flavour = attr->rta_type;
+ if (attrs[RTA_PRIORITY])
+ fi->fib_priority = nla_get_u32(attrs[RTA_PRIORITY]);
+
+ if (attrs[RTA_METRICS]) {
+ struct nlattr *attr;
+ int rem;
- if (flavour) {
- if (flavour > RTAX_MAX)
+ nla_for_each_nested(attr, attrs[RTA_METRICS], rem) {
+ int type = nla_type(attr);
+
+ if (type) {
+ if (type > RTAX_MAX || nla_len(attr) < 4)
goto err_inval;
- fi->fib_metrics[flavour-1] = *(unsigned int *)RTA_DATA(attr);
+
+ fi->fib_metrics[type-1] = nla_get_u32(attr);
}
- attr = RTA_NEXT(attr, attrlen);
}
}
- if (rta->rta_prefsrc)
- memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 2);
- if (rta->rta_mp) {
- if ((err = dn_fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+ if (attrs[RTA_PREFSRC])
+ fi->fib_prefsrc = nla_get_le16(attrs[RTA_PREFSRC]);
+
+ if (attrs[RTA_MULTIPATH]) {
+ if ((err = dn_fib_get_nhs(fi, attrs[RTA_MULTIPATH], r)) != 0)
goto failure;
- if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+
+ if (attrs[RTA_OIF] &&
+ fi->fib_nh->nh_oif != nla_get_u32(attrs[RTA_OIF]))
goto err_inval;
- if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 2))
+
+ if (attrs[RTA_GATEWAY] &&
+ fi->fib_nh->nh_gw != nla_get_le16(attrs[RTA_GATEWAY]))
goto err_inval;
} else {
struct dn_fib_nh *nh = fi->fib_nh;
- if (rta->rta_oif)
- nh->nh_oif = *rta->rta_oif;
- if (rta->rta_gw)
- memcpy(&nh->nh_gw, rta->rta_gw, 2);
+
+ if (attrs[RTA_OIF])
+ nh->nh_oif = nla_get_u32(attrs[RTA_OIF]);
+
+ if (attrs[RTA_GATEWAY])
+ nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]);
+
nh->nh_flags = r->rtm_flags;
nh->nh_weight = 1;
}
if (r->rtm_type == RTN_NAT) {
- if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif)
+ if (!attrs[RTA_GATEWAY] || nhs != 1 || attrs[RTA_OIF])
goto err_inval;
- memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 2);
+
+ fi->fib_nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]);
goto link_it;
}
if (dn_fib_props[r->rtm_type].error) {
- if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+ if (attrs[RTA_GATEWAY] || attrs[RTA_OIF] || attrs[RTA_MULTIPATH])
goto err_inval;
+
goto link_it;
}
@@ -367,8 +370,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta
}
if (fi->fib_prefsrc) {
- if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
- memcmp(&fi->fib_prefsrc, rta->rta_dst, 2))
+ if (r->rtm_type != RTN_LOCAL || !attrs[RTA_DST] ||
+ fi->fib_prefsrc != nla_get_le16(attrs[RTA_DST]))
if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
goto err_inval;
}
@@ -486,39 +489,21 @@ void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res)
spin_unlock_bh(&dn_fib_multipath_lock);
}
-
-static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta)
-{
- int i;
-
- for(i = 1; i <= RTA_MAX; i++) {
- struct rtattr *attr = rta[i-1];
- if (attr) {
- if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2)
- return -EINVAL;
- if (i != RTA_MULTIPATH && i != RTA_METRICS &&
- i != RTA_TABLE)
- rta[i-1] = (struct rtattr *)RTA_DATA(attr);
- }
- }
-
- return 0;
-}
-
-static inline u32 rtm_get_table(struct rtattr **rta, u8 table)
+static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table)
{
- if (rta[RTA_TABLE - 1])
- table = nla_get_u32((struct nlattr *) rta[RTA_TABLE - 1]);
+ if (attrs[RTA_TABLE])
+ table = nla_get_u32(attrs[RTA_TABLE]);
return table;
}
-static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct dn_fib_table *tb;
- struct rtattr **rta = arg;
- struct rtmsg *r = NLMSG_DATA(nlh);
+ struct rtmsg *r = nlmsg_data(nlh);
+ struct nlattr *attrs[RTA_MAX+1];
+ int err;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -526,22 +511,24 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *
if (!net_eq(net, &init_net))
return -EINVAL;
- if (dn_fib_check_attr(r, rta))
- return -EINVAL;
+ err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy);
+ if (err < 0)
+ return err;
- tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 0);
- if (tb)
- return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
+ tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 0);
+ if (!tb)
+ return -ESRCH;
- return -ESRCH;
+ return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb));
}
-static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct dn_fib_table *tb;
- struct rtattr **rta = arg;
- struct rtmsg *r = NLMSG_DATA(nlh);
+ struct rtmsg *r = nlmsg_data(nlh);
+ struct nlattr *attrs[RTA_MAX+1];
+ int err;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -549,14 +536,15 @@ static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *
if (!net_eq(net, &init_net))
return -EINVAL;
- if (dn_fib_check_attr(r, rta))
- return -EINVAL;
+ err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy);
+ if (err < 0)
+ return err;
- tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 1);
- if (tb)
- return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
+ tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 1);
+ if (!tb)
+ return -ENOBUFS;
- return -ENOBUFS;
+ return tb->insert(tb, r, attrs, nlh, &NETLINK_CB(skb));
}
static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
@@ -566,10 +554,31 @@ static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifad
struct nlmsghdr nlh;
struct rtmsg rtm;
} req;
- struct dn_kern_rta rta;
+ struct {
+ struct nlattr hdr;
+ __le16 dst;
+ } dst_attr = {
+ .dst = dst,
+ };
+ struct {
+ struct nlattr hdr;
+ __le16 prefsrc;
+ } prefsrc_attr = {
+ .prefsrc = ifa->ifa_local,
+ };
+ struct {
+ struct nlattr hdr;
+ u32 oif;
+ } oif_attr = {
+ .oif = ifa->ifa_dev->dev->ifindex,
+ };
+ struct nlattr *attrs[RTA_MAX+1] = {
+ [RTA_DST] = (struct nlattr *) &dst_attr,
+ [RTA_PREFSRC] = (struct nlattr * ) &prefsrc_attr,
+ [RTA_OIF] = (struct nlattr *) &oif_attr,
+ };
memset(&req.rtm, 0, sizeof(req.rtm));
- memset(&rta, 0, sizeof(rta));
if (type == RTN_UNICAST)
tb = dn_fib_get_table(RT_MIN_TABLE, 1);
@@ -591,14 +600,10 @@ static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifad
req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
req.rtm.rtm_type = type;
- rta.rta_dst = &dst;
- rta.rta_prefsrc = &ifa->ifa_local;
- rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
-
if (cmd == RTM_NEWROUTE)
- tb->insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ tb->insert(tb, &req.rtm, attrs, &req.nlh, NULL);
else
- tb->delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+ tb->delete(tb, &req.rtm, attrs, &req.nlh, NULL);
}
static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5ac0e153ef83..fe32388ea24f 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1613,23 +1613,41 @@ errout:
return -EMSGSIZE;
}
+const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = {
+ [RTA_DST] = { .type = NLA_U16 },
+ [RTA_SRC] = { .type = NLA_U16 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_OIF] = { .type = NLA_U32 },
+ [RTA_GATEWAY] = { .type = NLA_U16 },
+ [RTA_PRIORITY] = { .type = NLA_U32 },
+ [RTA_PREFSRC] = { .type = NLA_U16 },
+ [RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .type = NLA_NESTED },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
+};
+
/*
* This is called by both endnodes and routers now.
*/
-static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
+static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
- struct rtattr **rta = arg;
struct rtmsg *rtm = nlmsg_data(nlh);
struct dn_route *rt = NULL;
struct dn_skb_cb *cb;
int err;
struct sk_buff *skb;
struct flowidn fld;
+ struct nlattr *tb[RTA_MAX+1];
if (!net_eq(net, &init_net))
return -EINVAL;
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy);
+ if (err < 0)
+ return err;
+
memset(&fld, 0, sizeof(fld));
fld.flowidn_proto = DNPROTO_NSP;
@@ -1639,12 +1657,14 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
skb_reset_mac_header(skb);
cb = DN_SKB_CB(skb);
- if (rta[RTA_SRC-1])
- memcpy(&fld.saddr, RTA_DATA(rta[RTA_SRC-1]), 2);
- if (rta[RTA_DST-1])
- memcpy(&fld.daddr, RTA_DATA(rta[RTA_DST-1]), 2);
- if (rta[RTA_IIF-1])
- memcpy(&fld.flowidn_iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
+ if (tb[RTA_SRC])
+ fld.saddr = nla_get_le16(tb[RTA_SRC]);
+
+ if (tb[RTA_DST])
+ fld.daddr = nla_get_le16(tb[RTA_DST]);
+
+ if (tb[RTA_IIF])
+ fld.flowidn_iif = nla_get_u32(tb[RTA_IIF]);
if (fld.flowidn_iif) {
struct net_device *dev;
@@ -1669,10 +1689,9 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
if (!err && -rt->dst.error)
err = rt->dst.error;
} else {
- int oif = 0;
- if (rta[RTA_OIF - 1])
- memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
- fld.flowidn_oif = oif;
+ if (tb[RTA_OIF])
+ fld.flowidn_oif = nla_get_u32(tb[RTA_OIF]);
+
err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0);
}
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 6c2445bcaba1..86e3807052e9 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -19,7 +19,6 @@
#include <linux/sockios.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/netdevice.h>
@@ -224,26 +223,27 @@ static struct dn_zone *dn_new_zone(struct dn_hash *table, int z)
}
-static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern_rta *rta, struct dn_fib_info *fi)
+static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct nlattr *attrs[], struct dn_fib_info *fi)
{
struct rtnexthop *nhp;
int nhlen;
- if (rta->rta_priority && *rta->rta_priority != fi->fib_priority)
+ if (attrs[RTA_PRIORITY] &&
+ nla_get_u32(attrs[RTA_PRIORITY]) != fi->fib_priority)
return 1;
- if (rta->rta_oif || rta->rta_gw) {
- if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
- (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 2) == 0))
+ if (attrs[RTA_OIF] || attrs[RTA_GATEWAY]) {
+ if ((!attrs[RTA_OIF] || nla_get_u32(attrs[RTA_OIF]) == fi->fib_nh->nh_oif) &&
+ (!attrs[RTA_GATEWAY] || nla_get_le16(attrs[RTA_GATEWAY]) != fi->fib_nh->nh_gw))
return 0;
return 1;
}
- if (rta->rta_mp == NULL)
+ if (!attrs[RTA_MULTIPATH])
return 0;
- nhp = RTA_DATA(rta->rta_mp);
- nhlen = RTA_PAYLOAD(rta->rta_mp);
+ nhp = nla_data(attrs[RTA_MULTIPATH]);
+ nhlen = nla_len(attrs[RTA_MULTIPATH]);
for_nexthops(fi) {
int attrlen = nhlen - sizeof(struct rtnexthop);
@@ -254,7 +254,10 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
return 1;
if (attrlen) {
- gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+ struct nlattr *gw_attr;
+
+ gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
+ gw = gw_attr ? nla_get_le16(gw_attr) : 0;
if (gw && gw != nh->nh_gw)
return 1;
@@ -488,7 +491,7 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!net_eq(net, &init_net))
return 0;
- if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+ if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED)
return dn_cache_dump(skb, cb);
@@ -517,7 +520,8 @@ out:
return skb->len;
}
-static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
+static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[],
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
{
struct dn_hash *table = (struct dn_hash *)tb->data;
struct dn_fib_node *new_f, *f, **fp, **del_fp;
@@ -536,15 +540,14 @@ static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct
return -ENOBUFS;
dz_key_0(key);
- if (rta->rta_dst) {
- __le16 dst;
- memcpy(&dst, rta->rta_dst, 2);
+ if (attrs[RTA_DST]) {
+ __le16 dst = nla_get_le16(attrs[RTA_DST]);
if (dst & ~DZ_MASK(dz))
return -EINVAL;
key = dz_key(dst, dz);
}
- if ((fi = dn_fib_create_info(r, rta, n, &err)) == NULL)
+ if ((fi = dn_fib_create_info(r, attrs, n, &err)) == NULL)
return err;
if (dz->dz_nent > (dz->dz_divisor << 2) &&
@@ -654,7 +657,8 @@ out:
}
-static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
+static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[],
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
{
struct dn_hash *table = (struct dn_hash*)tb->data;
struct dn_fib_node **fp, **del_fp, *f;
@@ -671,9 +675,8 @@ static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct
return -ESRCH;
dz_key_0(key);
- if (rta->rta_dst) {
- __le16 dst;
- memcpy(&dst, rta->rta_dst, 2);
+ if (attrs[RTA_DST]) {
+ __le16 dst = nla_get_le16(attrs[RTA_DST]);
if (dst & ~DZ_MASK(dz))
return -EINVAL;
key = dz_key(dst, dz);
@@ -703,7 +706,7 @@ static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct
(r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
(!r->rtm_protocol ||
fi->fib_protocol == r->rtm_protocol) &&
- dn_fib_nh_match(r, n, rta, fi) == 0)
+ dn_fib_nh_match(r, n, attrs, fi) == 0)
del_fp = fp;
}
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index dfe42012a044..2a7efe388344 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -19,7 +19,7 @@
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/spinlock.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter_decnet.h>
#include <net/sock.h>
@@ -39,21 +39,21 @@ static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp)
unsigned char *ptr;
struct nf_dn_rtmsg *rtm;
- size = NLMSG_SPACE(rt_skb->len);
- size += NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg));
- skb = alloc_skb(size, GFP_ATOMIC);
+ size = NLMSG_ALIGN(rt_skb->len) +
+ NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg));
+ skb = nlmsg_new(size, GFP_ATOMIC);
if (!skb) {
*errp = -ENOMEM;
return NULL;
}
old_tail = skb->tail;
- nlh = nlmsg_put(skb, 0, 0, 0, size - sizeof(*nlh), 0);
+ nlh = nlmsg_put(skb, 0, 0, 0, size, 0);
if (!nlh) {
kfree_skb(skb);
*errp = -ENOMEM;
return NULL;
}
- rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh);
+ rtm = (struct nf_dn_rtmsg *)nlmsg_data(nlh);
rtm->nfdn_ifindex = rt_skb->dev->ifindex;
ptr = NFDN_RTMSG(rtm);
skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len);
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 2bc62ea857c8..0eb5d5e76dfb 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1,6 +1,7 @@
/*
* net/dsa/dsa.c - Hardware switch handling
* Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -14,6 +15,9 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <net/dsa.h>
+#include <linux/of.h>
+#include <linux/of_mdio.h>
+#include <linux/of_platform.h>
#include "dsa_priv.h"
char dsa_driver_version[] = "0.1";
@@ -287,34 +291,239 @@ static struct net_device *dev_to_net_device(struct device *dev)
return NULL;
}
+#ifdef CONFIG_OF
+static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
+ struct dsa_chip_data *cd,
+ int chip_index,
+ struct device_node *link)
+{
+ int ret;
+ const __be32 *reg;
+ int link_port_addr;
+ int link_sw_addr;
+ struct device_node *parent_sw;
+ int len;
+
+ parent_sw = of_get_parent(link);
+ if (!parent_sw)
+ return -EINVAL;
+
+ reg = of_get_property(parent_sw, "reg", &len);
+ if (!reg || (len != sizeof(*reg) * 2))
+ return -EINVAL;
+
+ link_sw_addr = be32_to_cpup(reg + 1);
+
+ if (link_sw_addr >= pd->nr_chips)
+ return -EINVAL;
+
+ /* First time routing table allocation */
+ if (!cd->rtable) {
+ cd->rtable = kmalloc(pd->nr_chips * sizeof(s8), GFP_KERNEL);
+ if (!cd->rtable)
+ return -ENOMEM;
+
+ /* default to no valid uplink/downlink */
+ memset(cd->rtable, -1, pd->nr_chips * sizeof(s8));
+ }
+
+ reg = of_get_property(link, "reg", NULL);
+ if (!reg) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ link_port_addr = be32_to_cpup(reg);
+
+ cd->rtable[link_sw_addr] = link_port_addr;
+
+ return 0;
+out:
+ kfree(cd->rtable);
+ return ret;
+}
+
+static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
+{
+ int i;
+ int port_index;
+
+ for (i = 0; i < pd->nr_chips; i++) {
+ port_index = 0;
+ while (port_index < DSA_MAX_PORTS) {
+ if (pd->chip[i].port_names[port_index])
+ kfree(pd->chip[i].port_names[port_index]);
+ port_index++;
+ }
+ kfree(pd->chip[i].rtable);
+ }
+ kfree(pd->chip);
+}
+
+static int dsa_of_probe(struct platform_device *pdev)
+{
+ struct device_node *np = pdev->dev.of_node;
+ struct device_node *child, *mdio, *ethernet, *port, *link;
+ struct mii_bus *mdio_bus;
+ struct platform_device *ethernet_dev;
+ struct dsa_platform_data *pd;
+ struct dsa_chip_data *cd;
+ const char *port_name;
+ int chip_index, port_index;
+ const unsigned int *sw_addr, *port_reg;
+ int ret;
+
+ mdio = of_parse_phandle(np, "dsa,mii-bus", 0);
+ if (!mdio)
+ return -EINVAL;
+
+ mdio_bus = of_mdio_find_bus(mdio);
+ if (!mdio_bus)
+ return -EINVAL;
+
+ ethernet = of_parse_phandle(np, "dsa,ethernet", 0);
+ if (!ethernet)
+ return -EINVAL;
+
+ ethernet_dev = of_find_device_by_node(ethernet);
+ if (!ethernet_dev)
+ return -ENODEV;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+
+ pdev->dev.platform_data = pd;
+ pd->netdev = &ethernet_dev->dev;
+ pd->nr_chips = of_get_child_count(np);
+ if (pd->nr_chips > DSA_MAX_SWITCHES)
+ pd->nr_chips = DSA_MAX_SWITCHES;
+
+ pd->chip = kzalloc(pd->nr_chips * sizeof(struct dsa_chip_data),
+ GFP_KERNEL);
+ if (!pd->chip) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ chip_index = 0;
+ for_each_available_child_of_node(np, child) {
+ cd = &pd->chip[chip_index];
+
+ cd->mii_bus = &mdio_bus->dev;
+
+ sw_addr = of_get_property(child, "reg", NULL);
+ if (!sw_addr)
+ continue;
+
+ cd->sw_addr = be32_to_cpup(sw_addr);
+ if (cd->sw_addr > PHY_MAX_ADDR)
+ continue;
+
+ for_each_available_child_of_node(child, port) {
+ port_reg = of_get_property(port, "reg", NULL);
+ if (!port_reg)
+ continue;
+
+ port_index = be32_to_cpup(port_reg);
+
+ port_name = of_get_property(port, "label", NULL);
+ if (!port_name)
+ continue;
+
+ cd->port_names[port_index] = kstrdup(port_name,
+ GFP_KERNEL);
+ if (!cd->port_names[port_index]) {
+ ret = -ENOMEM;
+ goto out_free_chip;
+ }
+
+ link = of_parse_phandle(port, "link", 0);
+
+ if (!strcmp(port_name, "dsa") && link &&
+ pd->nr_chips > 1) {
+ ret = dsa_of_setup_routing_table(pd, cd,
+ chip_index, link);
+ if (ret)
+ goto out_free_chip;
+ }
+
+ if (port_index == DSA_MAX_PORTS)
+ break;
+ }
+ }
+
+ return 0;
+
+out_free_chip:
+ dsa_of_free_platform_data(pd);
+out_free:
+ kfree(pd);
+ pdev->dev.platform_data = NULL;
+ return ret;
+}
+
+static void dsa_of_remove(struct platform_device *pdev)
+{
+ struct dsa_platform_data *pd = pdev->dev.platform_data;
+
+ if (!pdev->dev.of_node)
+ return;
+
+ dsa_of_free_platform_data(pd);
+ kfree(pd);
+}
+#else
+static inline int dsa_of_probe(struct platform_device *pdev)
+{
+ return 0;
+}
+
+static inline void dsa_of_remove(struct platform_device *pdev)
+{
+}
+#endif
+
static int dsa_probe(struct platform_device *pdev)
{
static int dsa_version_printed;
struct dsa_platform_data *pd = pdev->dev.platform_data;
struct net_device *dev;
struct dsa_switch_tree *dst;
- int i;
+ int i, ret;
if (!dsa_version_printed++)
printk(KERN_NOTICE "Distributed Switch Architecture "
"driver version %s\n", dsa_driver_version);
+ if (pdev->dev.of_node) {
+ ret = dsa_of_probe(pdev);
+ if (ret)
+ return ret;
+
+ pd = pdev->dev.platform_data;
+ }
+
if (pd == NULL || pd->netdev == NULL)
return -EINVAL;
dev = dev_to_net_device(pd->netdev);
- if (dev == NULL)
- return -EINVAL;
+ if (dev == NULL) {
+ ret = -EINVAL;
+ goto out;
+ }
if (dev->dsa_ptr != NULL) {
dev_put(dev);
- return -EEXIST;
+ ret = -EEXIST;
+ goto out;
}
dst = kzalloc(sizeof(*dst), GFP_KERNEL);
if (dst == NULL) {
dev_put(dev);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
platform_set_drvdata(pdev, dst);
@@ -366,6 +575,11 @@ static int dsa_probe(struct platform_device *pdev)
}
return 0;
+
+out:
+ dsa_of_remove(pdev);
+
+ return ret;
}
static int dsa_remove(struct platform_device *pdev)
@@ -385,6 +599,8 @@ static int dsa_remove(struct platform_device *pdev)
dsa_switch_destroy(ds);
}
+ dsa_of_remove(pdev);
+
return 0;
}
@@ -392,6 +608,12 @@ static void dsa_shutdown(struct platform_device *pdev)
{
}
+static const struct of_device_id dsa_of_match_table[] = {
+ { .compatible = "marvell,dsa", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, dsa_of_match_table);
+
static struct platform_driver dsa_driver = {
.probe = dsa_probe,
.remove = dsa_remove,
@@ -399,6 +621,7 @@ static struct platform_driver dsa_driver = {
.driver = {
.name = "dsa",
.owner = THIS_MODULE,
+ .of_match_table = dsa_of_match_table,
},
};
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index a36c85eab5b4..5359560926bc 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -195,7 +195,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
if (netdev_uses_trailer_tags(dev))
return htons(ETH_P_TRAILER);
- if (ntohs(eth->h_proto) >= 1536)
+ if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
return eth->h_proto;
/*
diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 43b95ca61114..55e1fd5b3e56 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -104,6 +104,7 @@ static const u8 lowpan_llprefix[] = {0xfe, 0x80};
struct lowpan_dev_info {
struct net_device *real_dev; /* real WPAN device ptr */
struct mutex dev_list_mtx; /* mutex for list ops */
+ unsigned short fragment_tag;
};
struct lowpan_dev_record {
@@ -120,7 +121,6 @@ struct lowpan_fragment {
struct list_head list; /* fragments list */
};
-static unsigned short fragment_tag;
static LIST_HEAD(lowpan_fragments);
static DEFINE_SPINLOCK(flist_lock);
@@ -284,6 +284,9 @@ lowpan_compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb)
/* checksum is always inline */
memcpy(*hc06_ptr, &uh->check, 2);
*hc06_ptr += 2;
+
+ /* skip the UDP header */
+ skb_pull(skb, sizeof(struct udphdr));
}
static inline int lowpan_fetch_skb_u8(struct sk_buff *skb, u8 *val)
@@ -309,9 +312,8 @@ static inline int lowpan_fetch_skb_u16(struct sk_buff *skb, u16 *val)
}
static int
-lowpan_uncompress_udp_header(struct sk_buff *skb)
+lowpan_uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh)
{
- struct udphdr *uh = udp_hdr(skb);
u8 tmp;
if (!uh)
@@ -358,6 +360,14 @@ lowpan_uncompress_udp_header(struct sk_buff *skb)
/* copy checksum */
memcpy(&uh->check, &skb->data[0], 2);
skb_pull(skb, 2);
+
+ /*
+ * UDP lenght needs to be infered from the lower layers
+ * here, we obtain the hint from the remaining size of the
+ * frame
+ */
+ uh->len = htons(skb->len + sizeof(struct udphdr));
+ pr_debug("uncompressed UDP length: src = %d", uh->len);
} else {
pr_debug("ERROR: unsupported NH format\n");
goto err;
@@ -572,17 +582,31 @@ static int lowpan_header_create(struct sk_buff *skb,
* this isn't implemented in mainline yet, so currently we assign 0xff
*/
{
+ mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
+ mac_cb(skb)->seq = ieee802154_mlme_ops(dev)->get_dsn(dev);
+
/* prepare wpan address data */
sa.addr_type = IEEE802154_ADDR_LONG;
- sa.pan_id = 0xff;
-
- da.addr_type = IEEE802154_ADDR_LONG;
- da.pan_id = 0xff;
+ sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
- memcpy(&(da.hwaddr), daddr, 8);
memcpy(&(sa.hwaddr), saddr, 8);
+ /* intra-PAN communications */
+ da.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
- mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
+ /*
+ * if the destination address is the broadcast address, use the
+ * corresponding short address
+ */
+ if (lowpan_is_addr_broadcast(daddr)) {
+ da.addr_type = IEEE802154_ADDR_SHORT;
+ da.short_addr = IEEE802154_ADDR_BROADCAST;
+ } else {
+ da.addr_type = IEEE802154_ADDR_LONG;
+ memcpy(&(da.hwaddr), daddr, IEEE802154_ADDR_LEN);
+
+ /* request acknowledgment */
+ mac_cb(skb)->flags |= MAC_CB_FLAG_ACKREQ;
+ }
return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
type, (void *)&da, (void *)&sa, skb->len);
@@ -650,7 +674,7 @@ static void lowpan_fragment_timer_expired(unsigned long entry_addr)
}
static struct lowpan_fragment *
-lowpan_alloc_new_frame(struct sk_buff *skb, u8 len, u16 tag)
+lowpan_alloc_new_frame(struct sk_buff *skb, u16 len, u16 tag)
{
struct lowpan_fragment *frame;
@@ -720,7 +744,7 @@ lowpan_process_data(struct sk_buff *skb)
{
struct lowpan_fragment *frame;
/* slen stores the rightmost 8 bits of the 11 bits length */
- u8 slen, offset;
+ u8 slen, offset = 0;
u16 len, tag;
bool found = false;
@@ -731,6 +755,18 @@ lowpan_process_data(struct sk_buff *skb)
/* adds the 3 MSB to the 8 LSB to retrieve the 11 bits length */
len = ((iphc0 & 7) << 8) | slen;
+ if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1) {
+ pr_debug("%s received a FRAG1 packet (tag: %d, "
+ "size of the entire IP packet: %d)",
+ __func__, tag, len);
+ } else { /* FRAGN */
+ if (lowpan_fetch_skb_u8(skb, &offset))
+ goto unlock_and_drop;
+ pr_debug("%s received a FRAGN packet (tag: %d, "
+ "size of the entire IP packet: %d, "
+ "offset: %d)", __func__, tag, len, offset * 8);
+ }
+
/*
* check if frame assembling with the same tag is
* already in progress
@@ -745,17 +781,13 @@ lowpan_process_data(struct sk_buff *skb)
/* alloc new frame structure */
if (!found) {
+ pr_debug("%s first fragment received for tag %d, "
+ "begin packet reassembly", __func__, tag);
frame = lowpan_alloc_new_frame(skb, len, tag);
if (!frame)
goto unlock_and_drop;
}
- if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1)
- goto unlock_and_drop;
-
- if (lowpan_fetch_skb_u8(skb, &offset)) /* fetch offset */
- goto unlock_and_drop;
-
/* if payload fits buffer, copy it */
if (likely((offset * 8 + skb->len) <= frame->length))
skb_copy_to_linear_data_offset(frame->skb, offset * 8,
@@ -773,6 +805,9 @@ lowpan_process_data(struct sk_buff *skb)
list_del(&frame->list);
spin_unlock_bh(&flist_lock);
+ pr_debug("%s successfully reassembled fragment "
+ "(tag %d)", __func__, tag);
+
dev_kfree_skb(skb);
skb = frame->skb;
kfree(frame);
@@ -918,10 +953,35 @@ lowpan_process_data(struct sk_buff *skb)
}
/* UDP data uncompression */
- if (iphc0 & LOWPAN_IPHC_NH_C)
- if (lowpan_uncompress_udp_header(skb))
+ if (iphc0 & LOWPAN_IPHC_NH_C) {
+ struct udphdr uh;
+ struct sk_buff *new;
+ if (lowpan_uncompress_udp_header(skb, &uh))
goto drop;
+ /*
+ * replace the compressed UDP head by the uncompressed UDP
+ * header
+ */
+ new = skb_copy_expand(skb, sizeof(struct udphdr),
+ skb_tailroom(skb), GFP_ATOMIC);
+ kfree_skb(skb);
+
+ if (!new)
+ return -ENOMEM;
+
+ skb = new;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+ skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr));
+
+ lowpan_raw_dump_table(__func__, "raw UDP header dump",
+ (u8 *)&uh, sizeof(uh));
+
+ hdr.nexthdr = UIP_PROTO_UDP;
+ }
+
/* Not fragmented package */
hdr.payload_len = htons(skb->len);
@@ -969,13 +1029,13 @@ static int lowpan_get_mac_header_length(struct sk_buff *skb)
static int
lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
- int mlen, int plen, int offset)
+ int mlen, int plen, int offset, int type)
{
struct sk_buff *frag;
int hlen, ret;
- /* if payload length is zero, therefore it's a first fragment */
- hlen = (plen == 0 ? LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE);
+ hlen = (type == LOWPAN_DISPATCH_FRAG1) ?
+ LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE;
lowpan_raw_dump_inline(__func__, "6lowpan fragment header", head, hlen);
@@ -1003,14 +1063,14 @@ lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
}
static int
-lowpan_skb_fragmentation(struct sk_buff *skb)
+lowpan_skb_fragmentation(struct sk_buff *skb, struct net_device *dev)
{
int err, header_length, payload_length, tag, offset = 0;
u8 head[5];
header_length = lowpan_get_mac_header_length(skb);
payload_length = skb->len - header_length;
- tag = fragment_tag++;
+ tag = lowpan_dev_info(dev)->fragment_tag++;
/* first fragment header */
head[0] = LOWPAN_DISPATCH_FRAG1 | ((payload_length >> 8) & 0x7);
@@ -1018,7 +1078,16 @@ lowpan_skb_fragmentation(struct sk_buff *skb)
head[2] = tag >> 8;
head[3] = tag & 0xff;
- err = lowpan_fragment_xmit(skb, head, header_length, 0, 0);
+ err = lowpan_fragment_xmit(skb, head, header_length, LOWPAN_FRAG_SIZE,
+ 0, LOWPAN_DISPATCH_FRAG1);
+
+ if (err) {
+ pr_debug("%s unable to send FRAG1 packet (tag: %d)",
+ __func__, tag);
+ goto exit;
+ }
+
+ offset = LOWPAN_FRAG_SIZE;
/* next fragment header */
head[0] &= ~LOWPAN_DISPATCH_FRAG1;
@@ -1033,10 +1102,17 @@ lowpan_skb_fragmentation(struct sk_buff *skb)
len = payload_length - offset;
err = lowpan_fragment_xmit(skb, head, header_length,
- len, offset);
+ len, offset, LOWPAN_DISPATCH_FRAGN);
+ if (err) {
+ pr_debug("%s unable to send a subsequent FRAGN packet "
+ "(tag: %d, offset: %d", __func__, tag, offset);
+ goto exit;
+ }
+
offset += len;
}
+exit:
return err;
}
@@ -1059,14 +1135,14 @@ static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev)
}
pr_debug("frame is too big, fragmentation is needed\n");
- err = lowpan_skb_fragmentation(skb);
+ err = lowpan_skb_fragmentation(skb, dev);
error:
dev_kfree_skb(skb);
out:
- if (err < 0)
+ if (err)
pr_debug("ERROR: xmit failed\n");
- return (err < 0 ? NETDEV_TX_BUSY : NETDEV_TX_OK);
+ return (err < 0) ? NET_XMIT_DROP : err;
}
static struct wpan_phy *lowpan_get_phy(const struct net_device *dev)
@@ -1087,6 +1163,12 @@ static u16 lowpan_get_short_addr(const struct net_device *dev)
return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev);
}
+static u8 lowpan_get_dsn(const struct net_device *dev)
+{
+ struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
+ return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev);
+}
+
static struct header_ops lowpan_header_ops = {
.create = lowpan_header_create,
};
@@ -1100,6 +1182,7 @@ static struct ieee802154_mlme_ops lowpan_mlme = {
.get_pan_id = lowpan_get_pan_id,
.get_phy = lowpan_get_phy,
.get_short_addr = lowpan_get_short_addr,
+ .get_dsn = lowpan_get_dsn,
};
static void lowpan_setup(struct net_device *dev)
@@ -1203,6 +1286,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev,
return -ENODEV;
lowpan_dev_info(dev)->real_dev = real_dev;
+ lowpan_dev_info(dev)->fragment_tag = 0;
mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
index 8c2251fb0a3f..4b8f917658b5 100644
--- a/net/ieee802154/6lowpan.h
+++ b/net/ieee802154/6lowpan.h
@@ -84,7 +84,7 @@
(memcmp(addr1, addr2, length >> 3) == 0)
/* local link, i.e. FE80::/10 */
-#define is_addr_link_local(a) (((a)->s6_addr16[0]) == 0x80FE)
+#define is_addr_link_local(a) (((a)->s6_addr16[0]) == htons(0xFE80))
/*
* check whether we can compress the IID to 16 bits,
@@ -92,9 +92,10 @@
*/
#define lowpan_is_iid_16_bit_compressable(a) \
((((a)->s6_addr16[4]) == 0) && \
- (((a)->s6_addr16[5]) == 0) && \
- (((a)->s6_addr16[6]) == 0) && \
- ((((a)->s6_addr[14]) & 0x80) == 0))
+ (((a)->s6_addr[10]) == 0) && \
+ (((a)->s6_addr[11]) == 0xff) && \
+ (((a)->s6_addr[12]) == 0xfe) && \
+ (((a)->s6_addr[13]) == 0))
/* multicast address */
#define is_addr_mcast(a) (((a)->s6_addr[0]) == 0xFF)
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index e0da175f8e5b..581a59504bd5 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -291,6 +291,9 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
size_t copied = 0;
int err = -EOPNOTSUPP;
struct sk_buff *skb;
+ struct sockaddr_ieee802154 *saddr;
+
+ saddr = (struct sockaddr_ieee802154 *)msg->msg_name;
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
@@ -309,6 +312,13 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
sock_recv_ts_and_drops(msg, sk, skb);
+ if (saddr) {
+ saddr->family = AF_IEEE802154;
+ saddr->addr = mac_cb(skb)->sa;
+ }
+ if (addr_len)
+ *addr_len = sizeof(*saddr);
+
if (flags & MSG_TRUNC)
copied = skb->len;
done:
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 97351e1d07a4..7e49bbcc6967 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -64,8 +64,8 @@ struct sk_buff *ieee802154_nl_create(int flags, u8 req)
int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group)
{
- /* XXX: nlh is right at the start of msg */
- void *hdr = genlmsg_data(NLMSG_DATA(msg->data));
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ void *hdr = genlmsg_data(nlmsg_data(nlh));
if (genlmsg_end(msg, hdr) < 0)
goto out;
@@ -97,8 +97,8 @@ struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
{
- /* XXX: nlh is right at the start of msg */
- void *hdr = genlmsg_data(NLMSG_DATA(msg->data));
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ void *hdr = genlmsg_data(nlmsg_data(nlh));
if (genlmsg_end(msg, hdr) < 0)
goto out;
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 96bb08abece2..b0bdd8c51e9c 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -315,7 +315,7 @@ static int ieee802154_associate_req(struct sk_buff *skb,
struct net_device *dev;
struct ieee802154_addr addr;
u8 page;
- int ret = -EINVAL;
+ int ret = -EOPNOTSUPP;
if (!info->attrs[IEEE802154_ATTR_CHANNEL] ||
!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
@@ -327,6 +327,8 @@ static int ieee802154_associate_req(struct sk_buff *skb,
dev = ieee802154_nl_get_dev(info);
if (!dev)
return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->assoc_req)
+ goto out;
if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) {
addr.addr_type = IEEE802154_ADDR_LONG;
@@ -350,6 +352,7 @@ static int ieee802154_associate_req(struct sk_buff *skb,
page,
nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY]));
+out:
dev_put(dev);
return ret;
}
@@ -359,7 +362,7 @@ static int ieee802154_associate_resp(struct sk_buff *skb,
{
struct net_device *dev;
struct ieee802154_addr addr;
- int ret = -EINVAL;
+ int ret = -EOPNOTSUPP;
if (!info->attrs[IEEE802154_ATTR_STATUS] ||
!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] ||
@@ -369,6 +372,8 @@ static int ieee802154_associate_resp(struct sk_buff *skb,
dev = ieee802154_nl_get_dev(info);
if (!dev)
return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->assoc_resp)
+ goto out;
addr.addr_type = IEEE802154_ADDR_LONG;
nla_memcpy(addr.hwaddr, info->attrs[IEEE802154_ATTR_DEST_HW_ADDR],
@@ -380,6 +385,7 @@ static int ieee802154_associate_resp(struct sk_buff *skb,
nla_get_u16(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]),
nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS]));
+out:
dev_put(dev);
return ret;
}
@@ -389,7 +395,7 @@ static int ieee802154_disassociate_req(struct sk_buff *skb,
{
struct net_device *dev;
struct ieee802154_addr addr;
- int ret = -EINVAL;
+ int ret = -EOPNOTSUPP;
if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] &&
!info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) ||
@@ -399,6 +405,8 @@ static int ieee802154_disassociate_req(struct sk_buff *skb,
dev = ieee802154_nl_get_dev(info);
if (!dev)
return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->disassoc_req)
+ goto out;
if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) {
addr.addr_type = IEEE802154_ADDR_LONG;
@@ -415,6 +423,7 @@ static int ieee802154_disassociate_req(struct sk_buff *skb,
ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr,
nla_get_u8(info->attrs[IEEE802154_ATTR_REASON]));
+out:
dev_put(dev);
return ret;
}
@@ -432,7 +441,7 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
u8 channel, bcn_ord, sf_ord;
u8 page;
int pan_coord, blx, coord_realign;
- int ret;
+ int ret = -EOPNOTSUPP;
if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
!info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] ||
@@ -448,6 +457,8 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
dev = ieee802154_nl_get_dev(info);
if (!dev)
return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->start_req)
+ goto out;
addr.addr_type = IEEE802154_ADDR_SHORT;
addr.short_addr = nla_get_u16(
@@ -476,6 +487,7 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page,
bcn_ord, sf_ord, pan_coord, blx, coord_realign);
+out:
dev_put(dev);
return ret;
}
@@ -483,7 +495,7 @@ static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
{
struct net_device *dev;
- int ret;
+ int ret = -EOPNOTSUPP;
u8 type;
u32 channels;
u8 duration;
@@ -497,6 +509,8 @@ static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
dev = ieee802154_nl_get_dev(info);
if (!dev)
return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->scan_req)
+ goto out;
type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]);
channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]);
@@ -511,6 +525,7 @@ static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page,
duration);
+out:
dev_put(dev);
return ret;
}
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7944df768454..8603ca827104 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -166,6 +166,7 @@ config IP_PNP_RARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
+ select NET_IP_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -186,9 +187,14 @@ config NET_IPGRE_DEMUX
This is helper module to demultiplex GRE packets on GRE version field criteria.
Required by ip_gre and pptp modules.
+config NET_IP_TUNNEL
+ tristate
+ default n
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -313,6 +319,7 @@ config SYN_COOKIES
config NET_IPVTI
tristate "Virtual (secure) IP: tunneling"
select INET_TUNNEL
+ select NET_IP_TUNNEL
depends on INET_XFRM_MODE_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 15ca63ec604e..089cb9f36387 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,6 +13,7 @@ obj-y := route.o inetpeer.o protocol.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ping.o
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 68f6a94f7661..93824c57b108 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -111,7 +111,6 @@
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
-#include <net/ipip.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
@@ -1283,9 +1282,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
int ihl;
int id;
unsigned int offset = 0;
-
- if (!(features & NETIF_F_V4_CSUM))
- features &= ~NETIF_F_SG;
+ bool tunnel;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
@@ -1293,6 +1290,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
+ SKB_GSO_UDP_TUNNEL |
0)))
goto out;
@@ -1307,6 +1305,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
+ tunnel = !!skb->encapsulation;
+
__skb_pull(skb, ihl);
skb_reset_transport_header(skb);
iph = ip_hdr(skb);
@@ -1326,15 +1326,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
skb = segs;
do {
iph = ip_hdr(skb);
- if (proto == IPPROTO_UDP) {
+ if (!tunnel && proto == IPPROTO_UDP) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
offset += (skb->len - skb->mac_len - iph->ihl * 4);
} else {
- if (!(iph->frag_off & htons(IP_DF)))
- iph->id = htons(id++);
+ iph->id = htons(id++);
}
iph->tot_len = htons(skb->len - skb->mac_len);
iph->check = 0;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index fea4929f6200..247ec1951c35 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -654,11 +654,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp_ptr += dev->addr_len;
memcpy(arp_ptr, &src_ip, 4);
arp_ptr += 4;
- if (target_hw != NULL)
- memcpy(arp_ptr, target_hw, dev->addr_len);
- else
- memset(arp_ptr, 0, dev->addr_len);
- arp_ptr += dev->addr_len;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
memcpy(arp_ptr, &dest_ip, 4);
return skb;
@@ -781,7 +789,14 @@ static int arp_process(struct sk_buff *skb)
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
- arp_ptr += dev->addr_len;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ arp_ptr += dev->addr_len;
+ }
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index f678507bc829..2759dfd576ae 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -536,7 +536,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
return NULL;
}
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
@@ -775,7 +775,7 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
return NULL;
}
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
@@ -802,8 +802,10 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
-
- set_ifa_lifetime(ifa_existing, valid_lft, prefered_lft);
+ ifa = ifa_existing;
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
}
return 0;
}
@@ -1499,6 +1501,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
head = &net->dev_index_head[h];
rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
@@ -1519,6 +1523,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_unlock();
goto done;
}
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
cont:
idx++;
@@ -1730,8 +1735,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
- struct nlmsghdr *nlh,
- void *arg)
+ struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[NETCONFA_MAX+1];
@@ -1791,6 +1795,77 @@ errout:
return err;
}
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ if (inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
#ifdef CONFIG_SYSCTL
static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -2195,6 +2270,6 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
- NULL, NULL);
+ inet_netconf_dump_devconf, NULL);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eb4bb12b3eb4..c7629a209f9d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -604,7 +604,7 @@ errout:
return err;
}
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -626,7 +626,7 @@ errout:
return err;
}
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -957,8 +957,8 @@ static void nl_fib_input(struct sk_buff *skb)
net = sock_net(skb->sk);
nlh = nlmsg_hdr(skb);
- if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
- nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
+ if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
return;
skb = skb_clone(skb, GFP_KERNEL);
@@ -966,7 +966,7 @@ static void nl_fib_input(struct sk_buff *skb)
return;
nlh = nlmsg_hdr(skb);
- frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
+ frn = (struct fib_result_nl *) nlmsg_data(nlh);
tb = fib_get_table(net, frn->tb_id_in);
nl_fib_lookup(frn, tb);
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 7a4c710c4cdd..d2d5a99fba09 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -27,11 +27,6 @@
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
-struct gre_base_hdr {
- __be16 flags;
- __be16 protocol;
-};
-#define GRE_HEADER_SECTION 4
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7d1874be1df3..6acb541c9091 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -559,7 +559,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
{
- int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
+ int err = req->rsk_ops->rtx_syn_ack(parent, req);
if (!err)
req->num_retrans++;
@@ -735,6 +735,7 @@ EXPORT_SYMBOL(inet_csk_destroy_sock);
* tcp/dccp_create_openreq_child().
*/
void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
{
/* sk_clone_lock locked the socket and set refcnt to 2 */
bh_unlock_sock(sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3c788f..8620408af574 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 245ae078a07f..e97d66a1fdde 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -21,7 +21,30 @@
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <net/sock.h>
#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
static void inet_frag_secret_rebuild(unsigned long dummy)
{
@@ -29,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
unsigned long now = jiffies;
int i;
+ /* Per bucket lock NOT needed here, due to write lock protection */
write_lock(&f->lock);
+
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n;
- hlist_for_each_entry_safe(q, n, &f->hash[i], list) {
+ hb = &f->hash[i];
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = f->hashfn(q);
if (hval != i) {
+ struct inet_frag_bucket *hb_dest;
+
hlist_del(&q->list);
/* Relink to new hash chain. */
- hlist_add_head(&q->list, &f->hash[hval]);
+ hb_dest = &f->hash[hval];
+ hlist_add_head(&q->list, &hb_dest->chain);
}
}
}
@@ -55,9 +85,12 @@ void inet_frags_init(struct inet_frags *f)
{
int i;
- for (i = 0; i < INETFRAGS_HASHSZ; i++)
- INIT_HLIST_HEAD(&f->hash[i]);
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb = &f->hash[i];
+ spin_lock_init(&hb->chain_lock);
+ INIT_HLIST_HEAD(&hb->chain);
+ }
rwlock_init(&f->lock);
f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
@@ -99,10 +132,18 @@ EXPORT_SYMBOL(inet_frags_exit_net);
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
- write_lock(&f->lock);
+ struct inet_frag_bucket *hb;
+ unsigned int hash;
+
+ read_lock(&f->lock);
+ hash = f->hashfn(fq);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
hlist_del(&fq->list);
- fq->net->nqueues--;
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+
+ read_unlock(&f->lock);
inet_frag_lru_del(fq);
}
@@ -181,6 +222,9 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
+ /* Remove q from list to avoid several CPUs grabbing it */
+ list_del_init(&q->lru_list);
+
spin_unlock(&nf->lru_lock);
spin_lock(&q->lock);
@@ -201,27 +245,32 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
void *arg)
{
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
#endif
unsigned int hash;
- write_lock(&f->lock);
+ read_lock(&f->lock); /* Protects against hash rebuild */
/*
* While we stayed w/o the lock other CPU could update
* the rnd seed, so we need to re-calculate the hash
* chain. Fortunatelly the qp_in can be used to get one.
*/
hash = f->hashfn(qp_in);
+ hb = &f->hash[hash];
+ spin_lock(&hb->chain_lock);
+
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
- * promoted read lock to write lock.
+ * released the hash bucket lock.
*/
- hlist_for_each_entry(qp, &f->hash[hash], list) {
+ hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
qp_in->last_in |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
@@ -233,9 +282,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
- hlist_add_head(&qp->list, &f->hash[hash]);
- nf->nqueues++;
- write_unlock(&f->lock);
+ hlist_add_head(&qp->list, &hb->chain);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
inet_frag_lru_add(nf, qp);
return qp;
}
@@ -276,17 +325,40 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
__releases(&f->lock)
{
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
+ int depth = 0;
+
+ hb = &f->hash[hash];
- hlist_for_each_entry(q, &f->hash[hash], list) {
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
+ spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
return q;
}
+ depth++;
}
+ spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
- return inet_frag_create(nf, f, key);
+ if (depth <= INETFRAGS_MAXDEPTH)
+ return inet_frag_create(nf, f, key);
+ else
+ return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix)
+{
+ static const char msg[] = "inet_frag_find: Fragment hash bucket"
+ " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+ ". Dropping fragment.\n";
+
+ if (PTR_ERR(q) == -ENOBUFS)
+ LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cc280a3f4f96..1975f52933c5 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/if_vlan.h>
#include <linux/inet_lro.h>
+#include <net/checksum.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
@@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
*(p+2) = lro_desc->tcp_rcv_tsecr;
}
+ csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
iph->tot_len = htons(lro_desc->ip_tot_len);
- iph->check = 0;
- iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
-
tcph->check = 0;
tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b6d30acb600c..938520668b2f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -79,40 +79,11 @@ struct ipq {
struct inet_peer *peer;
};
-/* RFC 3168 support :
- * We want to check ECN values of all fragments, do detect invalid combinations.
- * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
- */
-#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */
-#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */
-#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */
-#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */
-
static inline u8 ip4_frag_ecn(u8 tos)
{
return 1 << (tos & INET_ECN_MASK);
}
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-static const u8 ip4_frag_ecn_table[16] = {
- /* at least one fragment had CE, and others ECT_0 or ECT_1 */
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
-
- /* invalid combinations : drop frame */
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
static struct inet_frags ip4_frags;
int ip_frag_nqueues(struct net *net)
@@ -292,14 +263,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
- if (q == NULL)
- goto out_nomem;
-
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
return container_of(q, struct ipq, q);
-
-out_nomem:
- LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n"));
- return NULL;
}
/* Is the fragment too far ahead to be part of ipq? */
@@ -554,7 +522,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
ipq_kill(qp);
- ecn = ip4_frag_ecn_table[qp->ecn];
+ ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
err = -EINVAL;
goto out_fail;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d0ef0e674ec5..987a4e5e07e2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -37,7 +37,7 @@
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -108,15 +108,6 @@
fatal route to network, even if it were you who configured
fatal static route: you are innocent. :-)
-
-
- 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
- practically identical code. It would be good to glue them
- together, but it is not very evident, how to make them modular.
- sit is integral part of IPv6, ipip and gre are naturally modular.
- We could extract common parts (hash table, ioctl etc)
- to a separate module (ip_tunnel.c).
-
Alexey Kuznetsov.
*/
@@ -126,400 +117,137 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
-static void ipgre_tunnel_setup(struct net_device *dev);
-static int ipgre_tunnel_bind_dev(struct net_device *dev);
-
-/* Fallback tunnel: no source, no destination, no key, no options */
-
-#define HASH_SIZE 16
static int ipgre_net_id __read_mostly;
-struct ipgre_net {
- struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
-
- struct net_device *fb_tunnel_dev;
-};
-
-/* Tunnel hash table */
-
-/*
- 4 hash tables:
-
- 3: (remote,local)
- 2: (remote,*)
- 1: (*,local)
- 0: (*,*)
+static int gre_tap_net_id __read_mostly;
- We require exact key match i.e. if a key is present in packet
- it will match only tunnel with the same key; if it is not present,
- it will match only keyless tunnel.
-
- All keysless packets, if not matched configured keyless tunnels
- will match fallback tunnel.
- */
+static __sum16 check_checksum(struct sk_buff *skb)
+{
+ __sum16 csum = 0;
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ csum = csum_fold(skb->csum);
-#define tunnels_r_l tunnels[3]
-#define tunnels_r tunnels[2]
-#define tunnels_l tunnels[1]
-#define tunnels_wc tunnels[0]
+ if (!csum)
+ break;
+ /* Fall through. */
-static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ csum = __skb_checksum_complete(skb);
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ break;
}
- tot->multicast = dev->stats.multicast;
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
-
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- return tot;
+ return csum;
}
-/* Does key in tunnel parameters match packet */
-static bool ipgre_key_match(const struct ip_tunnel_parm *p,
- __be16 flags, __be32 key)
+static int ip_gre_calc_hlen(__be16 o_flags)
{
- if (p->i_flags & GRE_KEY) {
- if (flags & GRE_KEY)
- return key == p->i_key;
- else
- return false; /* key expected, none present */
- } else
- return !(flags & GRE_KEY);
-}
+ int addend = 4;
-/* Given src, dst and key, find appropriate for input tunnel. */
+ if (o_flags&TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags&TUNNEL_KEY)
+ addend += 4;
+ if (o_flags&TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
-static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
- __be32 remote, __be32 local,
- __be16 flags, __be32 key,
- __be16 gre_proto)
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err, int *hdr_len)
{
- struct net *net = dev_net(dev);
- int link = dev->ifindex;
- unsigned int h0 = HASH(remote);
- unsigned int h1 = HASH(key);
- struct ip_tunnel *t, *cand = NULL;
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
- ARPHRD_ETHER : ARPHRD_IPGRE;
- int score, cand_score = 4;
-
- for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
- if (local != t->parms.iph.saddr ||
- remote != t->parms.iph.daddr ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (!ipgre_key_match(&t->parms, flags, key))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ unsigned int ip_hlen = ip_hdrlen(skb);
+ const struct gre_base_hdr *greh;
+ __be32 *options;
- for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
- if (remote != t->parms.iph.daddr ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (!ipgre_key_match(&t->parms, flags, key))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
- for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
- if ((local != t->parms.iph.saddr &&
- (local != t->parms.iph.daddr ||
- !ipv4_is_multicast(local))) ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (!ipgre_key_match(&t->parms, flags, key))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
- for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
- if (t->parms.i_key != key ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ *hdr_len = ip_gre_calc_hlen(tpi->flags);
- if (cand != NULL)
- return cand;
+ if (!pskb_may_pull(skb, *hdr_len))
+ return -EINVAL;
- dev = ign->fb_tunnel_dev;
- if (dev->flags & IFF_UP)
- return netdev_priv(dev);
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
- return NULL;
-}
+ tpi->proto = greh->protocol;
-static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- unsigned int h = HASH(key);
- int prio = 0;
-
- if (local)
- prio |= 1;
- if (remote && !ipv4_is_multicast(remote)) {
- prio |= 2;
- h ^= HASH(remote);
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (check_checksum(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+ options++;
}
- return &ign->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
- struct ip_tunnel *t)
-{
- return __ipgre_bucket(ign, &t->parms);
-}
-
-static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
-static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = ipgre_bucket(ign, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ *hdr_len += 4;
+ if (!pskb_may_pull(skb, *hdr_len))
+ return -EINVAL;
}
}
-}
-
-static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
- struct ip_tunnel_parm *parms,
- int type)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- int link = parms->link;
- struct ip_tunnel *t;
- struct ip_tunnel __rcu **tp;
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- for (tp = __ipgre_bucket(ign, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next)
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr &&
- key == t->parms.i_key &&
- link == t->parms.link &&
- type == t->dev->type)
- break;
-
- return t;
-}
-
-static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
-{
- struct ip_tunnel *t, *nt;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
- if (t || !create)
- return t;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "gre%d");
-
- dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
- if (!dev)
- return NULL;
-
- dev_net_set(dev, net);
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
- dev->rtnl_link_ops = &ipgre_link_ops;
-
- dev->mtu = ipgre_tunnel_bind_dev(dev);
- if (register_netdevice(dev) < 0)
- goto failed_free;
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- dev_hold(dev);
- ipgre_tunnel_link(ign, nt);
- return nt;
-
-failed_free:
- free_netdev(dev);
- return NULL;
-}
-
-static void ipgre_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- ipgre_tunnel_unlink(ign, netdev_priv(dev));
- dev_put(dev);
+ return 0;
}
-
static void ipgre_err(struct sk_buff *skb, u32 info)
{
-/* All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
+ /* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
- Moreover, Cisco "wise men" put GRE key to the third word
- in GRE header. It makes impossible maintaining even soft state for keyed
- GRE tunnels with enabled checksum. Tell them "thank you".
-
- Well, I wonder, rfc1812 was written by Cisco employee,
- what the hell these idiots break standards established
- by themselves???
- */
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft
+ state for keyed GRE tunnels with enabled checksum. Tell
+ them "thank you".
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standards established
+ by themselves???
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
const struct iphdr *iph = (const struct iphdr *)skb->data;
- __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
- int grehlen = (iph->ihl<<2) + 4;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
- __be16 flags;
- __be32 key = 0;
+ struct tnl_ptk_info tpi;
+ int hdr_len;
+ bool csum_err = false;
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
+ if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
+ if (!csum_err) /* ignore csum errors. */
return;
- if (flags&GRE_KEY) {
- grehlen += 4;
- if (flags&GRE_CSUM)
- grehlen += 4;
- }
}
- /* If only 8 bytes returned, keyed message will be dropped here */
- if (skb_headlen(skb) < grehlen)
- return;
-
- if (flags & GRE_KEY)
- key = *(((__be32 *)p) + (grehlen / 4) - 1);
-
switch (type) {
default:
case ICMP_PARAMETERPROB:
@@ -548,8 +276,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
break;
}
- t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
- flags, key, p[1]);
+ if (tpi.proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+ iph->daddr, iph->saddr, tpi.key);
if (t == NULL)
return;
@@ -578,158 +311,33 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
t->err_time = jiffies;
}
-static inline u8
-ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
-{
- u8 inner = 0;
- if (skb->protocol == htons(ETH_P_IP))
- inner = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
- return INET_ECN_encapsulate(tos, inner);
-}
-
static int ipgre_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
const struct iphdr *iph;
- u8 *h;
- __be16 flags;
- __sum16 csum = 0;
- __be32 key = 0;
- u32 seqno = 0;
struct ip_tunnel *tunnel;
- int offset = 4;
- __be16 gre_proto;
- int err;
+ struct tnl_ptk_info tpi;
+ int hdr_len;
+ bool csum_err = false;
- if (!pskb_may_pull(skb, 16))
+ if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
goto drop;
- iph = ip_hdr(skb);
- h = skb->data;
- flags = *(__be16 *)h;
-
- if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
- /* - Version must be 0.
- - We do not support routing headers.
- */
- if (flags&(GRE_VERSION|GRE_ROUTING))
- goto drop;
-
- if (flags&GRE_CSUM) {
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
- if (!csum)
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- }
- offset += 4;
- }
- if (flags&GRE_KEY) {
- key = *(__be32 *)(h + offset);
- offset += 4;
- }
- if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32 *)(h + offset));
- offset += 4;
- }
- }
+ if (tpi.proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
- gre_proto = *(__be16 *)(h + 2);
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
+ iph->saddr, iph->daddr, tpi.key);
- tunnel = ipgre_tunnel_lookup(skb->dev,
- iph->saddr, iph->daddr, flags, key,
- gre_proto);
if (tunnel) {
- struct pcpu_tstats *tstats;
-
- secpath_reset(skb);
-
- skb->protocol = gre_proto;
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
- skb->protocol = htons(ETH_P_IP);
- if ((*(h + offset) & 0xF0) != 0x40)
- offset += 4;
- }
-
- skb->mac_header = skb->network_header;
- __pskb_pull(skb, offset);
- skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
- skb->pkt_type = PACKET_HOST;
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (ipv4_is_multicast(iph->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
- tunnel->dev->stats.multicast++;
- skb->pkt_type = PACKET_BROADCAST;
- }
-#endif
-
- if (((flags&GRE_CSUM) && csum) ||
- (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- if (tunnel->parms.i_flags&GRE_SEQ) {
- if (!(flags&GRE_SEQ) ||
- (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- tunnel->i_seqno = seqno + 1;
- }
-
- /* Warning: All skb pointers will be invalidated! */
- if (tunnel->dev->type == ARPHRD_ETHER) {
- if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
-
- iph = ip_hdr(skb);
- skb->protocol = eth_type_trans(skb, tunnel->dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- }
-
- __skb_tunnel_rx(skb, tunnel->dev);
-
- skb_reset_network_header(skb);
- err = IP_ECN_decapsulate(iph, skb);
- if (unlikely(err)) {
- if (log_ecn_error)
- net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
- &iph->saddr, iph->tos);
- if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
- goto drop;
- }
- }
-
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
-
- gro_cells_receive(&tunnel->gro_cells, skb);
+ ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
drop:
kfree_skb(skb);
return 0;
@@ -746,7 +354,7 @@ static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff
skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
return skb;
} else if (skb->ip_summed == CHECKSUM_PARTIAL &&
- tunnel->parms.o_flags&GRE_CSUM) {
+ tunnel->parms.o_flags&TUNNEL_CSUM) {
err = skb_checksum_help(skb);
if (unlikely(err))
goto error;
@@ -760,497 +368,157 @@ error:
return ERR_PTR(err);
}
-static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static struct sk_buff *gre_build_header(struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi,
+ int hdr_len)
{
- struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);
- struct ip_tunnel *tunnel = netdev_priv(dev);
- const struct iphdr *old_iph;
- const struct iphdr *tiph;
- struct flowi4 fl4;
- u8 tos;
- __be16 df;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *iph; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
- int gre_hlen;
- __be32 dst;
- int mtu;
- u8 ttl;
- int err;
- int pkt_len;
-
- skb = handle_offloads(tunnel, skb);
- if (IS_ERR(skb)) {
- dev->stats.tx_dropped++;
- return NETDEV_TX_OK;
- }
+ struct gre_base_hdr *greh;
- if (!skb->encapsulation) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
+ skb_push(skb, hdr_len);
- old_iph = ip_hdr(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
- if (dev->type == ARPHRD_ETHER)
- IPCB(skb)->flags = 0;
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
- if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
- gre_hlen = 0;
- if (skb->protocol == htons(ETH_P_IP))
- tiph = (const struct iphdr *)skb->data;
- else
- tiph = &tunnel->parms.iph;
- } else {
- gre_hlen = tunnel->hlen;
- tiph = &tunnel->parms.iph;
- }
-
- if ((dst = tiph->daddr) == 0) {
- /* NBMA tunnel */
-
- if (skb_dst(skb) == NULL) {
- dev->stats.tx_fifo_errors++;
- goto tx_error;
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
}
-
- if (skb->protocol == htons(ETH_P_IP)) {
- rt = skb_rtable(skb);
- dst = rt_nexthop(rt, old_iph->daddr);
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
}
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- const struct in6_addr *addr6;
- struct neighbour *neigh;
- bool do_tx_error_icmp;
- int addr_type;
-
- neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
- if (neigh == NULL)
- goto tx_error;
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &ipv6_hdr(skb)->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
-
- if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
- do_tx_error_icmp = true;
- else {
- do_tx_error_icmp = false;
- dst = addr6->s6_addr32[3];
- }
- neigh_release(neigh);
- if (do_tx_error_icmp)
- goto tx_error_icmp;
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
+ *(__sum16 *)ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
}
-#endif
- else
- goto tx_error;
}
- ttl = tiph->ttl;
- tos = tiph->tos;
- if (tos & 0x1) {
- tos &= ~0x1;
- if (skb->protocol == htons(ETH_P_IP))
- tos = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
- }
+ return skb;
+}
- rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
- tunnel->parms.o_key, RT_TOS(tos),
- tunnel->parms.link);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error;
- }
- tdev = rt->dst.dev;
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params,
+ __be16 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct tnl_ptk_info tpi;
- if (tdev == dev) {
- ip_rt_put(rt);
- dev->stats.collisions++;
- goto tx_error;
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
}
- df = tiph->frag_off;
- if (df)
- mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
- else
- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
-
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
- if (skb->protocol == htons(ETH_P_IP)) {
- df |= (old_iph->frag_off&htons(IP_DF));
+ tpi.flags = tunnel->parms.o_flags;
+ tpi.proto = proto;
+ tpi.key = tunnel->parms.o_key;
+ if (tunnel->parms.o_flags & TUNNEL_SEQ)
+ tunnel->o_seqno++;
+ tpi.seq = htonl(tunnel->o_seqno);
- if (!skb_is_gso(skb) &&
- (old_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
+ /* Push GRE header. */
+ skb = gre_build_header(skb, &tpi, tunnel->hlen);
+ if (unlikely(!skb)) {
+ dev->stats.tx_dropped++;
+ return;
}
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
-
- if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr &&
- !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
- rt6->rt6i_dst.plen == 128) {
- rt6->rt6i_flags |= RTF_MODIFIED;
- dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
- }
- }
- if (!skb_is_gso(skb) &&
- mtu >= IPV6_MIN_MTU &&
- mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-#endif
+ ip_tunnel_xmit(skb, dev, tnl_params);
+}
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tnl_params;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
+ skb = handle_offloads(tunnel, skb);
+ if (IS_ERR(skb))
+ goto out;
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
- if (!new_skb) {
- ip_rt_put(rt);
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- /* Warning : tiph value might point to freed memory */
- }
+ if (dev->header_ops) {
+ /* Need space for new headers */
+ if (skb_cow_head(skb, dev->needed_headroom -
+ (tunnel->hlen + sizeof(struct iphdr))));
+ goto free_skb;
- skb_push(skb, gre_hlen);
- skb_reset_network_header(skb);
- skb_set_transport_header(skb, sizeof(*iph));
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /*
- * Push down and install the IPIP header.
- */
+ tnl_params = (const struct iphdr *)skb->data;
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr) >> 2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_GRE;
- iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
- iph->daddr = fl4.daddr;
- iph->saddr = fl4.saddr;
- iph->ttl = ttl;
-
- tunnel_ip_select_ident(skb, old_iph, &rt->dst);
-
- if (ttl == 0) {
- if (skb->protocol == htons(ETH_P_IP))
- iph->ttl = old_iph->ttl;
-#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6))
- iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
-#endif
- else
- iph->ttl = ip4_dst_hoplimit(&rt->dst);
- }
-
- ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
- ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
- htons(ETH_P_TEB) : skb->protocol;
-
- if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
+ /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
+ * to gre header.
+ */
+ skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- if (tunnel->parms.o_flags&GRE_SEQ) {
- ++tunnel->o_seqno;
- *ptr = htonl(tunnel->o_seqno);
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_KEY) {
- *ptr = tunnel->parms.o_key;
- ptr--;
- }
- /* Skip GRE checksum if skb is getting offloaded. */
- if (!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE) &&
- (tunnel->parms.o_flags&GRE_CSUM)) {
- int offset = skb_transport_offset(skb);
-
- if (skb_has_shared_frag(skb)) {
- err = __skb_linearize(skb);
- if (err)
- goto tx_error;
- }
-
- *ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset,
- skb->len - offset,
- 0));
- }
+ tnl_params = &tunnel->parms.iph;
}
- nf_reset(skb);
+ __gre_xmit(skb, dev, tnl_params, skb->protocol);
- pkt_len = skb->len - skb_transport_offset(skb);
- err = ip_local_out(skb);
- if (likely(net_xmit_eval(err) == 0)) {
- u64_stats_update_begin(&tstats->syncp);
- tstats->tx_bytes += pkt_len;
- tstats->tx_packets++;
- u64_stats_update_end(&tstats->syncp);
- } else {
- dev->stats.tx_errors++;
- dev->stats.tx_aborted_errors++;
- }
return NETDEV_TX_OK;
-#if IS_ENABLED(CONFIG_IPV6)
-tx_error_icmp:
- dst_link_failure(skb);
-#endif
-tx_error:
- dev->stats.tx_errors++;
+free_skb:
dev_kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
-static int ipgre_tunnel_bind_dev(struct net_device *dev)
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- const struct iphdr *iph;
- int hlen = LL_MAX_HEADER;
- int mtu = ETH_DATA_LEN;
- int addend = sizeof(struct iphdr) + 4;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- /* Guess output device to choose reasonable mtu and needed_headroom */
-
- if (iph->daddr) {
- struct flowi4 fl4;
- struct rtable *rt;
-
- rt = ip_route_output_gre(dev_net(dev), &fl4,
- iph->daddr, iph->saddr,
- tunnel->parms.o_key,
- RT_TOS(iph->tos),
- tunnel->parms.link);
- if (!IS_ERR(rt)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
-
- if (dev->type != ARPHRD_ETHER)
- dev->flags |= IFF_POINTOPOINT;
- }
+ struct ip_tunnel *tunnel = netdev_priv(dev);
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+ skb = handle_offloads(tunnel, skb);
+ if (IS_ERR(skb))
+ goto out;
- if (tdev) {
- hlen = tdev->hard_header_len + tdev->needed_headroom;
- mtu = tdev->mtu;
- }
- dev->iflink = tunnel->parms.link;
-
- /* Precalculate GRE options length */
- if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
- if (tunnel->parms.o_flags&GRE_CSUM)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_KEY)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_SEQ)
- addend += 4;
- }
- dev->needed_headroom = addend + hlen;
- mtu -= dev->hard_header_len + addend;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- if (mtu < 68)
- mtu = 68;
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
- tunnel->hlen = addend;
- /* TCP offload with GRE SEQ is not supported. */
- if (!(tunnel->parms.o_flags & GRE_SEQ)) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- }
+ return NETDEV_TX_OK;
- return mtu;
+free_skb:
+ dev_kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
}
-static int
-ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+static int ipgre_tunnel_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ign->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipgre_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
- ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- if (!(p.i_flags&GRE_KEY))
- p.i_key = 0;
- if (!(p.o_flags&GRE_KEY))
- p.o_key = 0;
-
- t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- unsigned int nflags = 0;
-
- t = netdev_priv(dev);
-
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
- err = -EINVAL;
- break;
- }
- ipgre_tunnel_unlink(ign, t);
- synchronize_net();
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipgre_tunnel_link(ign, t);
- netdev_state_change(dev);
- }
- }
-
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- dev->mtu = ipgre_tunnel_bind_dev(dev);
- netdev_state_change(dev);
- }
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
-
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == ign->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(ign->fb_tunnel_dev))
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
- default:
- err = -EINVAL;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) {
+ return -EINVAL;
}
+ p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
+ p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
-done:
- return err;
-}
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
-static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- if (new_mtu < 68 ||
- new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
- return -EINVAL;
- dev->mtu = new_mtu;
+ p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
return 0;
}
@@ -1280,25 +548,23 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
...
ftp fec0:6666:6666::193.233.7.65
...
-
*/
-
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
- __be16 *p = (__be16 *)(iph+1);
+ struct iphdr *iph;
+ struct gre_base_hdr *greh;
- memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
- p[0] = t->parms.o_flags;
- p[1] = htons(type);
+ iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
+ greh = (struct gre_base_hdr *)(iph+1);
+ greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->protocol = htons(type);
- /*
- * Set the source hardware address.
- */
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+ /* Set the source hardware address. */
if (saddr)
memcpy(&iph->saddr, saddr, 4);
if (daddr)
@@ -1306,7 +572,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
if (iph->daddr)
return t->hlen;
- return -t->hlen;
+ return -(t->hlen + sizeof(*iph));
}
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
@@ -1360,31 +626,21 @@ static int ipgre_close(struct net_device *dev)
}
return 0;
}
-
#endif
static const struct net_device_ops ipgre_netdev_ops = {
.ndo_init = ipgre_tunnel_init,
- .ndo_uninit = ipgre_tunnel_uninit,
+ .ndo_uninit = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
.ndo_open = ipgre_open,
.ndo_stop = ipgre_close,
#endif
- .ndo_start_xmit = ipgre_tunnel_xmit,
+ .ndo_start_xmit = ipgre_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
- .ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats64 = ipgre_get_stats64,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void ipgre_dev_free(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
-
- gro_cells_destroy(&tunnel->gro_cells);
- free_percpu(dev->tstats);
- free_netdev(dev);
-}
-
#define GRE_FEATURES (NETIF_F_SG | \
NETIF_F_FRAGLIST | \
NETIF_F_HIGHDMA | \
@@ -1393,35 +649,48 @@ static void ipgre_dev_free(struct net_device *dev)
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
- dev->destructor = ipgre_dev_free;
+ ip_tunnel_setup(dev, ipgre_net_id);
+}
- dev->type = ARPHRD_IPGRE;
- dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+static void __gre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel;
+
+ tunnel = netdev_priv(dev);
+ tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+
+ dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
- dev->flags = IFF_NOARP;
- dev->iflink = 0;
- dev->addr_len = 4;
- dev->features |= NETIF_F_NETNS_LOCAL;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
- dev->features |= GRE_FEATURES;
+ dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported. */
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
}
static int ipgre_tunnel_init(struct net_device *dev)
{
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
- int err;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
+ __gre_tunnel_init(dev);
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+ dev->type = ARPHRD_IPGRE;
+ dev->flags = IFF_NOARP;
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ dev->addr_len = 4;
if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -1435,106 +704,30 @@ static int ipgre_tunnel_init(struct net_device *dev)
} else
dev->header_ops = &ipgre_header_ops;
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- err = gro_cells_init(&tunnel->gro_cells, dev);
- if (err) {
- free_percpu(dev->tstats);
- return err;
- }
-
- return 0;
-}
-
-static void ipgre_fb_tunnel_init(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- iph->version = 4;
- iph->protocol = IPPROTO_GRE;
- iph->ihl = 5;
- tunnel->hlen = sizeof(struct iphdr) + 4;
-
- dev_hold(dev);
+ return ip_tunnel_init(dev);
}
-
static const struct gre_protocol ipgre_protocol = {
.handler = ipgre_rcv,
.err_handler = ipgre_err,
};
-static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
-{
- int prio;
-
- for (prio = 0; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ign->tunnels[prio][h]);
-
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
-}
-
static int __net_init ipgre_init_net(struct net *net)
{
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int err;
-
- ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
- ipgre_tunnel_setup);
- if (!ign->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ign->fb_tunnel_dev, net);
-
- ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
- ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
-
- if ((err = register_netdev(ign->fb_tunnel_dev)))
- goto err_reg_dev;
-
- rcu_assign_pointer(ign->tunnels_wc[0],
- netdev_priv(ign->fb_tunnel_dev));
- return 0;
-
-err_reg_dev:
- ipgre_dev_free(ign->fb_tunnel_dev);
-err_alloc_dev:
- return err;
+ return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
static void __net_exit ipgre_exit_net(struct net *net)
{
- struct ipgre_net *ign;
- LIST_HEAD(list);
-
- ign = net_generic(net, ipgre_net_id);
- rtnl_lock();
- ipgre_destroy_tunnels(ign, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
+ ip_tunnel_delete_net(itn);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
.exit = ipgre_exit_net,
.id = &ipgre_net_id,
- .size = sizeof(struct ipgre_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1579,8 +772,8 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
{
memset(parms, 0, sizeof(*parms));
@@ -1593,10 +786,10 @@ static void ipgre_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1620,148 +813,46 @@ static void ipgre_netlink_parms(struct nlattr *data[],
parms->iph.frag_off = htons(IP_DF);
}
-static int ipgre_tap_init(struct net_device *dev)
+static int gre_tap_init(struct net_device *dev)
{
- struct ip_tunnel *tunnel;
-
- tunnel = netdev_priv(dev);
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ __gre_tunnel_init(dev);
- ipgre_tunnel_bind_dev(dev);
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
+ return ip_tunnel_init(dev);
}
-static const struct net_device_ops ipgre_tap_netdev_ops = {
- .ndo_init = ipgre_tap_init,
- .ndo_uninit = ipgre_tunnel_uninit,
- .ndo_start_xmit = ipgre_tunnel_xmit,
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_tap_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
- .ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats64 = ipgre_get_stats64,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void ipgre_tap_setup(struct net_device *dev)
{
-
ether_setup(dev);
-
- dev->netdev_ops = &ipgre_tap_netdev_ops;
- dev->destructor = ipgre_dev_free;
-
- dev->iflink = 0;
- dev->features |= NETIF_F_NETNS_LOCAL;
-
- dev->features |= GRE_FEATURES;
- dev->hw_features |= GRE_FEATURES;
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ ip_tunnel_setup(dev, gre_tap_net_id);
}
-static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
{
- struct ip_tunnel *nt;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int mtu;
- int err;
-
- nt = netdev_priv(dev);
- ipgre_netlink_parms(data, &nt->parms);
-
- if (ipgre_tunnel_find(net, &nt->parms, dev->type))
- return -EEXIST;
-
- if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
- eth_hw_addr_random(dev);
-
- mtu = ipgre_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- err = register_netdevice(dev);
- if (err)
- goto out;
-
- dev_hold(dev);
- ipgre_tunnel_link(ign, nt);
+ struct ip_tunnel_parm p;
-out:
- return err;
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
- struct ip_tunnel *t, *nt;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
struct ip_tunnel_parm p;
- int mtu;
-
- if (dev == ign->fb_tunnel_dev)
- return -EINVAL;
-
- nt = netdev_priv(dev);
- ipgre_netlink_parms(data, &p);
-
- t = ipgre_tunnel_locate(net, &p, 0);
-
- if (t) {
- if (t->dev != dev)
- return -EEXIST;
- } else {
- t = nt;
-
- if (dev->type != ARPHRD_ETHER) {
- unsigned int nflags = 0;
-
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags ^ nflags) &
- (IFF_POINTOPOINT | IFF_BROADCAST))
- return -EINVAL;
- }
- ipgre_tunnel_unlink(ign, t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- }
- ipgre_tunnel_link(ign, t);
- netdev_state_change(dev);
- }
-
- t->parms.o_key = p.o_key;
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
-
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- mtu = ipgre_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
- netdev_state_change(dev);
- }
-
- return 0;
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
}
static size_t ipgre_get_size(const struct net_device *dev)
@@ -1796,8 +887,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel_parm *p = &t->parms;
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
- nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
- nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1835,6 +926,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
.validate = ipgre_tunnel_validate,
.newlink = ipgre_newlink,
.changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
};
@@ -1848,13 +940,28 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.validate = ipgre_tap_validate,
.newlink = ipgre_newlink,
.changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
};
-/*
- * And now the modules code and kernel interface.
- */
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+}
+
+static void __net_exit ipgre_tap_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
+ ip_tunnel_delete_net(itn);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit = ipgre_tap_exit_net,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
static int __init ipgre_init(void)
{
@@ -1866,6 +973,10 @@ static int __init ipgre_init(void)
if (err < 0)
return err;
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+ goto pnet_tap_faied;
+
err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
@@ -1880,16 +991,17 @@ static int __init ipgre_init(void)
if (err < 0)
goto tap_ops_failed;
-out:
- return err;
+ return 0;
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_faied:
unregister_pernet_device(&ipgre_net_ops);
- goto out;
+ return err;
}
static void __exit ipgre_fini(void)
@@ -1898,6 +1010,7 @@ static void __exit ipgre_fini(void)
rtnl_link_unregister(&ipgre_link_ops);
if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
pr_info("%s: can't remove protocol\n", __func__);
+ unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
@@ -1907,3 +1020,4 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 310a3647c83d..ec7264514a82 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -370,7 +370,6 @@ int ip_options_compile(struct net *net,
}
switch (optptr[3]&0xF) {
case IPOPT_TS_TSONLY:
- opt->ts = optptr - iph;
if (skb)
timeptr = &optptr[optptr[2]-1];
opt->ts_needtime = 1;
@@ -381,7 +380,6 @@ int ip_options_compile(struct net *net,
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
if (rt) {
spec_dst_fill(&spec_dst, skb);
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
@@ -396,7 +394,6 @@ int ip_options_compile(struct net *net,
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
{
__be32 addr;
memcpy(&addr, &optptr[optptr[2]-1], 4);
@@ -429,12 +426,12 @@ int ip_options_compile(struct net *net,
pp_ptr = optptr + 3;
goto error;
}
- opt->ts = optptr - iph;
if (skb) {
optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
opt->is_changed = 1;
}
}
+ opt->ts = optptr - iph;
break;
case IPOPT_RA:
if (optlen < 4) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5e12dca7b3dd..147abf5275aa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -430,8 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 000000000000..e4147ec1665a
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1035 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
+ __be32 key, __be32 remote)
+{
+ return hash_32((__force u32)key ^ (__force u32)remote,
+ IP_TNL_HASH_BITS);
+}
+
+/* Often modified stats are per cpu, other are shared (netdev->stats) */
+struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ tot->multicast = dev->stats.multicast;
+
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_frame_errors = dev->stats.rx_frame_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+ __be16 flags, __be32 key)
+{
+ if (p->i_flags & TUNNEL_KEY) {
+ if (flags & TUNNEL_KEY)
+ return key == p->i_key;
+ else
+ /* key expected, none present */
+ return false;
+ } else
+ return !(flags & TUNNEL_KEY);
+}
+
+/* Fallback tunnel: no source, no destination, no key, no options
+
+ Tunnel hash table:
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ Given src, dst and key, find appropriate for input tunnel.
+*/
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+ int link, __be16 flags,
+ __be32 remote, __be32 local,
+ __be32 key)
+{
+ unsigned int hash;
+ struct ip_tunnel *t, *cand = NULL;
+ struct hlist_head *head;
+
+ hash = ip_tunnel_hash(itn, key, remote);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ hash = ip_tunnel_hash(itn, key, 0);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((local != t->parms.iph.saddr &&
+ (local != t->parms.iph.daddr ||
+ !ipv4_is_multicast(local))) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ if (flags & TUNNEL_NO_KEY)
+ goto skip_key_lookup;
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (t->parms.i_key != key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+skip_key_lookup:
+ if (cand)
+ return cand;
+
+ if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
+ return netdev_priv(itn->fb_tunnel_dev);
+
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ unsigned int h;
+ __be32 remote;
+
+ if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
+ remote = parms->iph.daddr;
+ else
+ remote = 0;
+
+ h = ip_tunnel_hash(itn, parms->i_key, remote);
+ return &itn->tunnels[h];
+}
+
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ struct hlist_head *head = ip_bucket(itn, &t->parms);
+
+ hlist_add_head_rcu(&t->hash_node, head);
+}
+
+static void ip_tunnel_del(struct ip_tunnel *t)
+{
+ hlist_del_init_rcu(&t->hash_node);
+}
+
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms,
+ int type)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ __be32 key = parms->i_key;
+ int link = parms->link;
+ struct ip_tunnel *t = NULL;
+ struct hlist_head *head = ip_bucket(itn, parms);
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ key == t->parms.i_key &&
+ link == t->parms.link &&
+ type == t->dev->type)
+ break;
+ }
+ return t;
+}
+
+static struct net_device *__ip_tunnel_create(struct net *net,
+ const struct rtnl_link_ops *ops,
+ struct ip_tunnel_parm *parms)
+{
+ int err;
+ struct ip_tunnel *tunnel;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
+ err = -E2BIG;
+ goto failed;
+ }
+ strlcpy(name, ops->kind, IFNAMSIZ);
+ strncat(name, "%d", 2);
+ }
+
+ ASSERT_RTNL();
+ dev = alloc_netdev(ops->priv_size, name, ops->setup);
+ if (!dev) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ dev_net_set(dev, net);
+
+ dev->rtnl_link_ops = ops;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms = *parms;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto failed_free;
+
+ return dev;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static inline struct rtable *ip_route_output_tunnel(struct net *net,
+ struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
+{
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_oif = oif;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->flowi4_tos = tos;
+ fl4->flowi4_proto = proto;
+ fl4->fl4_gre_key = key;
+ return ip_route_output_key(net, fl4);
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+ tunnel->parms.iph.protocol,
+ iph->daddr, iph->saddr,
+ tunnel->parms.o_key,
+ RT_TOS(iph->tos),
+ tunnel->parms.link);
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= (dev->hard_header_len + t_hlen);
+
+ if (mtu < 68)
+ mtu = 68;
+
+ return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+ struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ struct ip_tunnel *nt, *fbt;
+ struct net_device *dev;
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ fbt = netdev_priv(itn->fb_tunnel_dev);
+ dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+ if (IS_ERR(dev))
+ return NULL;
+
+ dev->mtu = ip_tunnel_bind_dev(dev);
+
+ nt = netdev_priv(dev);
+ ip_tunnel_add(itn, nt);
+ return nt;
+}
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi, bool log_ecn_error)
+{
+ struct pcpu_tstats *tstats;
+ const struct iphdr *iph = ip_hdr(skb);
+ int err;
+
+ secpath_reset(skb);
+
+ skb->protocol = tpi->proto;
+
+ skb->mac_header = skb->network_header;
+ __pskb_pull(skb, tunnel->hlen);
+ skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ tunnel->dev->stats.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
+ ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ if (tunnel->parms.i_flags&TUNNEL_SEQ) {
+ if (!(tpi->flags&TUNNEL_SEQ) ||
+ (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
+
+ /* Warning: All skb pointers will be invalidated! */
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ if (!pskb_may_pull(skb, ETH_HLEN)) {
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ iph = ip_hdr(skb);
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ }
+
+ skb->pkt_type = PACKET_HOST;
+ __skb_tunnel_rx(skb, tunnel->dev);
+
+ skb_reset_network_header(skb);
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *inner_iph;
+ struct iphdr *iph;
+ struct flowi4 fl4;
+ u8 tos, ttl;
+ __be16 df;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ unsigned int max_headroom; /* The extra header space needed */
+ __be32 dst;
+ int mtu;
+
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+
+ dst = tnl_params->daddr;
+ if (dst == 0) {
+ /* NBMA tunnel */
+
+ if (skb_dst(skb) == NULL) {
+ dev->stats.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = skb_rtable(skb);
+ dst = rt_nexthop(rt, inner_iph->daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
+ int addr_type;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (neigh == NULL)
+ goto tx_error;
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
+ goto tx_error_icmp;
+ }
+#endif
+ else
+ goto tx_error;
+ }
+
+ tos = tnl_params->tos;
+ if (tos & 0x1) {
+ tos &= ~0x1;
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+
+ rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+ tunnel->parms.iph.protocol,
+ dst, tnl_params->saddr,
+ tunnel->parms.o_key,
+ RT_TOS(tos),
+ tunnel->parms.link);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ tdev = rt->dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ df = tnl_params->frag_off;
+
+ if (df)
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+ - sizeof(struct iphdr);
+ else
+ mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ df |= (inner_iph->frag_off&htons(IP_DF));
+
+ if (!skb_is_gso(skb) &&
+ (inner_iph->frag_off&htons(IP_DF)) &&
+ mtu < ntohs(inner_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < skb->len) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#endif
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ ttl = tnl_params->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
+ + rt->dst.header_len;
+ if (max_headroom > dev->needed_headroom) {
+ dev->needed_headroom = max_headroom;
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return;
+ }
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = tnl_params->protocol;
+ iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ iph->daddr = fl4.daddr;
+ iph->saddr = fl4.saddr;
+ iph->ttl = ttl;
+ tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
+
+ iptunnel_xmit(skb, dev);
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+ dst_link_failure(skb);
+#endif
+tx_error:
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+static void ip_tunnel_update(struct ip_tunnel_net *itn,
+ struct ip_tunnel *t,
+ struct net_device *dev,
+ struct ip_tunnel_parm *p,
+ bool set_mtu)
+{
+ ip_tunnel_del(t);
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(dev->broadcast, &p->iph.daddr, 4);
+ }
+ ip_tunnel_add(itn, t);
+
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+
+ if (t->parms.link != p->link) {
+ int mtu;
+
+ t->parms.link = p->link;
+ mtu = ip_tunnel_bind_dev(dev);
+ if (set_mtu)
+ dev->mtu = mtu;
+ }
+ netdev_state_change(dev);
+}
+
+int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel *t;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == itn->fb_tunnel_dev)
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (t == NULL)
+ t = netdev_priv(dev);
+ memcpy(p, &t->parms, sizeof(*p));
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ if (!(p->i_flags&TUNNEL_KEY))
+ p->i_key = 0;
+ if (!(p->o_flags&TUNNEL_KEY))
+ p->o_key = 0;
+
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+
+ if (!t && (cmd == SIOCADDTUNNEL))
+ t = ip_tunnel_create(net, itn, p);
+
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+
+ t = netdev_priv(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ ip_tunnel_update(itn, t, dev, p, true);
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == itn->fb_tunnel_dev) {
+ err = -ENOENT;
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (t == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(itn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ gro_cells_destroy(&tunnel->gro_cells);
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (itn->fb_tunnel_dev != dev) {
+ ip_tunnel_del(netdev_priv(dev));
+ unregister_netdevice_queue(dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+int __net_init ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+ struct ip_tunnel_parm parms;
+
+ itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
+ if (!itn->tunnels)
+ return -ENOMEM;
+
+ if (!ops) {
+ itn->fb_tunnel_dev = NULL;
+ return 0;
+ }
+ memset(&parms, 0, sizeof(parms));
+ if (devname)
+ strlcpy(parms.name, devname, IFNAMSIZ);
+
+ rtnl_lock();
+ itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ rtnl_unlock();
+ if (IS_ERR(itn->fb_tunnel_dev)) {
+ kfree(itn->tunnels);
+ return PTR_ERR(itn->fb_tunnel_dev);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+{
+ int h;
+
+ for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ struct hlist_node *n;
+ struct hlist_head *thead = &itn->tunnels[h];
+
+ hlist_for_each_entry_safe(t, n, thead, hash_node)
+ unregister_netdevice_queue(t->dev, head);
+ }
+ if (itn->fb_tunnel_dev)
+ unregister_netdevice_queue(itn->fb_tunnel_dev, head);
+}
+
+void __net_exit ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip_tunnel_destroy(itn, &list);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+ kfree(itn->tunnels);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+
+int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+
+ nt->parms = *p;
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ ip_tunnel_add(itn, nt);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t, *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (dev == itn->fb_tunnel_dev)
+ return -EINVAL;
+
+ nt = netdev_priv(dev);
+
+ t = ip_tunnel_find(itn, p, dev->type);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = nt;
+
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
+ }
+
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+int ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ int err;
+
+ dev->destructor = ip_tunnel_dev_free;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ err = gro_cells_init(&tunnel->gro_cells, dev);
+ if (err) {
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ /* fb_tunnel_dev will be unregisted in net-exit call. */
+ if (itn->fb_tunnel_dev != dev)
+ ip_tunnel_del(netdev_priv(dev));
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do least required initialization, rest of init is done in tunnel_init call */
+void ip_tunnel_setup(struct net_device *dev, int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index c3a4233c0ac2..9d2bdb2c1d3f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -38,7 +38,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
@@ -82,44 +82,6 @@ static int vti_tunnel_bind_dev(struct net_device *dev);
} while (0)
-static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-
- tot->multicast = dev->stats.multicast;
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_errors = dev->stats.rx_errors;
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- return tot;
-}
-
static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
@@ -597,7 +559,7 @@ static const struct net_device_ops vti_netdev_ops = {
.ndo_start_xmit = vti_tunnel_xmit,
.ndo_do_ioctl = vti_tunnel_ioctl,
.ndo_change_mtu = vti_tunnel_change_mtu,
- .ndo_get_stats64 = vti_get_stats64,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void vti_dev_free(struct net_device *dev)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 98cbc6877019..efa1138fa523 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -206,7 +206,7 @@ static int __init ic_open_devs(void)
struct ic_device *d, **last;
struct net_device *dev;
unsigned short oflags;
- unsigned long start;
+ unsigned long start, next_msg;
last = &ic_first_dev;
rtnl_lock();
@@ -263,12 +263,23 @@ static int __init ic_open_devs(void)
/* wait for a carrier on at least one device */
start = jiffies;
+ next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
+ int wait, elapsed;
+
for_each_netdev(&init_net, dev)
if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
goto have_carrier;
msleep(1);
+
+ if time_before(jiffies, next_msg)
+ continue;
+
+ elapsed = jiffies_to_msecs(jiffies - start);
+ wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ pr_info("Waiting up to %d more seconds for network.\n", wait);
+ next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
}
have_carrier:
rtnl_unlock();
@@ -1522,7 +1533,8 @@ static int __init ip_auto_config(void)
}
for (i++; i < CONF_NAMESERVERS_MAX; i++)
if (ic_nameservers[i] != NONE)
- pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
+ pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
+ pr_cont("\n");
#endif /* !SILENT */
return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 8f024d41eefa..77bfcce64fe5 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -111,227 +111,21 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#define HASH_SIZE 16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static int ipip_net_id __read_mostly;
-struct ipip_net {
- struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_wc[1];
- struct ip_tunnel __rcu **tunnels[4];
-
- struct net_device *fb_tunnel_dev;
-};
static int ipip_tunnel_init(struct net_device *dev);
-static void ipip_tunnel_setup(struct net_device *dev);
-static void ipip_dev_free(struct net_device *dev);
static struct rtnl_link_ops ipip_link_ops __read_mostly;
-static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
- tot->collisions = dev->stats.collisions;
-
- return tot;
-}
-
-static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
- __be32 remote, __be32 local)
-{
- unsigned int h0 = HASH(remote);
- unsigned int h1 = HASH(local);
- struct ip_tunnel *t;
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
- if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
- if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
- return t;
-
- t = rcu_dereference(ipn->tunnels_wc[0]);
- if (t && (t->dev->flags&IFF_UP))
- return t;
- return NULL;
-}
-
-static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- unsigned int h = 0;
- int prio = 0;
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
- struct ip_tunnel *t)
-{
- return __ipip_bucket(ipn, &t->parms);
-}
-
-static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = ipip_bucket(ipn, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
- }
- }
-}
-
-static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
-
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
-
-static int ipip_tunnel_create(struct net_device *dev)
-{
- struct ip_tunnel *t = netdev_priv(dev);
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- int err;
-
- err = ipip_tunnel_init(dev);
- if (err < 0)
- goto out;
-
- err = register_netdevice(dev);
- if (err < 0)
- goto out;
-
- strcpy(t->parms.name, dev->name);
- dev->rtnl_link_ops = &ipip_link_ops;
-
- dev_hold(dev);
- ipip_tunnel_link(ipn, t);
- return 0;
-
-out:
- return err;
-}
-
-static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- struct ip_tunnel *t, *nt;
- struct ip_tunnel __rcu **tp;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- for (tp = __ipip_bucket(ipn, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
- return t;
- }
- if (!create)
- return NULL;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "tunl%d");
-
- dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
- if (dev == NULL)
- return NULL;
-
- dev_net_set(dev, net);
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
-
- if (ipip_tunnel_create(dev) < 0)
- goto failed_free;
-
- return nt;
-
-failed_free:
- ipip_dev_free(dev);
- return NULL;
-}
-
-/* called with RTNL */
-static void ipip_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- if (dev == ipn->fb_tunnel_dev)
- RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
- else
- ipip_tunnel_unlink(ipn, netdev_priv(dev));
- dev_put(dev);
-}
-
static int ipip_err(struct sk_buff *skb, u32 info)
{
@@ -339,41 +133,17 @@ static int ipip_err(struct sk_buff *skb, u32 info)
8 bytes of packet payload. It means, that precise relaying of
ICMP in the real Internet is absolutely infeasible.
*/
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
const struct iphdr *iph = (const struct iphdr *)skb->data;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
int err;
-
- switch (type) {
- default:
- case ICMP_PARAMETERPROB:
- return 0;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return 0;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe they are just ether pollution. --ANK
- */
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return 0;
- break;
- case ICMP_REDIRECT:
- break;
- }
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
err = -ENOENT;
- t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
if (t == NULL)
goto out;
@@ -403,53 +173,29 @@ static int ipip_err(struct sk_buff *skb, u32 info)
else
t->err_count = 1;
t->err_time = jiffies;
-out:
+out:
return err;
}
+static const struct tnl_ptk_info tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
+
static int ipip_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
- int err;
-
- tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
- if (tunnel != NULL) {
- struct pcpu_tstats *tstats;
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
-
- secpath_reset(skb);
-
- skb->mac_header = skb->network_header;
- skb_reset_network_header(skb);
- skb->protocol = htons(ETH_P_IP);
- skb->pkt_type = PACKET_HOST;
-
- __skb_tunnel_rx(skb, tunnel->dev);
-
- err = IP_ECN_decapsulate(iph, skb);
- if (unlikely(err)) {
- if (log_ecn_error)
- net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
- &iph->saddr, iph->tos);
- if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
- goto drop;
- }
- }
-
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
-
- netif_rx(skb);
- return 0;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
}
return -1;
@@ -463,329 +209,64 @@ drop:
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
*/
-
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
- u8 tos = tunnel->parms.iph.tos;
- __be16 df = tiph->frag_off;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- const struct iphdr *old_iph;
- struct iphdr *iph; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
- __be32 dst = tiph->daddr;
- struct flowi4 fl4;
- int mtu;
-
- if (skb->protocol != htons(ETH_P_IP))
- goto tx_error;
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- old_iph = ip_hdr(skb);
-
- if (tos & 1)
- tos = old_iph->tos;
-
- if (!dst) {
- /* NBMA tunnel */
- if ((rt = skb_rtable(skb)) == NULL) {
- dev->stats.tx_fifo_errors++;
- goto tx_error;
- }
- dst = rt_nexthop(rt, old_iph->daddr);
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
}
- rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
- dst, tiph->saddr,
- 0, 0,
- IPPROTO_IPIP, RT_TOS(tos),
- tunnel->parms.link);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
- }
- tdev = rt->dst.dev;
-
- if (tdev == dev) {
- ip_rt_put(rt);
- dev->stats.collisions++;
- goto tx_error;
- }
-
- df |= old_iph->frag_off & htons(IP_DF);
-
- if (df) {
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
-
- if (mtu < 68) {
- dev->stats.collisions++;
- ip_rt_put(rt);
- goto tx_error;
- }
-
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
- if ((old_iph->frag_off & htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
-
- /*
- * Okay, now see if we can stuff it in the buffer as-is.
- */
- max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- }
-
- skb->transport_header = skb->network_header;
- skb_push(skb, sizeof(struct iphdr));
- skb_reset_network_header(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /*
- * Push down and install the IPIP header.
- */
-
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_IPIP;
- iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
- iph->daddr = fl4.daddr;
- iph->saddr = fl4.saddr;
-
- if ((iph->ttl = tiph->ttl) == 0)
- iph->ttl = old_iph->ttl;
-
- iptunnel_xmit(skb, dev);
+ ip_tunnel_xmit(skb, dev, tiph);
return NETDEV_TX_OK;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
-static void ipip_tunnel_bind_dev(struct net_device *dev)
-{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- const struct iphdr *iph;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- if (iph->daddr) {
- struct rtable *rt;
- struct flowi4 fl4;
-
- rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
- iph->daddr, iph->saddr,
- 0, 0,
- IPPROTO_IPIP,
- RT_TOS(iph->tos),
- tunnel->parms.link);
- if (!IS_ERR(rt)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - sizeof(struct iphdr);
- }
- dev->iflink = tunnel->parms.link;
-}
-
-static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
-{
- struct net *net = dev_net(t->dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- ipip_tunnel_unlink(ipn, t);
- synchronize_net();
- t->parms.iph.saddr = p->iph.saddr;
- t->parms.iph.daddr = p->iph.daddr;
- memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
- memcpy(t->dev->broadcast, &p->iph.daddr, 4);
- ipip_tunnel_link(ipn, t);
- t->parms.iph.ttl = p->iph.ttl;
- t->parms.iph.tos = p->iph.tos;
- t->parms.iph.frag_off = p->iph.frag_off;
- if (t->parms.link != p->link) {
- t->parms.link = p->link;
- ipip_tunnel_bind_dev(t->dev);
- }
- netdev_state_change(t->dev);
-}
-
static int
-ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipn->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- }
-
- ipip_tunnel_update(t, &p);
- }
-
- if (t) {
- err = 0;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
-
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipn->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t->dev == ipn->fb_tunnel_dev)
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
- default:
- err = -EINVAL;
- }
-
-done:
- return err;
-}
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
-static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ return -EINVAL;
+ if (p.i_key || p.o_key || p.i_flags || p.o_flags)
return -EINVAL;
- dev->mtu = new_mtu;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+
return 0;
}
static const struct net_device_ops ipip_netdev_ops = {
- .ndo_uninit = ipip_tunnel_uninit,
+ .ndo_init = ipip_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
- .ndo_change_mtu = ipip_tunnel_change_mtu,
- .ndo_get_stats64 = ipip_get_stats64,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void ipip_dev_free(struct net_device *dev)
-{
- free_percpu(dev->tstats);
- free_netdev(dev);
-}
-
#define IPIP_FEATURES (NETIF_F_SG | \
NETIF_F_FRAGLIST | \
NETIF_F_HIGHDMA | \
@@ -794,11 +275,8 @@ static void ipip_dev_free(struct net_device *dev)
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
- dev->destructor = ipip_dev_free;
dev->type = ARPHRD_TUNNEL;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
dev->flags = IFF_NOARP;
dev->iflink = 0;
dev->addr_len = 4;
@@ -808,46 +286,19 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->features |= IPIP_FEATURES;
dev->hw_features |= IPIP_FEATURES;
+ ip_tunnel_setup(dev, ipip_net_id);
}
static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- tunnel->dev = dev;
-
memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
- ipip_tunnel_bind_dev(dev);
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
-}
-
-static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
- struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- iph->version = 4;
- iph->protocol = IPPROTO_IPIP;
- iph->ihl = 5;
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- dev_hold(dev);
- rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
- return 0;
+ tunnel->hlen = 0;
+ tunnel->parms.iph.protocol = IPPROTO_IPIP;
+ return ip_tunnel_init(dev);
}
static void ipip_netlink_parms(struct nlattr *data[],
@@ -887,28 +338,16 @@ static void ipip_netlink_parms(struct nlattr *data[],
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
- struct net *net = dev_net(dev);
- struct ip_tunnel *nt;
-
- nt = netdev_priv(dev);
- ipip_netlink_parms(data, &nt->parms);
-
- if (ipip_tunnel_locate(net, &nt->parms, 0))
- return -EEXIST;
+ struct ip_tunnel_parm p;
- return ipip_tunnel_create(dev);
+ ipip_netlink_parms(data, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
}
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
- struct ip_tunnel *t;
struct ip_tunnel_parm p;
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- if (dev == ipn->fb_tunnel_dev)
- return -EINVAL;
ipip_netlink_parms(data, &p);
@@ -916,16 +355,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
return -EINVAL;
- t = ipip_tunnel_locate(net, &p, 0);
-
- if (t) {
- if (t->dev != dev)
- return -EEXIST;
- } else
- t = netdev_priv(dev);
-
- ipip_tunnel_update(t, &p);
- return 0;
+ return ip_tunnel_changelink(dev, tb, &p);
}
static size_t ipip_get_size(const struct net_device *dev)
@@ -982,6 +412,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
.setup = ipip_tunnel_setup,
.newlink = ipip_newlink,
.changelink = ipip_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipip_get_size,
.fill_info = ipip_fill_info,
};
@@ -992,90 +423,29 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
.priority = 1,
};
-static const char banner[] __initconst =
- KERN_INFO "IPv4 over IPv4 tunneling driver\n";
-
-static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
-{
- int prio;
-
- for (prio = 1; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ipn->tunnels[prio][h]);
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
-}
-
static int __net_init ipip_init_net(struct net *net)
{
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- struct ip_tunnel *t;
- int err;
-
- ipn->tunnels[0] = ipn->tunnels_wc;
- ipn->tunnels[1] = ipn->tunnels_l;
- ipn->tunnels[2] = ipn->tunnels_r;
- ipn->tunnels[3] = ipn->tunnels_r_l;
-
- ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
- "tunl0",
- ipip_tunnel_setup);
- if (!ipn->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ipn->fb_tunnel_dev, net);
-
- err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
- if (err)
- goto err_reg_dev;
-
- if ((err = register_netdev(ipn->fb_tunnel_dev)))
- goto err_reg_dev;
-
- t = netdev_priv(ipn->fb_tunnel_dev);
-
- strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
- return 0;
-
-err_reg_dev:
- ipip_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
- /* nothing */
- return err;
+ return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
static void __net_exit ipip_exit_net(struct net *net)
{
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- LIST_HEAD(list);
-
- rtnl_lock();
- ipip_destroy_tunnels(ipn, &list);
- unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ ip_tunnel_delete_net(itn);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
.exit = ipip_exit_net,
.id = &ipip_net_id,
- .size = sizeof(struct ipip_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int __init ipip_init(void)
{
int err;
- printk(banner);
+ pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
err = register_pernet_device(&ipip_net_ops);
if (err < 0)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5f95b3aa579e..9d9610ae7855 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -61,7 +61,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
@@ -626,9 +626,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -ETIMEDOUT;
memset(&e->msg, 0, sizeof(e->msg));
@@ -910,14 +910,14 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
- if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
+ if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
nlh->nlmsg_len = skb_tail_pointer(skb) -
(u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -EMSGSIZE;
memset(&e->msg, 0, sizeof(e->msg));
}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ce2d43e1f09f..0d755c50994b 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,19 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
-config IP_NF_QUEUE
- tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
- depends on NETFILTER_ADVANCED
- help
- Netfilter has the ability to queue packets to user space: the
- netlink device can be used to access them using this driver.
-
- This option enables the old IPv4-only "ip_queue" implementation
- which has been obsoleted by the new "nfnetlink_queue" code (see
- CONFIG_NETFILTER_NETLINK_QUEUE).
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 79ca5e70d497..eadab1ed6500 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -48,9 +48,7 @@ static int __net_init arptable_filter_net_init(struct net *net)
net->ipv4.arptable_filter =
arpt_register_table(net, &packet_filter, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.arptable_filter))
- return PTR_ERR(net->ipv4.arptable_filter);
- return 0;
+ return PTR_RET(net->ipv4.arptable_filter);
}
static void __net_exit arptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 3efcf87400c3..e391db1f056d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -182,8 +182,7 @@ ipt_get_target_c(const struct ipt_entry *e)
return ipt_get_target((struct ipt_entry *)e);
}
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
static const char *const hooknames[] = {
[NF_INET_PRE_ROUTING] = "PREROUTING",
[NF_INET_LOCAL_IN] = "INPUT",
@@ -259,6 +258,7 @@ static void trace_packet(const struct sk_buff *skb,
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
table_base = private->entries[smp_processor_id()];
root = get_entry(table_base, private->hook_entry[hook]);
@@ -271,7 +271,7 @@ static void trace_packet(const struct sk_buff *skb,
&chainname, &comment, &rulenum) != 0)
break;
- nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
+ nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo,
"TRACE: %s:%s:%s:%u ",
tablename, chainname, comment, rulenum);
}
@@ -361,8 +361,7 @@ ipt_do_table(struct sk_buff *skb,
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(skb, hook, in, out,
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 7d168dcbd135..8799c836ccaa 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -37,7 +37,7 @@
#include <linux/skbuff.h>
#include <linux/kernel.h>
#include <linux/timer.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netdevice.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
@@ -45,6 +45,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <linux/bitops.h>
#include <asm/unaligned.h>
@@ -78,15 +79,23 @@ typedef struct {
struct timer_list timer; /* the timer function */
} ulog_buff_t;
-static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
+static int ulog_net_id __read_mostly;
+struct ulog_net {
+ unsigned int nlgroup[ULOG_MAXNLGROUPS];
+ ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
+ struct sock *nflognl;
+ spinlock_t lock;
+};
-static struct sock *nflognl; /* our socket */
-static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
+static struct ulog_net *ulog_pernet(struct net *net)
+{
+ return net_generic(net, ulog_net_id);
+}
/* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroupnum)
+static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
{
- ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+ ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
pr_debug("ulog_send: timer is deleting\n");
del_timer(&ub->timer);
@@ -103,7 +112,8 @@ static void ulog_send(unsigned int nlgroupnum)
NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
pr_debug("throwing %d packets to netlink group %u\n",
ub->qlen, nlgroupnum + 1);
- netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
+ netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
+ GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -114,13 +124,16 @@ static void ulog_send(unsigned int nlgroupnum)
/* timer function to flush queue in flushtimeout time */
static void ulog_timer(unsigned long data)
{
+ struct ulog_net *ulog = container_of((void *)data,
+ struct ulog_net,
+ nlgroup[*(unsigned int *)data]);
pr_debug("timer function called, calling ulog_send\n");
/* lock to protect against somebody modifying our structure
* from ipt_ulog_target at the same time */
- spin_lock_bh(&ulog_lock);
- ulog_send(data);
- spin_unlock_bh(&ulog_lock);
+ spin_lock_bh(&ulog->lock);
+ ulog_send(ulog, data);
+ spin_unlock_bh(&ulog->lock);
}
static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -160,6 +173,8 @@ static void ipt_ulog_packet(unsigned int hooknum,
size_t size, copy_len;
struct nlmsghdr *nlh;
struct timeval tv;
+ struct net *net = dev_net(in ? in : out);
+ struct ulog_net *ulog = ulog_pernet(net);
/* ffs == find first bit set, necessary because userspace
* is already shifting groupnumber, but we need unshifted.
@@ -172,11 +187,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
else
copy_len = loginfo->copy_range;
- size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+ size = nlmsg_total_size(sizeof(*pm) + copy_len);
- ub = &ulog_buffers[groupnum];
+ ub = &ulog->ulog_buffers[groupnum];
- spin_lock_bh(&ulog_lock);
+ spin_lock_bh(&ulog->lock);
if (!ub->skb) {
if (!(ub->skb = ulog_alloc_skb(size)))
@@ -186,7 +201,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
/* either the queue len is too high or we don't have
* enough room in nlskb left. send it to userspace. */
- ulog_send(groupnum);
+ ulog_send(ulog, groupnum);
if (!(ub->skb = ulog_alloc_skb(size)))
goto alloc_failure;
@@ -260,16 +275,16 @@ static void ipt_ulog_packet(unsigned int hooknum,
if (ub->qlen >= loginfo->qthreshold) {
if (loginfo->qthreshold > 1)
nlh->nlmsg_type = NLMSG_DONE;
- ulog_send(groupnum);
+ ulog_send(ulog, groupnum);
}
out_unlock:
- spin_unlock_bh(&ulog_lock);
+ spin_unlock_bh(&ulog->lock);
return;
alloc_failure:
pr_debug("Error building netlink message\n");
- spin_unlock_bh(&ulog_lock);
+ spin_unlock_bh(&ulog->lock);
}
static unsigned int
@@ -376,54 +391,43 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
.me = THIS_MODULE,
};
-static int __init ulog_tg_init(void)
+static int __net_init ulog_tg_net_init(struct net *net)
{
- int ret, i;
+ int i;
+ struct ulog_net *ulog = ulog_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = ULOG_MAXNLGROUPS,
};
- pr_debug("init module\n");
-
- if (nlbufsiz > 128*1024) {
- pr_warning("Netlink buffer has to be <= 128kB\n");
- return -EINVAL;
- }
-
+ spin_lock_init(&ulog->lock);
/* initialize ulog_buffers */
for (i = 0; i < ULOG_MAXNLGROUPS; i++)
- setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
+ setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, i);
- nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
- if (!nflognl)
+ ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
+ if (!ulog->nflognl)
return -ENOMEM;
- ret = xt_register_target(&ulog_tg_reg);
- if (ret < 0) {
- netlink_kernel_release(nflognl);
- return ret;
- }
if (nflog)
- nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+ nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
return 0;
}
-static void __exit ulog_tg_exit(void)
+static void __net_exit ulog_tg_net_exit(struct net *net)
{
ulog_buff_t *ub;
int i;
-
- pr_debug("cleanup_module\n");
+ struct ulog_net *ulog = ulog_pernet(net);
if (nflog)
- nf_log_unregister(&ipt_ulog_logger);
- xt_unregister_target(&ulog_tg_reg);
- netlink_kernel_release(nflognl);
+ nf_log_unset(net, &ipt_ulog_logger);
+
+ netlink_kernel_release(ulog->nflognl);
/* remove pending timers and free allocated skb's */
for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ub = &ulog_buffers[i];
+ ub = &ulog->ulog_buffers[i];
pr_debug("timer is deleting\n");
del_timer(&ub->timer);
@@ -434,5 +438,50 @@ static void __exit ulog_tg_exit(void)
}
}
+static struct pernet_operations ulog_tg_net_ops = {
+ .init = ulog_tg_net_init,
+ .exit = ulog_tg_net_exit,
+ .id = &ulog_net_id,
+ .size = sizeof(struct ulog_net),
+};
+
+static int __init ulog_tg_init(void)
+{
+ int ret;
+ pr_debug("init module\n");
+
+ if (nlbufsiz > 128*1024) {
+ pr_warn("Netlink buffer has to be <= 128kB\n");
+ return -EINVAL;
+ }
+
+ ret = register_pernet_subsys(&ulog_tg_net_ops);
+ if (ret)
+ goto out_pernet;
+
+ ret = xt_register_target(&ulog_tg_reg);
+ if (ret < 0)
+ goto out_target;
+
+ if (nflog)
+ nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+
+ return 0;
+
+out_target:
+ unregister_pernet_subsys(&ulog_tg_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit ulog_tg_exit(void)
+{
+ pr_debug("cleanup_module\n");
+ if (nflog)
+ nf_log_unregister(&ipt_ulog_logger);
+ xt_unregister_target(&ulog_tg_reg);
+ unregister_pernet_subsys(&ulog_tg_net_ops);
+}
+
module_init(ulog_tg_init);
module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5241d997ab75..c2cd63d2d892 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -187,8 +187,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
if (icmph == NULL) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: short packet ");
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
+ NULL, "nf_ct_icmp: short packet ");
return -NF_ACCEPT;
}
@@ -196,7 +196,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
"nf_ct_icmp: bad HW ICMP checksum ");
return -NF_ACCEPT;
}
@@ -209,7 +209,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
*/
if (icmph->type > NR_ICMP_TYPES) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
"nf_ct_icmp: invalid ICMP type ");
return -NF_ACCEPT;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 32030a24e776..b6f2ea174898 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -224,6 +224,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+ SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6e2851464f8f..550781a17b34 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2311,7 +2311,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index ef54377fb11c..7f4a5cb8f8d0 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -267,7 +267,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
struct tcp_options_received tcp_opt;
- const u8 *hash_location;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -294,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29d9b8e..fa2f63fc453b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,7 +28,7 @@
static int zero;
static int one = 1;
-static int two = 2;
+static int four = 4;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -592,13 +592,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_frto_response",
- .data = &sysctl_tcp_frto_response,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
@@ -733,13 +726,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "tcp_cookie_size",
- .data = &sysctl_tcp_cookie_size,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
@@ -760,7 +746,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
- .extra2 = &two,
+ .extra2 = &four,
},
{
.procname = "udp_mem",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 47e854fcae24..a96f7b586277 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -409,15 +409,6 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- /* TCP Cookie Transactions */
- if (sysctl_tcp_cookie_size > 0) {
- /* Default, cookies without s_data_payload. */
- tp->cookie_values =
- kzalloc(sizeof(*tp->cookie_values),
- sk->sk_allocation);
- if (tp->cookie_values != NULL)
- kref_init(&tp->cookie_values->kref);
- }
/* Presumed zeroed, in order of appearance:
* cookie_in_always, cookie_out_never,
* s_data_constant, s_data_in, s_data_out
@@ -775,7 +766,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
*/
- skb->avail_size = size;
+ skb->reserved_tailroom = skb->end - skb->tail - size;
return skb;
}
__kfree_skb(skb);
@@ -2397,92 +2388,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
- case TCP_COOKIE_TRANSACTIONS: {
- struct tcp_cookie_transactions ctd;
- struct tcp_cookie_values *cvp = NULL;
-
- if (sizeof(ctd) > optlen)
- return -EINVAL;
- if (copy_from_user(&ctd, optval, sizeof(ctd)))
- return -EFAULT;
-
- if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
- ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
- return -EINVAL;
-
- if (ctd.tcpct_cookie_desired == 0) {
- /* default to global value */
- } else if ((0x1 & ctd.tcpct_cookie_desired) ||
- ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
- ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
- return -EINVAL;
- }
-
- if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
- /* Supercedes all other values */
- lock_sock(sk);
- if (tp->cookie_values != NULL) {
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- tp->cookie_values = NULL;
- }
- tp->rx_opt.cookie_in_always = 0; /* false */
- tp->rx_opt.cookie_out_never = 1; /* true */
- release_sock(sk);
- return err;
- }
-
- /* Allocate ancillary memory before locking.
- */
- if (ctd.tcpct_used > 0 ||
- (tp->cookie_values == NULL &&
- (sysctl_tcp_cookie_size > 0 ||
- ctd.tcpct_cookie_desired > 0 ||
- ctd.tcpct_s_data_desired > 0))) {
- cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
- GFP_KERNEL);
- if (cvp == NULL)
- return -ENOMEM;
-
- kref_init(&cvp->kref);
- }
- lock_sock(sk);
- tp->rx_opt.cookie_in_always =
- (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
- tp->rx_opt.cookie_out_never = 0; /* false */
-
- if (tp->cookie_values != NULL) {
- if (cvp != NULL) {
- /* Changed values are recorded by a changed
- * pointer, ensuring the cookie will differ,
- * without separately hashing each value later.
- */
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- } else {
- cvp = tp->cookie_values;
- }
- }
-
- if (cvp != NULL) {
- cvp->cookie_desired = ctd.tcpct_cookie_desired;
-
- if (ctd.tcpct_used > 0) {
- memcpy(cvp->s_data_payload, ctd.tcpct_value,
- ctd.tcpct_used);
- cvp->s_data_desired = ctd.tcpct_used;
- cvp->s_data_constant = 1; /* true */
- } else {
- /* No constant payload data. */
- cvp->s_data_desired = ctd.tcpct_s_data_desired;
- cvp->s_data_constant = 0; /* false */
- }
-
- tp->cookie_values = cvp;
- }
- release_sock(sk);
- return err;
- }
default:
/* fallthru */
break;
@@ -2902,41 +2807,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
- case TCP_COOKIE_TRANSACTIONS: {
- struct tcp_cookie_transactions ctd;
- struct tcp_cookie_values *cvp = tp->cookie_values;
-
- if (get_user(len, optlen))
- return -EFAULT;
- if (len < sizeof(ctd))
- return -EINVAL;
-
- memset(&ctd, 0, sizeof(ctd));
- ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
- TCP_COOKIE_IN_ALWAYS : 0)
- | (tp->rx_opt.cookie_out_never ?
- TCP_COOKIE_OUT_NEVER : 0);
-
- if (cvp != NULL) {
- ctd.tcpct_flags |= (cvp->s_data_in ?
- TCP_S_DATA_IN : 0)
- | (cvp->s_data_out ?
- TCP_S_DATA_OUT : 0);
-
- ctd.tcpct_cookie_desired = cvp->cookie_desired;
- ctd.tcpct_s_data_desired = cvp->s_data_desired;
-
- memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
- cvp->cookie_pair_size);
- ctd.tcpct_used = cvp->cookie_pair_size;
- }
-
- if (put_user(sizeof(ctd), optlen))
- return -EFAULT;
- if (copy_to_user(optval, &ctd, sizeof(ctd)))
- return -EFAULT;
- return 0;
- }
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
@@ -3044,6 +2914,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
SKB_GSO_GRE |
+ SKB_GSO_UDP_TUNNEL |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
@@ -3408,134 +3279,6 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
-/* Each Responder maintains up to two secret values concurrently for
- * efficient secret rollover. Each secret value has 4 states:
- *
- * Generating. (tcp_secret_generating != tcp_secret_primary)
- * Generates new Responder-Cookies, but not yet used for primary
- * verification. This is a short-term state, typically lasting only
- * one round trip time (RTT).
- *
- * Primary. (tcp_secret_generating == tcp_secret_primary)
- * Used both for generation and primary verification.
- *
- * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
- * Used for verification, until the first failure that can be
- * verified by the newer Generating secret. At that time, this
- * cookie's state is changed to Secondary, and the Generating
- * cookie's state is changed to Primary. This is a short-term state,
- * typically lasting only one round trip time (RTT).
- *
- * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
- * Used for secondary verification, after primary verification
- * failures. This state lasts no more than twice the Maximum Segment
- * Lifetime (2MSL). Then, the secret is discarded.
- */
-struct tcp_cookie_secret {
- /* The secret is divided into two parts. The digest part is the
- * equivalent of previously hashing a secret and saving the state,
- * and serves as an initialization vector (IV). The message part
- * serves as the trailing secret.
- */
- u32 secrets[COOKIE_WORKSPACE_WORDS];
- unsigned long expires;
-};
-
-#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
-#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
-#define TCP_SECRET_LIFE (HZ * 600)
-
-static struct tcp_cookie_secret tcp_secret_one;
-static struct tcp_cookie_secret tcp_secret_two;
-
-/* Essentially a circular list, without dynamic allocation. */
-static struct tcp_cookie_secret *tcp_secret_generating;
-static struct tcp_cookie_secret *tcp_secret_primary;
-static struct tcp_cookie_secret *tcp_secret_retiring;
-static struct tcp_cookie_secret *tcp_secret_secondary;
-
-static DEFINE_SPINLOCK(tcp_secret_locker);
-
-/* Select a pseudo-random word in the cookie workspace.
- */
-static inline u32 tcp_cookie_work(const u32 *ws, const int n)
-{
- return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
-}
-
-/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
- * Called in softirq context.
- * Returns: 0 for success.
- */
-int tcp_cookie_generator(u32 *bakery)
-{
- unsigned long jiffy = jiffies;
-
- if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
- spin_lock_bh(&tcp_secret_locker);
- if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
- /* refreshed by another */
- memcpy(bakery,
- &tcp_secret_generating->secrets[0],
- COOKIE_WORKSPACE_WORDS);
- } else {
- /* still needs refreshing */
- get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
-
- /* The first time, paranoia assumes that the
- * randomization function isn't as strong. But,
- * this secret initialization is delayed until
- * the last possible moment (packet arrival).
- * Although that time is observable, it is
- * unpredictably variable. Mash in the most
- * volatile clock bits available, and expire the
- * secret extra quickly.
- */
- if (unlikely(tcp_secret_primary->expires ==
- tcp_secret_secondary->expires)) {
- struct timespec tv;
-
- getnstimeofday(&tv);
- bakery[COOKIE_DIGEST_WORDS+0] ^=
- (u32)tv.tv_nsec;
-
- tcp_secret_secondary->expires = jiffy
- + TCP_SECRET_1MSL
- + (0x0f & tcp_cookie_work(bakery, 0));
- } else {
- tcp_secret_secondary->expires = jiffy
- + TCP_SECRET_LIFE
- + (0xff & tcp_cookie_work(bakery, 1));
- tcp_secret_primary->expires = jiffy
- + TCP_SECRET_2MSL
- + (0x1f & tcp_cookie_work(bakery, 2));
- }
- memcpy(&tcp_secret_secondary->secrets[0],
- bakery, COOKIE_WORKSPACE_WORDS);
-
- rcu_assign_pointer(tcp_secret_generating,
- tcp_secret_secondary);
- rcu_assign_pointer(tcp_secret_retiring,
- tcp_secret_primary);
- /*
- * Neither call_rcu() nor synchronize_rcu() needed.
- * Retiring data is not freed. It is replaced after
- * further (locked) pointer updates, and a quiet time
- * (minimum 1MSL, maximum LIFE - 2MSL).
- */
- }
- spin_unlock_bh(&tcp_secret_locker);
- } else {
- rcu_read_lock_bh();
- memcpy(bakery,
- &rcu_dereference(tcp_secret_generating)->secrets[0],
- COOKIE_WORKSPACE_WORDS);
- rcu_read_unlock_bh();
- }
- return 0;
-}
-EXPORT_SYMBOL(tcp_cookie_generator);
-
void tcp_done(struct sock *sk)
{
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
@@ -3590,7 +3333,6 @@ void __init tcp_init(void)
unsigned long limit;
int max_rshare, max_wshare, cnt;
unsigned int i;
- unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3666,13 +3408,5 @@ void __init tcp_init(void)
tcp_register_congestion_control(&tcp_reno);
- memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
- memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
- tcp_secret_one.expires = jiffy; /* past due */
- tcp_secret_two.expires = jiffy; /* past due */
- tcp_secret_generating = &tcp_secret_one;
- tcp_secret_primary = &tcp_secret_one;
- tcp_secret_retiring = &tcp_secret_two;
- tcp_secret_secondary = &tcp_secret_two;
tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d9bdacce99f..6d9ca35f0c35 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,12 +93,11 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
-int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_early_retrans __read_mostly = 2;
+int sysctl_tcp_early_retrans __read_mostly = 3;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -108,17 +107,15 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
-#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
-#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
-#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@@ -1159,10 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
tcp_highest_sack_seq(tp)))
state->reord = min(fack_count,
state->reord);
-
- /* SACK enhanced F-RTO (RFC4138; Appendix B) */
- if (!after(end_seq, tp->frto_highmark))
- state->flag |= FLAG_ONLY_ORIG_SACKED;
+ if (!after(end_seq, tp->high_seq))
+ state->flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_LOST) {
@@ -1555,7 +1550,6 @@ static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_snd_una)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
TCP_SKB_CB(ack_skb)->sacked);
@@ -1728,12 +1722,6 @@ walk:
start_seq, end_seq, dup_sack);
advance_sp:
- /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
- * due to in-order walk
- */
- if (after(end_seq, tp->frto_highmark))
- state.flag &= ~FLAG_ONLY_ORIG_SACKED;
-
i++;
}
@@ -1750,8 +1738,7 @@ advance_sp:
tcp_verify_left_out(tp);
if ((state.reord < tp->fackets_out) &&
- ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
- (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
+ ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
out:
@@ -1825,197 +1812,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
tp->sacked_out = 0;
}
-static int tcp_is_sackfrto(const struct tcp_sock *tp)
-{
- return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
-}
-
-/* F-RTO can only be used if TCP has never retransmitted anything other than
- * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
- */
-bool tcp_use_frto(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
-
- if (!sysctl_tcp_frto)
- return false;
-
- /* MTU probe and F-RTO won't really play nicely along currently */
- if (icsk->icsk_mtup.probe_size)
- return false;
-
- if (tcp_is_sackfrto(tp))
- return true;
-
- /* Avoid expensive walking of rexmit queue if possible */
- if (tp->retrans_out > 1)
- return false;
-
- skb = tcp_write_queue_head(sk);
- if (tcp_skb_is_last(sk, skb))
- return true;
- skb = tcp_write_queue_next(sk, skb); /* Skips head */
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- return false;
- /* Short-circuit when first non-SACKed skb has been checked */
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- break;
- }
- return true;
-}
-
-/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
- * recovery a bit and use heuristics in tcp_process_frto() to detect if
- * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
- * keep retrans_out counting accurate (with SACK F-RTO, other than head
- * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
- * bits are handled if the Loss state is really to be entered (in
- * tcp_enter_frto_loss).
- *
- * Do like tcp_enter_loss() would; when RTO expires the second time it
- * does:
- * "Reduce ssthresh if it has not yet been made inside this window."
- */
-void tcp_enter_frto(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
- tp->snd_una == tp->high_seq ||
- ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
- !icsk->icsk_retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- /* Our state is too optimistic in ssthresh() call because cwnd
- * is not reduced until tcp_enter_frto_loss() when previous F-RTO
- * recovery has not yet completed. Pattern would be this: RTO,
- * Cumulative ACK, RTO (2xRTO for the same segment does not end
- * up here twice).
- * RFC4138 should be more specific on what to do, even though
- * RTO is quite unlikely to occur after the first Cumulative ACK
- * due to back-off and complexity of triggering events ...
- */
- if (tp->frto_counter) {
- u32 stored_cwnd;
- stored_cwnd = tp->snd_cwnd;
- tp->snd_cwnd = 2;
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = stored_cwnd;
- } else {
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- }
- /* ... in theory, cong.control module could do "any tricks" in
- * ssthresh(), which means that ca_state, lost bits and lost_out
- * counter would have to be faked before the call occurs. We
- * consider that too expensive, unlikely and hacky, so modules
- * using these in ssthresh() must deal these incompatibility
- * issues if they receives CA_EVENT_FRTO and frto_counter != 0
- */
- tcp_ca_event(sk, CA_EVENT_FRTO);
- }
-
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = 0;
-
- skb = tcp_write_queue_head(sk);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- tp->undo_marker = 0;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- tcp_verify_left_out(tp);
-
- /* Too bad if TCP was application limited */
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-
- /* Earlier loss recovery underway (see RFC4138; Appendix B).
- * The last condition is necessary at least in tp->frto_counter case.
- */
- if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
- ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
- after(tp->high_seq, tp->snd_una)) {
- tp->frto_highmark = tp->high_seq;
- } else {
- tp->frto_highmark = tp->snd_nxt;
- }
- tcp_set_ca_state(sk, TCP_CA_Disorder);
- tp->high_seq = tp->snd_nxt;
- tp->frto_counter = 1;
-}
-
-/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
- * which indicates that we should follow the traditional RTO recovery,
- * i.e. mark everything lost and do go-back-N retransmission.
- */
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- tp->lost_out = 0;
- tp->retrans_out = 0;
- if (tcp_is_reno(tp))
- tcp_reset_reno_sack(tp);
-
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- /*
- * Count the retransmission made on RTO correctly (only when
- * waiting for the first ACK and did not get it)...
- */
- if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
- /* For some reason this R-bit might get cleared? */
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out += tcp_skb_pcount(skb);
- /* ...enter this if branch just for the first segment */
- flag |= FLAG_DATA_ACKED;
- } else {
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- tp->undo_marker = 0;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- }
-
- /* Marking forward transmissions that were made after RTO lost
- * can cause unnecessary retransmissions in some scenarios,
- * SACK blocks will mitigate that in some but not in all cases.
- * We used to not mark them but it was causing break-ups with
- * receivers that do only in-order receival.
- *
- * TODO: we could detect presence of such receiver and select
- * different behavior per flow.
- */
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
- }
- }
- tcp_verify_left_out(tp);
-
- tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
- tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->frto_counter = 0;
-
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
- tcp_set_ca_state(sk, TCP_CA_Loss);
- tp->high_seq = tp->snd_nxt;
- TCP_ECN_queue_cwr(tp);
-
- tcp_clear_all_retrans_hints(tp);
-}
-
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
tp->retrans_out = 0;
@@ -2042,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ bool new_recovery = false;
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+ !after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2059,11 +1858,8 @@ void tcp_enter_loss(struct sock *sk, int how)
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- if (!how) {
- /* Push undo marker, if it was plain RTO and nothing
- * was retransmitted. */
- tp->undo_marker = tp->snd_una;
- } else {
+ tp->undo_marker = tp->snd_una;
+ if (how) {
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -2090,8 +1886,14 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
- /* Abort F-RTO algorithm if one is in progress */
- tp->frto_counter = 0;
+
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+ * loss recovery is underway except recurring timeout(s) on
+ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ */
+ tp->frto = sysctl_tcp_frto &&
+ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
}
/* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2150,15 +1952,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
* available, or RTO is scheduled to fire first.
*/
- if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+ (flag & FLAG_ECE) || !tp->srtt)
return false;
delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
return false;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
- tp->early_retrans_delayed = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+ TCP_RTO_MAX);
return true;
}
@@ -2274,10 +2077,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
- /* Do not perform any recovery during F-RTO algorithm */
- if (tp->frto_counter)
- return false;
-
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return true;
@@ -2321,7 +2120,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
* interval if appropriate.
*/
if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
- (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
!tcp_may_send_now(sk))
return !tcp_pause_early_retransmit(sk, flag);
@@ -2638,12 +2437,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
return failed;
}
-/* Undo during loss recovery after partial ACK. */
-static bool tcp_try_undo_loss(struct sock *sk)
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_may_undo(tp)) {
+ if (frto_undo || tcp_may_undo(tp)) {
struct sk_buff *skb;
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
@@ -2657,9 +2456,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
tp->lost_out = 0;
tcp_undo_cwr(sk, true);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ if (frto_undo)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0;
- if (tcp_is_sack(tp))
+ if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
return true;
}
@@ -2681,6 +2483,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
tp->snd_cwnd_cnt = 0;
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
@@ -2758,7 +2561,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
tcp_verify_left_out(tp);
- if (!tp->frto_counter && !tcp_any_retrans_done(sk))
+ if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
@@ -2875,6 +2678,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool recovered = !before(tp->snd_una, tp->high_seq);
+
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+ if (flag & FLAG_ORIG_SACK_ACKED) {
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ tcp_try_undo_loss(sk, true);
+ return;
+ }
+ if (after(tp->snd_nxt, tp->high_seq) &&
+ (flag & FLAG_DATA_SACKED || is_dupack)) {
+ tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+ tp->high_seq = tp->snd_nxt;
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return; /* Step 2.b */
+ tp->frto = 0;
+ }
+ }
+
+ if (recovered) {
+ /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+ icsk->icsk_retransmits = 0;
+ tcp_try_undo_recovery(sk);
+ return;
+ }
+ if (flag & FLAG_DATA_ACKED)
+ icsk->icsk_retransmits = 0;
+ if (tcp_is_reno(tp)) {
+ /* A Reno DUPACK means new data in F-RTO step 2.b above are
+ * delivered. Lower inflight to clock out (re)tranmissions.
+ */
+ if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+ tcp_add_reno_sack(sk);
+ else if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ }
+ if (tcp_try_undo_loss(sk, false))
+ return;
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2921,12 +2776,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
- case TCP_CA_Loss:
- icsk->icsk_retransmits = 0;
- if (tcp_try_undo_recovery(sk))
- return;
- break;
-
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
@@ -2957,18 +2806,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
break;
case TCP_CA_Loss:
- if (flag & FLAG_DATA_ACKED)
- icsk->icsk_retransmits = 0;
- if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
- tcp_reset_reno_sack(tp);
- if (!tcp_try_undo_loss(sk)) {
- tcp_moderate_cwnd(tp);
- tcp_xmit_retransmit_queue(sk);
- return;
- }
+ tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
- /* Loss is undone; fall through to processing in Open state. */
+ /* Fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -3081,6 +2922,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
*/
void tcp_rearm_rto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* If the retrans timer is currently being used by Fast Open
@@ -3094,12 +2936,13 @@ void tcp_rearm_rto(struct sock *sk)
} else {
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
- if (tp->early_retrans_delayed) {
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
/* delta may not be positive if the socket is locked
- * when the delayed ER timer fires and is rescheduled.
+ * when the retrans timer fires and is rescheduled.
*/
if (delta > 0)
rto = delta;
@@ -3107,7 +2950,6 @@ void tcp_rearm_rto(struct sock *sk)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
}
- tp->early_retrans_delayed = 0;
}
/* This function is called when the delayed ER timer fires. TCP enters
@@ -3195,8 +3037,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_RETRANS_DATA_ACKED;
ca_seq_rtt = -1;
seq_rtt = -1;
- if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
- flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else {
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
@@ -3205,6 +3045,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_SACKED_ACKED)
@@ -3405,150 +3247,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
return flag;
}
-/* A very conservative spurious RTO response algorithm: reduce cwnd and
- * continue in congestion avoidance.
- */
-static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
-{
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- tp->snd_cwnd_cnt = 0;
- TCP_ECN_queue_cwr(tp);
- tcp_moderate_cwnd(tp);
-}
-
-/* A conservative spurious RTO response algorithm: reduce cwnd using
- * PRR and continue in congestion avoidance.
- */
-static void tcp_cwr_spur_to_response(struct sock *sk)
-{
- tcp_enter_cwr(sk, 0);
-}
-
-static void tcp_undo_spur_to_response(struct sock *sk, int flag)
-{
- if (flag & FLAG_ECE)
- tcp_cwr_spur_to_response(sk);
- else
- tcp_undo_cwr(sk, true);
-}
-
-/* F-RTO spurious RTO detection algorithm (RFC4138)
- *
- * F-RTO affects during two new ACKs following RTO (well, almost, see inline
- * comments). State (ACK number) is kept in frto_counter. When ACK advances
- * window (but not to or beyond highest sequence sent before RTO):
- * On First ACK, send two new segments out.
- * On Second ACK, RTO was likely spurious. Do spurious response (response
- * algorithm is not part of the F-RTO detection algorithm
- * given in RFC4138 but can be selected separately).
- * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
- * of Nagle, this is done using frto_counter states 2 and 3, when a new data
- * segment of any size sent during F-RTO, state 2 is upgraded to 3.
- *
- * Rationale: if the RTO was spurious, new ACKs should arrive from the
- * original window even after we transmit two new data segments.
- *
- * SACK version:
- * on first step, wait until first cumulative ACK arrives, then move to
- * the second step. In second step, the next ACK decides.
- *
- * F-RTO is implemented (mainly) in four functions:
- * - tcp_use_frto() is used to determine if TCP is can use F-RTO
- * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
- * called when tcp_use_frto() showed green light
- * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
- * - tcp_enter_frto_loss() is called if there is not enough evidence
- * to prove that the RTO is indeed spurious. It transfers the control
- * from F-RTO to the conventional RTO recovery
- */
-static bool tcp_process_frto(struct sock *sk, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_verify_left_out(tp);
-
- /* Duplicate the behavior from Loss state (fastretrans_alert) */
- if (flag & FLAG_DATA_ACKED)
- inet_csk(sk)->icsk_retransmits = 0;
-
- if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
- ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
- tp->undo_marker = 0;
-
- if (!before(tp->snd_una, tp->frto_highmark)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
- return true;
- }
-
- if (!tcp_is_sackfrto(tp)) {
- /* RFC4138 shortcoming in step 2; should also have case c):
- * ACK isn't duplicate nor advances window, e.g., opposite dir
- * data, winupdate
- */
- if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
- return true;
-
- if (!(flag & FLAG_DATA_ACKED)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
- flag);
- return true;
- }
- } else {
- if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
- if (!tcp_packets_in_flight(tp)) {
- tcp_enter_frto_loss(sk, 2, flag);
- return true;
- }
-
- /* Prevent sending of new data. */
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp));
- return true;
- }
-
- if ((tp->frto_counter >= 2) &&
- (!(flag & FLAG_FORWARD_PROGRESS) ||
- ((flag & FLAG_DATA_SACKED) &&
- !(flag & FLAG_ONLY_ORIG_SACKED)))) {
- /* RFC4138 shortcoming (see comment above) */
- if (!(flag & FLAG_FORWARD_PROGRESS) &&
- (flag & FLAG_NOT_DUP))
- return true;
-
- tcp_enter_frto_loss(sk, 3, flag);
- return true;
- }
- }
-
- if (tp->frto_counter == 1) {
- /* tcp_may_send_now needs to see updated state */
- tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
- tp->frto_counter = 2;
-
- if (!tcp_may_send_now(sk))
- tcp_enter_frto_loss(sk, 2, flag);
-
- return true;
- } else {
- switch (sysctl_tcp_frto_response) {
- case 2:
- tcp_undo_spur_to_response(sk, flag);
- break;
- case 1:
- tcp_conservative_spur_to_response(tp);
- break;
- default:
- tcp_cwr_spur_to_response(sk);
- break;
- }
- tp->frto_counter = 0;
- tp->undo_marker = 0;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
- }
- return false;
-}
-
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk)
{
@@ -3567,6 +3265,38 @@ static void tcp_send_challenge_ack(struct sock *sk)
}
}
+/* This routine deals with acks during a TLP episode.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
+ !(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DATA_SACKED));
+
+ /* Mark the end of TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
+ */
+ if (is_tlp_dupack) {
+ tp->tlp_high_seq = 0;
+ return;
+ }
+
+ if (after(ack, tp->tlp_high_seq)) {
+ tp->tlp_high_seq = 0;
+ /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
+ if (!(flag & FLAG_DSACKING_ACK)) {
+ tcp_init_cwnd_reduction(sk, true);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ tcp_end_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
+ }
+ }
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3581,7 +3311,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets;
int prior_sacked = tp->sacked_out;
int pkts_acked = 0;
- bool frto_cwnd = false;
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3601,7 +3330,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- if (tp->early_retrans_delayed)
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
if (after(ack, prior_snd_una))
@@ -3654,30 +3384,29 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
pkts_acked = prior_packets - tp->packets_out;
- if (tp->frto_counter)
- frto_cwnd = tcp_process_frto(sk, flag);
- /* Guarantee sacktag reordering detection against wrap-arounds */
- if (before(tp->frto_highmark, tp->snd_una))
- tp->frto_highmark = 0;
-
if (tcp_ack_is_dubious(sk, flag)) {
/* Advance CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
- tcp_may_raise_cwnd(sk, flag))
+ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
is_dupack, flag);
} else {
- if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
+ if (flag & FLAG_DATA_ACKED)
tcp_cong_avoid(sk, ack, prior_in_flight);
}
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
dst_confirm(dst);
}
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+ tcp_schedule_loss_probe(sk);
return 1;
no_queue:
@@ -3691,6 +3420,9 @@ no_queue:
*/
if (tcp_send_head(sk))
tcp_ack_probe(sk);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
return 1;
invalid_ack:
@@ -3715,8 +3447,8 @@ old_ack:
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
- const u8 **hvpp, int estab,
+void tcp_parse_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx, int estab,
struct tcp_fastopen_cookie *foc)
{
const unsigned char *ptr;
@@ -3800,31 +3532,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
*/
break;
#endif
- case TCPOPT_COOKIE:
- /* This option is variable length.
- */
- switch (opsize) {
- case TCPOLEN_COOKIE_BASE:
- /* not yet implemented */
- break;
- case TCPOLEN_COOKIE_PAIR:
- /* not yet implemented */
- break;
- case TCPOLEN_COOKIE_MIN+0:
- case TCPOLEN_COOKIE_MIN+2:
- case TCPOLEN_COOKIE_MIN+4:
- case TCPOLEN_COOKIE_MIN+6:
- case TCPOLEN_COOKIE_MAX:
- /* 16-bit multiple */
- opt_rx->cookie_plus = opsize;
- *hvpp = ptr;
- break;
- default:
- /* ignore option */
- break;
- }
- break;
-
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number. It's valid only in
@@ -3870,8 +3577,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
* If it is wrong it falls back on tcp_parse_options().
*/
static bool tcp_fast_parse_options(const struct sk_buff *skb,
- const struct tcphdr *th,
- struct tcp_sock *tp, const u8 **hvpp)
+ const struct tcphdr *th, struct tcp_sock *tp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
@@ -3885,7 +3591,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
return true;
}
- tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
+ tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
if (tp->rx_opt.saw_tstamp)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@@ -5266,12 +4972,10 @@ out:
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
- const u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
- tp->rx_opt.saw_tstamp &&
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5625,12 +5329,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
- const u8 *hash_location;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
tcp_clear_options(&opt);
opt.user_mss = opt.mss_clamp = 0;
- tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+ tcp_parse_options(synack, &opt, 0, NULL);
mss = opt.mss_clamp;
}
@@ -5661,14 +5364,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
- const u8 *hash_location;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_cookie_values *cvp = tp->cookie_values;
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
+ tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@@ -5765,30 +5466,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
- if (cvp != NULL &&
- cvp->cookie_pair_size > 0 &&
- tp->rx_opt.cookie_plus > 0) {
- int cookie_size = tp->rx_opt.cookie_plus
- - TCPOLEN_COOKIE_BASE;
- int cookie_pair_size = cookie_size
- + cvp->cookie_desired;
-
- /* A cookie extension option was sent and returned.
- * Note that each incoming SYNACK replaces the
- * Responder cookie. The initial exchange is most
- * fragile, as protection against spoofing relies
- * entirely upon the sequence and timestamp (above).
- * This replacement strategy allows the correct pair to
- * pass through, while any others will be filtered via
- * Responder verification later.
- */
- if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
- memcpy(&cvp->cookie_pair[cvp->cookie_desired],
- hash_location, cookie_size);
- cvp->cookie_pair_size = cookie_pair_size;
- }
- }
-
smp_mb();
tcp_finish_connect(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4a8ec457310f..2278669b1d85 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -274,13 +274,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
u32 mtu = tcp_sk(sk)->mtu_info;
- /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
- * send out by Linux are always <576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == TCP_LISTEN)
- return;
-
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -408,6 +401,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
tp->mtu_info = info;
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
@@ -838,7 +838,6 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp,
u16 queue_mapping,
bool nocache)
{
@@ -851,7 +850,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, rvp, NULL);
+ skb = tcp_make_synack(sk, dst, req, NULL);
if (skb) {
__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -868,10 +867,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
return err;
}
-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
- struct request_values *rvp)
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
- int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
+ int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
if (!res)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -1371,8 +1369,7 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
static int tcp_v4_conn_req_fastopen(struct sock *sk,
struct sk_buff *skb,
struct sk_buff *skb_synack,
- struct request_sock *req,
- struct request_values *rvp)
+ struct request_sock *req)
{
struct tcp_sock *tp = tcp_sk(sk);
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
@@ -1467,9 +1464,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct request_sock *req;
struct inet_request_sock *ireq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -1519,42 +1514,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
- want_cookie ? NULL : &foc);
-
- if (tmp_opt.cookie_plus > 0 &&
- tmp_opt.saw_tstamp &&
- !tp->rx_opt.cookie_out_never &&
- (sysctl_tcp_cookie_size > 0 ||
- (tp->cookie_values != NULL &&
- tp->cookie_values->cookie_desired > 0))) {
- u8 *c;
- u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
- int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
-
- if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
- goto drop_and_release;
-
- /* Secret recipe starts with IP addresses */
- *mess++ ^= (__force u32)daddr;
- *mess++ ^= (__force u32)saddr;
-
- /* plus variable length Initiator Cookie */
- c = (u8 *)mess;
- while (l-- > 0)
- *c++ ^= *hash_location++;
-
- want_cookie = false; /* not our kind of cookie */
- tmp_ext.cookie_out_never = 0; /* false */
- tmp_ext.cookie_plus = tmp_opt.cookie_plus;
- } else if (!tp->rx_opt.cookie_in_always) {
- /* redundant indications, but ensure initialization. */
- tmp_ext.cookie_out_never = 1; /* true */
- tmp_ext.cookie_plus = 0;
- } else {
- goto drop_and_release;
- }
- tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
@@ -1636,7 +1596,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* of tcp_v4_send_synack()->tcp_select_initial_window().
*/
skb_synack = tcp_make_synack(sk, dst, req,
- (struct request_values *)&tmp_ext,
fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
if (skb_synack) {
@@ -1660,8 +1619,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (fastopen_cookie_present(&foc) && foc.len != 0)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
- (struct request_values *)&tmp_ext))
+ } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
goto drop_and_free;
return 0;
@@ -1950,6 +1908,50 @@ void tcp_v4_early_demux(struct sk_buff *skb)
}
}
+/* Packet is added to VJ-style prequeue for processing in process
+ * context, if a reader task is waiting. Apparently, this exciting
+ * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
+ * failed somewhere. Latency? Burstiness? Well, at least now we will
+ * see, why it failed. 8)8) --ANK
+ *
+ */
+bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sysctl_tcp_low_latency || !tp->ucopy.task)
+ return false;
+
+ if (skb->len <= tcp_hdrlen(skb) &&
+ skb_queue_len(&tp->ucopy.prequeue) == 0)
+ return false;
+
+ __skb_queue_tail(&tp->ucopy.prequeue, skb);
+ tp->ucopy.memory += skb->truesize;
+ if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ struct sk_buff *skb1;
+
+ BUG_ON(sock_owned_by_user(sk));
+
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ sk_backlog_rcv(sk, skb1);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPPREQUEUEDROPPED);
+ }
+
+ tp->ucopy.memory = 0;
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+ if (!inet_csk_ack_scheduled(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+ }
+ return true;
+}
+EXPORT_SYMBOL(tcp_prequeue);
+
/*
* From tcp_input.c
*/
@@ -2197,12 +2199,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- /* TCP Cookie Transactions */
- if (tp->cookie_values != NULL) {
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- tp->cookie_values = NULL;
- }
BUG_ON(tp->fastopen_rsk != NULL);
/* If socket is aborted during connect operation */
@@ -2659,7 +2655,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index b6f3583ddfe8..d52196f56862 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -72,8 +72,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
tcp = tcp_from_cgproto(cg_proto);
percpu_counter_destroy(&tcp->tcp_sockets_allocated);
-
- val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b83a49cc3816..05eaf8904613 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -93,13 +93,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
const struct tcphdr *th)
{
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
@@ -388,32 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp = tcp_sk(newsk);
- struct tcp_sock *oldtp = tcp_sk(sk);
- struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
-
- /* TCP Cookie Transactions require space for the cookie pair,
- * as it differs for each connection. There is no need to
- * copy any s_data_payload stored at the original socket.
- * Failure will prevent resuming the connection.
- *
- * Presumed copied, in order of appearance:
- * cookie_in_always, cookie_out_never
- */
- if (oldcvp != NULL) {
- struct tcp_cookie_values *newcvp =
- kzalloc(sizeof(*newtp->cookie_values),
- GFP_ATOMIC);
-
- if (newcvp != NULL) {
- kref_init(&newcvp->kref);
- newcvp->cookie_desired =
- oldcvp->cookie_desired;
- newtp->cookie_values = newcvp;
- } else {
- /* Not Yet Implemented */
- newtp->cookie_values = NULL;
- }
- }
/* Now setup tcp_sock */
newtp->pred_flags = 0;
@@ -422,8 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up =
- treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
INIT_LIST_HEAD(&newtp->tsq_node);
@@ -440,6 +412,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
+ newtp->tlp_high_seq = 0;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -449,9 +422,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- newtp->frto_counter = 0;
- newtp->frto_highmark = 0;
-
if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
!try_module_get(newicsk->icsk_ca_ops->owner))
newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -459,8 +429,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->write_seq = newtp->pushed_seq =
- treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
@@ -537,7 +506,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool fastopen)
{
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
@@ -547,7 +515,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
@@ -647,7 +615,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*/
if ((flg & TCP_FLAG_ACK) && !fastopen &&
(TCP_SKB_CB(skb)->ack_seq !=
- tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
+ tcp_rsk(req)->snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e2b4461074da..af354c98fdb5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,27 +65,22 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
-int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
-EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
-
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- /* Don't override Nagle indefinitely with F-RTO */
- if (tp->frto_counter == 2)
- tp->frto_counter = 3;
-
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets || tp->early_retrans_delayed)
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
}
@@ -384,7 +379,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
-#define OPTION_COOKIE_EXTENSION (1 << 4)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
struct tcp_out_options {
@@ -398,36 +392,6 @@ struct tcp_out_options {
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
};
-/* The sysctl int routines are generic, so check consistency here.
- */
-static u8 tcp_cookie_size_check(u8 desired)
-{
- int cookie_size;
-
- if (desired > 0)
- /* previously specified */
- return desired;
-
- cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
- if (cookie_size <= 0)
- /* no default specified */
- return 0;
-
- if (cookie_size <= TCP_COOKIE_MIN)
- /* value too small, specify minimum */
- return TCP_COOKIE_MIN;
-
- if (cookie_size >= TCP_COOKIE_MAX)
- /* value too large, specify maximum */
- return TCP_COOKIE_MAX;
-
- if (cookie_size & 1)
- /* 8-bit multiple, illegal, fix it */
- cookie_size++;
-
- return (u8)cookie_size;
-}
-
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -446,27 +410,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
{
u16 options = opts->options; /* mungable copy */
- /* Having both authentication and cookies for security is redundant,
- * and there's certainly not enough room. Instead, the cookie-less
- * extension variant is proposed.
- *
- * Consider the pessimal case with authentication. The options
- * could look like:
- * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
- */
if (unlikely(OPTION_MD5 & options)) {
- if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
- *ptr++ = htonl((TCPOPT_COOKIE << 24) |
- (TCPOLEN_COOKIE_BASE << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- } else {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- }
- options &= ~OPTION_COOKIE_EXTENSION;
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
/* overload cookie hash location */
opts->hash_location = (__u8 *)ptr;
ptr += 4;
@@ -495,44 +441,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
- /* Specification requires after timestamp, so do it now.
- *
- * Consider the pessimal case without authentication. The options
- * could look like:
- * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
- */
- if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
- __u8 *cookie_copy = opts->hash_location;
- u8 cookie_size = opts->hash_size;
-
- /* 8-bit multiple handled in tcp_cookie_size_check() above,
- * and elsewhere.
- */
- if (0x2 & cookie_size) {
- __u8 *p = (__u8 *)ptr;
-
- /* 16-bit multiple */
- *p++ = TCPOPT_COOKIE;
- *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
- *p++ = *cookie_copy++;
- *p++ = *cookie_copy++;
- ptr++;
- cookie_size -= 2;
- } else {
- /* 32-bit multiple */
- *ptr++ = htonl(((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_COOKIE << 8) |
- TCPOLEN_COOKIE_BASE) +
- cookie_size);
- }
-
- if (cookie_size > 0) {
- memcpy(ptr, cookie_copy, cookie_size);
- ptr += (cookie_size / 4);
- }
- }
-
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -591,11 +499,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_md5sig_key **md5)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_cookie_values *cvp = tp->cookie_values;
unsigned int remaining = MAX_TCP_OPTION_SPACE;
- u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
- tcp_cookie_size_check(cvp->cookie_desired) :
- 0;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
#ifdef CONFIG_TCP_MD5SIG
@@ -647,52 +551,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
tp->syn_fastopen = 1;
}
}
- /* Note that timestamps are required by the specification.
- *
- * Odd numbers of bytes are prohibited by the specification, ensuring
- * that the cookie is 16-bit aligned, and the resulting cookie pair is
- * 32-bit aligned.
- */
- if (*md5 == NULL &&
- (OPTION_TS & opts->options) &&
- cookie_size > 0) {
- int need = TCPOLEN_COOKIE_BASE + cookie_size;
-
- if (0x2 & need) {
- /* 32-bit multiple */
- need += 2; /* NOPs */
-
- if (need > remaining) {
- /* try shrinking cookie to fit */
- cookie_size -= 2;
- need -= 4;
- }
- }
- while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
- cookie_size -= 4;
- need -= 4;
- }
- if (TCP_COOKIE_MIN <= cookie_size) {
- opts->options |= OPTION_COOKIE_EXTENSION;
- opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
- opts->hash_size = cookie_size;
-
- /* Remember for future incarnations. */
- cvp->cookie_desired = cookie_size;
-
- if (cvp->cookie_desired != cvp->cookie_pair_size) {
- /* Currently use random bytes as a nonce,
- * assuming these are completely unpredictable
- * by hostile users of the same system.
- */
- get_random_bytes(&cvp->cookie_pair[0],
- cookie_size);
- cvp->cookie_pair_size = cookie_size;
- }
- remaining -= need;
- }
- }
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -702,14 +561,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5,
- struct tcp_extend_values *xvp,
struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
- u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
- xvp->cookie_plus :
- 0;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
@@ -757,28 +612,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
remaining -= need;
}
}
- /* Similar rationale to tcp_syn_options() applies here, too.
- * If the <SYN> options fit, the same options should fit now!
- */
- if (*md5 == NULL &&
- ireq->tstamp_ok &&
- cookie_plus > TCPOLEN_COOKIE_BASE) {
- int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
-
- if (0x2 & need) {
- /* 32-bit multiple */
- need += 2; /* NOPs */
- }
- if (need <= remaining) {
- opts->options |= OPTION_COOKIE_EXTENSION;
- opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
- remaining -= need;
- } else {
- /* There's no error return, so flag it. */
- xvp->cookie_out_never = 1; /* true */
- opts->hash_size = 0;
- }
- }
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -1298,7 +1132,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
eat = min_t(int, len, skb_headlen(skb));
if (eat) {
__skb_pull(skb, eat);
- skb->avail_size -= eat;
len -= eat;
if (!len)
return;
@@ -1633,11 +1466,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
if (nonagle & TCP_NAGLE_PUSH)
return true;
- /* Don't use the nagle rule for urgent data (or for the final FIN).
- * Nagle can be ignored during F-RTO too (see RFC4138).
- */
- if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
- (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
return true;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1810,8 +1640,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
goto send_now;
}
- /* Ok, it looks like it is advisable to defer. */
- tp->tso_deferred = 1 | (jiffies << 1);
+ /* Ok, it looks like it is advisable to defer.
+ * Do not rearm the timer if already set to not break TCP ACK clocking.
+ */
+ if (!tp->tso_deferred)
+ tp->tso_deferred = 1 | (jiffies << 1);
return true;
@@ -1959,6 +1792,9 @@ static int tcp_mtu_probe(struct sock *sk)
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*/
@@ -1994,8 +1830,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
goto repair; /* Skip network transmission */
cwnd_quota = tcp_cwnd_test(tp, skb);
- if (!cwnd_quota)
- break;
+ if (!cwnd_quota) {
+ if (push_one == 2)
+ /* Force out a loss probe pkt. */
+ cwnd_quota = 1;
+ else
+ break;
+ }
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
@@ -2049,10 +1890,129 @@ repair:
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
+
+ /* Send one loss probe per tail loss episode. */
+ if (push_one != 2)
+ tcp_schedule_loss_probe(sk);
tcp_cwnd_validate(sk);
return false;
}
- return !tp->packets_out && tcp_send_head(sk);
+ return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, tlp_time_stamp, rto_time_stamp;
+ u32 rtt = tp->srtt >> 3;
+
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+ return false;
+ /* No consecutive loss probes. */
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+ tcp_rearm_rto(sk);
+ return false;
+ }
+ /* Don't do any loss probe on a Fast Open connection before 3WHS
+ * finishes.
+ */
+ if (sk->sk_state == TCP_SYN_RECV)
+ return false;
+
+ /* TLP is only scheduled when next timer event is RTO. */
+ if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+ return false;
+
+ /* Schedule a loss probe in 2*RTT for SACK capable connections
+ * in Open state, that are either limited by cwnd or application.
+ */
+ if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+ !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ return false;
+
+ if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+ tcp_send_head(sk))
+ return false;
+
+ /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ * for delayed ack when there's one outstanding packet.
+ */
+ timeout = rtt << 1;
+ if (tp->packets_out == 1)
+ timeout = max_t(u32, timeout,
+ (rtt + (rtt >> 1) + TCP_DELACK_MAX));
+ timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+ /* If RTO is shorter, just schedule TLP in its place. */
+ tlp_time_stamp = tcp_time_stamp + timeout;
+ rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+ if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+ s32 delta = rto_time_stamp - tcp_time_stamp;
+ if (delta > 0)
+ timeout = delta;
+ }
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int pcount;
+ int mss = tcp_current_mss(sk);
+ int err = -1;
+
+ if (tcp_send_head(sk) != NULL) {
+ err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ goto rearm_timer;
+ }
+
+ /* At most one outstanding TLP retransmission. */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ /* Retransmit last segment. */
+ skb = tcp_write_queue_tail(sk);
+ if (WARN_ON(!skb))
+ goto rearm_timer;
+
+ pcount = tcp_skb_pcount(skb);
+ if (WARN_ON(!pcount))
+ goto rearm_timer;
+
+ if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+ if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+ goto rearm_timer;
+ skb = tcp_write_queue_tail(sk);
+ }
+
+ if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+ goto rearm_timer;
+
+ /* Probe with zero data doesn't trigger fast recovery. */
+ if (skb->len > 0)
+ err = __tcp_retransmit_skb(sk, skb);
+
+ /* Record snd_nxt for loss detection. */
+ if (likely(!err))
+ tp->tlp_high_seq = tp->snd_nxt;
+
+rearm_timer:
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+
+ if (likely(!err))
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBES);
+ return;
}
/* Push out any pending frames which were held back due to
@@ -2673,32 +2633,24 @@ int tcp_send_synack(struct sock *sk)
* sk: listener socket
* dst: dst entry attached to the SYNACK
* req: request_sock pointer
- * rvp: request_values pointer
*
* Allocate one skb and build a SYNACK packet.
* @dst is consumed : Caller should not use it again.
*/
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp,
struct tcp_fastopen_cookie *foc)
{
struct tcp_out_options opts;
- struct tcp_extend_values *xvp = tcp_xv(rvp);
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
- const struct tcp_cookie_values *cvp = tp->cookie_values;
struct tcphdr *th;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
int tcp_header_size;
int mss;
- int s_data_desired = 0;
- if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
- s_data_desired = cvp->s_data_desired;
- skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
- sk_gfp_atomic(sk, GFP_ATOMIC));
+ skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC));
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2740,9 +2692,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
else
#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_header_size = tcp_synack_options(sk, req, mss,
- skb, &opts, &md5, xvp, foc)
- + sizeof(*th);
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+ foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2760,40 +2711,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
TCPHDR_SYN | TCPHDR_ACK);
- if (OPTION_COOKIE_EXTENSION & opts.options) {
- if (s_data_desired) {
- u8 *buf = skb_put(skb, s_data_desired);
-
- /* copy data directly from the listening socket. */
- memcpy(buf, cvp->s_data_payload, s_data_desired);
- TCP_SKB_CB(skb)->end_seq += s_data_desired;
- }
-
- if (opts.hash_size > 0) {
- __u32 workspace[SHA_WORKSPACE_WORDS];
- u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
- u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
-
- /* Secret recipe depends on the Timestamp, (future)
- * Sequence and Acknowledgment Numbers, Initiator
- * Cookie, and others handled by IP variant caller.
- */
- *tail-- ^= opts.tsval;
- *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
- *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
-
- /* recommended */
- *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
- *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
-
- sha_transform((__u32 *)&xvp->cookie_bakery[0],
- (char *)mess,
- &workspace[0]);
- opts.hash_location =
- (__u8 *)&xvp->cookie_bakery[0];
- }
- }
-
th->seq = htonl(TCP_SKB_CB(skb)->seq);
/* XXX data is queued and acked as is. No buffer/window check */
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac30c498..4b85e6f636c9 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- if (tp->early_retrans_delayed) {
- tcp_resume_early_retransmit(sk);
- return;
- }
if (tp->fastopen_rsk) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
@@ -360,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk)
WARN_ON(tcp_write_queue_empty(sk));
+ tp->tlp_high_seq = 0;
+
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
/* Receiver dastardly shrinks window. Our retransmits
@@ -418,11 +416,7 @@ void tcp_retransmit_timer(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
}
- if (tcp_use_frto(sk)) {
- tcp_enter_frto(sk);
- } else {
- tcp_enter_loss(sk, 0);
- }
+ tcp_enter_loss(sk, 0);
if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
/* Retransmission failed because of local congestion,
@@ -495,13 +489,20 @@ void tcp_write_timer_handler(struct sock *sk)
}
event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
switch (event) {
+ case ICSK_TIME_EARLY_RETRANS:
+ tcp_resume_early_retransmit(sk);
+ break;
+ case ICSK_TIME_LOSS_PROBE:
+ tcp_send_loss_probe(sk);
+ break;
case ICSK_TIME_RETRANS:
+ icsk->icsk_pending = 0;
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
+ icsk->icsk_pending = 0;
tcp_probe_timer(sk);
break;
}
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 1b91bf48e277..76a1e23259e1 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
break;
- case CA_EVENT_FRTO:
+ case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
/* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 265c42cf963c..7117d1467b02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1762,9 +1762,16 @@ int udp_rcv(struct sk_buff *skb)
void udp_destroy_sock(struct sock *sk)
{
+ struct udp_sock *up = udp_sk(sk);
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
}
/*
@@ -2272,31 +2279,88 @@ void __init udp_init(void)
int udp4_ufo_send_check(struct sk_buff *skb)
{
- const struct iphdr *iph;
- struct udphdr *uh;
-
- if (!pskb_may_pull(skb, sizeof(*uh)))
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
return -EINVAL;
- iph = ip_hdr(skb);
- uh = udp_hdr(skb);
+ if (likely(!skb->encapsulation)) {
+ const struct iphdr *iph;
+ struct udphdr *uh;
- uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- IPPROTO_UDP, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
+ IPPROTO_UDP, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ }
return 0;
}
+static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ int outer_hlen;
+ netdev_features_t enc_features;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ outer_hlen = skb_tnl_header_len(skb);
+ skb = segs;
+ do {
+ struct udphdr *uh;
+ int udp_offset = outer_hlen - tnl_hlen;
+
+ skb->mac_len = mac_len;
+
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ uh = udp_hdr(skb);
+ uh->len = htons(skb->len - udp_offset);
+
+ /* csum segment if tunnel sets skb with csum. */
+ if (unlikely(uh->check)) {
+ struct iphdr *iph = ip_hdr(skb);
+
+ uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - udp_offset,
+ IPPROTO_UDP, 0);
+ uh->check = csum_fold(skb_checksum(skb, udp_offset,
+ skb->len - udp_offset, 0));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ }
+ skb->ip_summed = CHECKSUM_NONE;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
- int offset;
- __wsum csum;
-
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
@@ -2306,6 +2370,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
int type = skb_shinfo(skb)->gso_type;
if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
@@ -2316,20 +2381,27 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
- /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
- * do checksum of UDP packets sent as multiple IP fragments.
- */
- offset = skb_checksum_start_offset(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
- skb->ip_summed = CHECKSUM_NONE;
-
/* Fragment the skb. IP headers of the fragments are updated in
* inet_gso_segment()
*/
- segs = skb_segment(skb, features);
+ if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL)
+ segs = skb_udp_tunnel_segment(skb, features);
+ else {
+ int offset;
+ __wsum csum;
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+ offset = skb_checksum_start_offset(skb);
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ offset += skb->csum_offset;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ segs = skb_segment(skb, features);
+ }
out:
return segs;
}
-
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 505b30ad9182..369a781851ad 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -64,9 +64,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
goto out;
err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) +
- 64)), GFP_KERNEL);
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
if (!rep)
goto out;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ed0b9e2e797a..11b13ea69db4 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -156,6 +156,7 @@ config INET6_XFRM_MODE_ROUTEOPTIMIZATION
config IPV6_SIT
tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)"
select INET_TUNNEL
+ select NET_IP_TUNNEL
select IPV6_NDISC_NODETYPE
default y
---help---
@@ -201,6 +202,7 @@ config IPV6_TUNNEL
config IPV6_GRE
tristate "IPv6: GRE tunnel"
select IPV6_TUNNEL
+ select NET_IP_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f2c7e615f902..28b61e89bbb8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -70,6 +70,7 @@
#include <net/snmp.h>
#include <net/af_ieee802154.h>
+#include <net/firewire.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/ndisc.h>
@@ -421,6 +422,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ipv6_regen_rndid((unsigned long) ndev);
}
#endif
+ ndev->token = in6addr_any;
if (netif_running(dev) && addrconf_qdisc_ok(dev))
ndev->if_flags |= IF_READY;
@@ -544,8 +546,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
};
static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
- struct nlmsghdr *nlh,
- void *arg)
+ struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[NETCONFA_MAX+1];
@@ -605,6 +606,77 @@ errout:
return err;
}
+static int inet6_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto cont;
+
+ if (inet6_netconf_fill_devconf(skb, dev->ifindex,
+ &idev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
#ifdef CONFIG_SYSCTL
static void dev_forward_change(struct inet6_dev *idev)
{
@@ -806,6 +878,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
ifa->prefix_len = pfxlen;
ifa->flags = flags | IFA_F_TENTATIVE;
ifa->cstamp = ifa->tstamp = jiffies;
+ ifa->tokenized = false;
ifa->rt = rt;
@@ -1668,6 +1741,20 @@ static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
return 0;
}
+static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
+{
+ union fwnet_hwaddr *ha;
+
+ if (dev->addr_len != FWNET_ALEN)
+ return -1;
+
+ ha = (union fwnet_hwaddr *)dev->dev_addr;
+
+ memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id));
+ eui[0] ^= 2;
+ return 0;
+}
+
static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
{
/* XXX: inherit EUI-64 from other interface -- yoshfuji */
@@ -1732,6 +1819,8 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
return addrconf_ifid_gre(eui, dev);
case ARPHRD_IEEE802154:
return addrconf_ifid_eui64(eui, dev);
+ case ARPHRD_IEEE1394:
+ return addrconf_ifid_ieee1394(eui, dev);
}
return -1;
}
@@ -2046,11 +2135,19 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
struct inet6_ifaddr *ifp;
struct in6_addr addr;
int create = 0, update_lft = 0;
+ bool tokenized = false;
if (pinfo->prefix_len == 64) {
memcpy(&addr, &pinfo->prefix, 8);
- if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
- ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
+
+ if (!ipv6_addr_any(&in6_dev->token)) {
+ read_lock_bh(&in6_dev->lock);
+ memcpy(addr.s6_addr + 8,
+ in6_dev->token.s6_addr + 8, 8);
+ read_unlock_bh(&in6_dev->lock);
+ tokenized = true;
+ } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
+ ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
in6_dev_put(in6_dev);
return;
}
@@ -2091,6 +2188,7 @@ ok:
update_lft = create = 1;
ifp->cstamp = jiffies;
+ ifp->tokenized = tokenized;
addrconf_dad_start(ifp);
}
@@ -2529,6 +2627,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
static void init_loopback(struct net_device *dev)
{
struct inet6_dev *idev;
+ struct net_device *sp_dev;
+ struct inet6_ifaddr *sp_ifa;
+ struct rt6_info *sp_rt;
/* ::1 */
@@ -2540,6 +2641,30 @@ static void init_loopback(struct net_device *dev)
}
add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
+
+ /* Add routes to other interface's IPv6 addresses */
+ for_each_netdev(dev_net(dev), sp_dev) {
+ if (!strcmp(sp_dev->name, dev->name))
+ continue;
+
+ idev = __in6_dev_get(sp_dev);
+ if (!idev)
+ continue;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(sp_ifa, &idev->addr_list, if_list) {
+
+ if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
+ continue;
+
+ sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0);
+
+ /* Failure cases are ignored */
+ if (!IS_ERR(sp_rt))
+ ip6_ins_rt(sp_rt);
+ }
+ read_unlock_bh(&idev->lock);
+ }
}
static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr)
@@ -2573,7 +2698,8 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_FDDI) &&
(dev->type != ARPHRD_ARCNET) &&
(dev->type != ARPHRD_INFINIBAND) &&
- (dev->type != ARPHRD_IEEE802154)) {
+ (dev->type != ARPHRD_IEEE802154) &&
+ (dev->type != ARPHRD_IEEE1394)) {
/* Alas, we support only Ethernet autoconfiguration. */
return;
}
@@ -3510,7 +3636,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
};
static int
-inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
@@ -3576,7 +3702,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
}
static int
-inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
@@ -3807,6 +3933,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
NLM_F_MULTI);
if (err <= 0)
break;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
break;
}
@@ -3864,6 +3991,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
s_ip_idx = ip_idx = cb->args[2];
rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq;
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &net->dev_index_head[h];
@@ -3915,8 +4043,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
return inet6_dump_addr(skb, cb, type);
}
-static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
- void *arg)
+static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct ifaddrmsg *ifm;
@@ -4049,7 +4176,8 @@ static inline size_t inet6_ifla6_size(void)
+ nla_total_size(sizeof(struct ifla_cacheinfo))
+ nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
+ nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
- + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */
+ + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
+ + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */
}
static inline size_t inet6_if_nlmsg_size(void)
@@ -4136,6 +4264,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
goto nla_put_failure;
snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+ nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
+ if (nla == NULL)
+ goto nla_put_failure;
+ read_lock_bh(&idev->lock);
+ memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla));
+ read_unlock_bh(&idev->lock);
+
return 0;
nla_put_failure:
@@ -4163,6 +4298,80 @@ static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
return 0;
}
+static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
+{
+ struct inet6_ifaddr *ifp;
+ struct net_device *dev = idev->dev;
+ bool update_rs = false;
+
+ if (token == NULL)
+ return -EINVAL;
+ if (ipv6_addr_any(token))
+ return -EINVAL;
+ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
+ return -EINVAL;
+ if (!ipv6_accept_ra(idev))
+ return -EINVAL;
+ if (idev->cnf.rtr_solicits <= 0)
+ return -EINVAL;
+
+ write_lock_bh(&idev->lock);
+
+ BUILD_BUG_ON(sizeof(token->s6_addr) != 16);
+ memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8);
+
+ write_unlock_bh(&idev->lock);
+
+ if (!idev->dead && (idev->if_flags & IF_READY)) {
+ struct in6_addr ll_addr;
+
+ ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
+ IFA_F_OPTIMISTIC);
+
+ /* If we're not ready, then normal ifup will take care
+ * of this. Otherwise, we need to request our rs here.
+ */
+ ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters);
+ update_rs = true;
+ }
+
+ write_lock_bh(&idev->lock);
+
+ if (update_rs)
+ idev->if_flags |= IF_RS_SENT;
+
+ /* Well, that's kinda nasty ... */
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ spin_lock(&ifp->lock);
+ if (ifp->tokenized) {
+ ifp->valid_lft = 0;
+ ifp->prefered_lft = 0;
+ }
+ spin_unlock(&ifp->lock);
+ }
+
+ write_unlock_bh(&idev->lock);
+ return 0;
+}
+
+static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+ int err = -EINVAL;
+ struct inet6_dev *idev = __in6_dev_get(dev);
+ struct nlattr *tb[IFLA_INET6_MAX + 1];
+
+ if (!idev)
+ return -EAFNOSUPPORT;
+
+ if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0)
+ BUG();
+
+ if (tb[IFLA_INET6_TOKEN])
+ err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]));
+
+ return err;
+}
+
static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
u32 portid, u32 seq, int event, unsigned int flags)
{
@@ -4341,6 +4550,8 @@ errout:
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
+ struct net *net = dev_net(ifp->idev->dev);
+
inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);
switch (event) {
@@ -4366,6 +4577,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
dst_free(&ifp->rt->dst);
break;
}
+ atomic_inc(&net->ipv6.dev_addr_genid);
}
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
@@ -4784,26 +4996,20 @@ static void addrconf_sysctl_unregister(struct inet6_dev *idev)
static int __net_init addrconf_init_net(struct net *net)
{
- int err;
+ int err = -ENOMEM;
struct ipv6_devconf *all, *dflt;
- err = -ENOMEM;
- all = &ipv6_devconf;
- dflt = &ipv6_devconf_dflt;
+ all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
+ if (all == NULL)
+ goto err_alloc_all;
- if (!net_eq(net, &init_net)) {
- all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL);
- if (all == NULL)
- goto err_alloc_all;
+ dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
+ if (dflt == NULL)
+ goto err_alloc_dflt;
- dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
- if (dflt == NULL)
- goto err_alloc_dflt;
- } else {
- /* these will be inherited by all namespaces */
- dflt->autoconf = ipv6_defaults.autoconf;
- dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
- }
+ /* these will be inherited by all namespaces */
+ dflt->autoconf = ipv6_defaults.autoconf;
+ dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
net->ipv6.devconf_all = all;
net->ipv6.devconf_dflt = dflt;
@@ -4868,6 +5074,7 @@ static struct rtnl_af_ops inet6_ops = {
.family = AF_INET6,
.fill_link_af = inet6_fill_link_af,
.get_link_af_size = inet6_get_link_af_size,
+ .set_link_af = inet6_set_link_af,
};
/*
@@ -4940,7 +5147,7 @@ int __init addrconf_init(void)
__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
inet6_dump_ifacaddr, NULL);
__rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
- NULL, NULL);
+ inet6_netconf_dump_devconf, NULL);
ipv6_addr_label_rtnl_register();
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index aad64352cb60..f083a583a05c 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -414,8 +414,7 @@ static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
[IFAL_LABEL] = { .len = sizeof(u32), },
};
-static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
- void *arg)
+static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ifaddrlblmsg *ifal;
@@ -436,10 +435,7 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!tb[IFAL_ADDRESS])
return -EINVAL;
-
pfx = nla_data(tb[IFAL_ADDRESS]);
- if (!pfx)
- return -EINVAL;
if (!tb[IFAL_LABEL])
return -EINVAL;
@@ -533,8 +529,7 @@ static inline int ip6addrlbl_msgsize(void)
+ nla_total_size(4); /* IFAL_LABEL */
}
-static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
- void *arg)
+static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(in_skb->sk);
struct ifaddrlblmsg *ifal;
@@ -561,10 +556,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
if (!tb[IFAL_ADDRESS])
return -EINVAL;
-
addr = nla_data(tb[IFAL_ADDRESS]);
- if (!addr)
- return -EINVAL;
rcu_read_lock();
p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6b793bfc0e10..ab5c7ad482cd 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -49,7 +49,6 @@
#include <net/udp.h>
#include <net/udplite.h>
#include <net/tcp.h>
-#include <net/ipip.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <net/route.h>
@@ -323,7 +322,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct net_device *dev = NULL;
rcu_read_lock();
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
addr->sin6_scope_id) {
/* Override any existing binding, if another one
@@ -471,8 +470,8 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin6_port = inet->inet_sport;
}
- if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin->sin6_scope_id = sk->sk_bound_dev_if;
+ sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
+ sk->sk_bound_dev_if);
*uaddr_len = sizeof(*sin);
return 0;
}
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index f5a54782a340..4b56cbbc7890 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -124,7 +124,7 @@ ipv4_connected:
goto out;
}
- if (addr_type&IPV6_ADDR_LINKLOCAL) {
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
usin->sin6_scope_id) {
if (sk->sk_bound_dev_if &&
@@ -355,18 +355,19 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
sin->sin6_port = serr->port;
- sin->sin6_scope_id = 0;
if (skb->protocol == htons(ETH_P_IPV6)) {
const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
struct ipv6hdr, daddr);
sin->sin6_addr = ip6h->daddr;
if (np->sndflow)
sin->sin6_flowinfo = ip6_flowinfo(ip6h);
- if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin->sin6_scope_id = IP6CB(skb)->iif;
+ sin->sin6_scope_id =
+ ipv6_iface_scope_id(&sin->sin6_addr,
+ IP6CB(skb)->iif);
} else {
ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset),
&sin->sin6_addr);
+ sin->sin6_scope_id = 0;
}
}
@@ -376,18 +377,19 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) {
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
- sin->sin6_scope_id = 0;
if (skb->protocol == htons(ETH_P_IPV6)) {
sin->sin6_addr = ipv6_hdr(skb)->saddr;
if (np->rxopt.all)
ip6_datagram_recv_ctl(sk, msg, skb);
- if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin->sin6_scope_id = IP6CB(skb)->iif;
+ sin->sin6_scope_id =
+ ipv6_iface_scope_id(&sin->sin6_addr,
+ IP6CB(skb)->iif);
} else {
struct inet_sock *inet = inet_sk(sk);
ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
&sin->sin6_addr);
+ sin->sin6_scope_id = 0;
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
}
@@ -592,7 +594,9 @@ int ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
sin6.sin6_addr = ipv6_hdr(skb)->daddr;
sin6.sin6_port = ports[1];
sin6.sin6_flowinfo = 0;
- sin6.sin6_scope_id = 0;
+ sin6.sin6_scope_id =
+ ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr,
+ opt->iif);
put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index fff5bdd8b680..71b900c3f4ff 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -434,7 +434,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
* Source addr check
*/
- if (addr_type & IPV6_ADDR_LINKLOCAL)
+ if (__ipv6_addr_needs_scope_id(addr_type))
iif = skb->dev->ifindex;
/*
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 9bfab19ff3c0..e4311cbc8b4e 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -54,6 +54,10 @@ int inet6_csk_bind_conflict(const struct sock *sk,
if (ipv6_rcv_saddr_equal(sk, sk2))
break;
}
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN &&
+ ipv6_rcv_saddr_equal(sk, sk2))
+ break;
}
}
@@ -169,10 +173,8 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
sin6->sin6_port = inet_sk(sk)->inet_dport;
/* We do not store received flowlabel for TCP */
sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (sk->sk_bound_dev_if &&
- ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = sk->sk_bound_dev_if;
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ sk->sk_bound_dev_if);
}
EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b973ed3d06cf..46e88433ec7d 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -144,7 +144,9 @@ static void ip6_fl_gc(unsigned long dummy)
spin_lock(&ip6_fl_lock);
for (i=0; i<=FL_HASH_MASK; i++) {
- struct ip6_flowlabel *fl, **flp;
+ struct ip6_flowlabel *fl;
+ struct ip6_flowlabel __rcu **flp;
+
flp = &fl_ht[i];
while ((fl = rcu_dereference_protected(*flp,
lockdep_is_held(&ip6_fl_lock))) != NULL) {
@@ -179,7 +181,9 @@ static void __net_exit ip6_fl_purge(struct net *net)
spin_lock(&ip6_fl_lock);
for (i = 0; i <= FL_HASH_MASK; i++) {
- struct ip6_flowlabel *fl, **flp;
+ struct ip6_flowlabel *fl;
+ struct ip6_flowlabel __rcu **flp;
+
flp = &fl_ht[i];
while ((fl = rcu_dereference_protected(*flp,
lockdep_is_held(&ip6_fl_lock))) != NULL) {
@@ -506,7 +510,8 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_flowlabel_req freq;
struct ipv6_fl_socklist *sfl1=NULL;
- struct ipv6_fl_socklist *sfl, **sflp;
+ struct ipv6_fl_socklist *sfl;
+ struct ipv6_fl_socklist __rcu **sflp;
struct ip6_flowlabel *fl, *fl1 = NULL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index e4efffe2522e..d3ddd8400354 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -38,6 +38,7 @@
#include <net/sock.h>
#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/addrconf.h>
@@ -110,46 +111,6 @@ static u32 HASH_ADDR(const struct in6_addr *addr)
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
-static struct rtnl_link_stats64 *ip6gre_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-
- tot->multicast = dev->stats.multicast;
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
-
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- return tot;
-}
-
/* Given src, dst and key, find appropriate for input tunnel. */
static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
@@ -667,7 +628,6 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
struct net_device_stats *stats = &tunnel->dev->stats;
int err = -1;
u8 proto;
- int pkt_len;
struct sk_buff *new_skb;
if (dev->type == ARPHRD_ETHER)
@@ -801,23 +761,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
}
}
- nf_reset(skb);
- pkt_len = skb->len;
- err = ip6_local_out(skb);
-
- if (net_xmit_eval(err) == 0) {
- struct pcpu_tstats *tstats = this_cpu_ptr(tunnel->dev->tstats);
-
- tstats->tx_bytes += pkt_len;
- tstats->tx_packets++;
- } else {
- stats->tx_errors++;
- stats->tx_aborted_errors++;
- }
-
+ ip6tunnel_xmit(skb, dev);
if (ndst)
ip6_tnl_dst_store(tunnel, ndst);
-
return 0;
tx_err_link_failure:
stats->tx_carrier_errors++;
@@ -1271,7 +1217,7 @@ static const struct net_device_ops ip6gre_netdev_ops = {
.ndo_start_xmit = ip6gre_tunnel_xmit,
.ndo_do_ioctl = ip6gre_tunnel_ioctl,
.ndo_change_mtu = ip6gre_tunnel_change_mtu,
- .ndo_get_stats64 = ip6gre_get_stats64,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void ip6gre_dev_free(struct net_device *dev)
@@ -1520,7 +1466,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6gre_tunnel_change_mtu,
- .ndo_get_stats64 = ip6gre_get_stats64,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void ip6gre_tap_setup(struct net_device *dev)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index b1876e52091e..2bab2aa59745 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -118,6 +118,18 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
ipv6_addr_loopback(&hdr->daddr))
goto err;
+ /* RFC4291 Errata ID: 3480
+ * Interface-Local scope spans only a single interface on a
+ * node and is useful only for loopback transmission of
+ * multicast. Packets with interface-local scope received
+ * from another node must be discarded.
+ */
+ if (!(skb->pkt_type == PACKET_LOOPBACK ||
+ dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_is_multicast(&hdr->daddr) &&
+ IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
+ goto err;
+
/* RFC4291 2.7
* Nodes must not originate a packet to a multicast address whose scope
* field contains the reserved value 0; if such a packet is received, it
@@ -281,7 +293,8 @@ int ip6_mc_input(struct sk_buff *skb)
* IPv6 multicast router mode is now supported ;)
*/
if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
- !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) &&
+ !(ipv6_addr_type(&hdr->daddr) &
+ (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
/*
* Okay, we try to forward - split and duplicate
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 8234c1dcdf72..71b766ee821d 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -92,14 +92,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
u8 *prevhdr;
int offset = 0;
- if (!(features & NETIF_F_V6_CSUM))
- features &= ~NETIF_F_SG;
-
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
+ SKB_GSO_UDP_TUNNEL |
SKB_GSO_TCPV6 |
0)))
goto out;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index fff83cbc197f..1e55866cead7 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -47,6 +47,7 @@
#include <net/icmp.h>
#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
@@ -955,7 +956,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
unsigned int max_headroom = sizeof(struct ipv6hdr);
u8 proto;
int err = -1;
- int pkt_len;
if (!fl6->flowi6_mark)
dst = ip6_tnl_dst_check(t);
@@ -1035,19 +1035,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
- nf_reset(skb);
- pkt_len = skb->len;
- err = ip6_local_out(skb);
-
- if (net_xmit_eval(err) == 0) {
- struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);
-
- tstats->tx_bytes += pkt_len;
- tstats->tx_packets++;
- } else {
- stats->tx_errors++;
- stats->tx_aborted_errors++;
- }
+ ip6tunnel_xmit(skb, dev);
if (ndst)
ip6_tnl_dst_store(t, ndst);
return 0;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 96bfb4e4b820..241fb8ad9fcf 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -842,9 +842,9 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c)
if (ipv6_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+ ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT;
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else
kfree_skb(skb);
@@ -1100,13 +1100,13 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
if (ipv6_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
- if (__ip6mr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
+ if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE;
+ ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 76ef4353d518..2712ab22a174 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -610,8 +610,6 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
}
}
#endif
- if (!dev->addr_len)
- send_sllao = 0;
if (send_sllao)
optlen += ndisc_opt_addr_space(dev);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 341b54ade72c..8861b1ef420e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -284,6 +284,7 @@ static void trace_packet(const struct sk_buff *skb,
const char *hookname, *chainname, *comment;
const struct ip6t_entry *iter;
unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
table_base = private->entries[smp_processor_id()];
root = get_entry(table_base, private->hook_entry[hook]);
@@ -296,7 +297,7 @@ static void trace_packet(const struct sk_buff *skb,
&chainname, &comment, &rulenum) != 0)
break;
- nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo,
+ nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
"TRACE: %s:%s:%s:%u ",
tablename, chainname, comment, rulenum);
}
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 83acc1405a18..590f767db5d4 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -18,9 +18,8 @@
static int ip6t_npt_checkentry(const struct xt_tgchk_param *par)
{
struct ip6t_npt_tginfo *npt = par->targinfo;
- __wsum src_sum = 0, dst_sum = 0;
struct in6_addr pfx;
- unsigned int i;
+ __wsum src_sum, dst_sum;
if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64)
return -EINVAL;
@@ -33,12 +32,8 @@ static int ip6t_npt_checkentry(const struct xt_tgchk_param *par)
if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6))
return -EINVAL;
- for (i = 0; i < ARRAY_SIZE(npt->src_pfx.in6.s6_addr16); i++) {
- src_sum = csum_add(src_sum,
- (__force __wsum)npt->src_pfx.in6.s6_addr16[i]);
- dst_sum = csum_add(dst_sum,
- (__force __wsum)npt->dst_pfx.in6.s6_addr16[i]);
- }
+ src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0);
+ dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0);
npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum));
return 0;
@@ -57,7 +52,7 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
if (pfx_len - i >= 32)
mask = 0;
else
- mask = htonl(~((1 << (pfx_len - i)) - 1));
+ mask = htonl((1 << (i - pfx_len + 32)) - 1);
idx = i / 32;
addr->s6_addr32[idx] &= mask;
@@ -114,6 +109,7 @@ ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
{
.name = "SNPT",
+ .table = "mangle",
.target = ip6t_snpt_tg,
.targetsize = sizeof(struct ip6t_npt_tginfo),
.checkentry = ip6t_npt_checkentry,
@@ -124,6 +120,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
},
{
.name = "DNPT",
+ .table = "mangle",
.target = ip6t_dnpt_tg,
.targetsize = sizeof(struct ip6t_npt_tginfo),
.checkentry = ip6t_npt_checkentry,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 2b6c226f5198..97bcf2bae857 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -330,12 +330,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
sizeof(sin6.sin6_addr));
nf_ct_put(ct);
-
- if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6.sin6_scope_id = sk->sk_bound_dev_if;
- else
- sin6.sin6_scope_id = 0;
-
+ sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr,
+ sk->sk_bound_dev_if);
return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 24df3dde0076..b3807c5cb888 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -131,7 +131,8 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
type + 128);
nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6))
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL,
+ NULL, NULL,
"nf_ct_icmpv6: invalid new with type %d ",
type + 128);
return false;
@@ -203,7 +204,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
if (icmp6h == NULL) {
if (LOG_INVALID(net, IPPROTO_ICMPV6))
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL,
"nf_ct_icmpv6: short packet ");
return -NF_ACCEPT;
}
@@ -211,7 +212,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
if (LOG_INVALID(net, IPPROTO_ICMPV6))
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL,
"nf_ct_icmpv6: ICMPv6 checksum failed ");
return -NF_ACCEPT;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 54087e96d7b8..dffdc1a389c5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -14,6 +14,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv6-nf: " fmt
+
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -39,6 +41,7 @@
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
+#include <net/inet_ecn.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
@@ -136,6 +139,11 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
}
#endif
+static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
+{
+ return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+}
+
static unsigned int nf_hashfn(struct inet_frag_queue *q)
{
const struct frag_queue *nq;
@@ -164,7 +172,7 @@ static void nf_ct_frag6_expire(unsigned long data)
/* Creation primitives. */
static inline struct frag_queue *fq_find(struct net *net, __be32 id,
u32 user, struct in6_addr *src,
- struct in6_addr *dst)
+ struct in6_addr *dst, u8 ecn)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -174,19 +182,18 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
arg.user = user;
arg.src = src;
arg.dst = dst;
+ arg.ecn = ecn;
read_lock_bh(&nf_frags.lock);
hash = inet6_hash_frag(id, src, dst, nf_frags.rnd);
q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
local_bh_enable();
- if (q == NULL)
- goto oom;
-
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
return container_of(q, struct frag_queue, q);
-
-oom:
- return NULL;
}
@@ -196,6 +203,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
struct sk_buff *prev, *next;
unsigned int payload_len;
int offset, end;
+ u8 ecn;
if (fq->q.last_in & INET_FRAG_COMPLETE) {
pr_debug("Already completed\n");
@@ -213,6 +221,8 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
return -1;
}
+ ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
if (skb->ip_summed == CHECKSUM_COMPLETE) {
const unsigned char *nh = skb_network_header(skb);
skb->csum = csum_sub(skb->csum,
@@ -317,6 +327,7 @@ found:
}
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
+ fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
fq->q.max_size = payload_len;
add_frag_mem_limit(&fq->q, skb->truesize);
@@ -352,12 +363,17 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
{
struct sk_buff *fp, *op, *head = fq->q.fragments;
int payload_len;
+ u8 ecn;
inet_frag_kill(&fq->q, &nf_frags);
WARN_ON(head == NULL);
WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
+ ecn = ip_frag_ecn_table[fq->ecn];
+ if (unlikely(ecn == 0xff))
+ goto out_fail;
+
/* Unfragmented part is taken from the first segment. */
payload_len = ((head->data - skb_network_header(head)) -
sizeof(struct ipv6hdr) + fq->q.len -
@@ -428,6 +444,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
head->dev = dev;
head->tstamp = fq->q.stamp;
ipv6_hdr(head)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
/* Yes, and fold redundant checksum back. 8) */
@@ -572,7 +589,8 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
local_bh_enable();
- fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
+ fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+ ip6_frag_ecn(hdr));
if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n");
goto ret_orig;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 330b5e7b7df6..eedff8ccded5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -263,7 +263,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr_type != IPV6_ADDR_ANY) {
struct net_device *dev = NULL;
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
addr->sin6_scope_id) {
/* Override any existing binding, if another
@@ -498,9 +498,8 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
sin6->sin6_port = 0;
sin6->sin6_addr = ipv6_hdr(skb)->saddr;
sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = IP6CB(skb)->iif;
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ IP6CB(skb)->iif);
}
sock_recv_ts_and_drops(msg, sk, skb);
@@ -802,7 +801,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
- ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
+ __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
fl6.flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3c6a77290c6e..e6e44cef8db2 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -26,6 +26,9 @@
* YOSHIFUJI,H. @USAGI Always remove fragment header to
* calculate ICV correctly.
*/
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -55,6 +58,7 @@
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
struct ip6frag_skb_cb
{
@@ -64,6 +68,10 @@ struct ip6frag_skb_cb
#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb))
+static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
+{
+ return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+}
static struct inet_frags ip6_frags;
@@ -116,6 +124,7 @@ void ip6_frag_init(struct inet_frag_queue *q, void *a)
fq->user = arg->user;
fq->saddr = *arg->src;
fq->daddr = *arg->dst;
+ fq->ecn = arg->ecn;
}
EXPORT_SYMBOL(ip6_frag_init);
@@ -170,7 +179,8 @@ static void ip6_frag_expire(unsigned long data)
}
static __inline__ struct frag_queue *
-fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6_addr *dst)
+fq_find(struct net *net, __be32 id, const struct in6_addr *src,
+ const struct in6_addr *dst, u8 ecn)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -180,14 +190,16 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6
arg.user = IP6_DEFRAG_LOCAL_DELIVER;
arg.src = src;
arg.dst = dst;
+ arg.ecn = ecn;
read_lock(&ip6_frags.lock);
hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd);
q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
- if (q == NULL)
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL;
-
+ }
return container_of(q, struct frag_queue, q);
}
@@ -198,6 +210,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
struct net_device *dev;
int offset, end;
struct net *net = dev_net(skb_dst(skb)->dev);
+ u8 ecn;
if (fq->q.last_in & INET_FRAG_COMPLETE)
goto err;
@@ -215,6 +228,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
return -1;
}
+ ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
if (skb->ip_summed == CHECKSUM_COMPLETE) {
const unsigned char *nh = skb_network_header(skb);
skb->csum = csum_sub(skb->csum,
@@ -315,6 +330,7 @@ found:
}
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
+ fq->ecn |= ecn;
add_frag_mem_limit(&fq->q, skb->truesize);
/* The first fragment.
@@ -358,9 +374,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
int payload_len;
unsigned int nhoff;
int sum_truesize;
+ u8 ecn;
inet_frag_kill(&fq->q, &ip6_frags);
+ ecn = ip_frag_ecn_table[fq->ecn];
+ if (unlikely(ecn == 0xff))
+ goto out_fail;
+
/* Make the one we just received the head. */
if (prev) {
head = prev->next;
@@ -459,6 +480,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
head->dev = dev;
head->tstamp = fq->q.stamp;
ipv6_hdr(head)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
IP6CB(head)->nhoff = nhoff;
/* Yes, and fold redundant checksum back. 8) */
@@ -522,7 +544,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASMFAILS, evicted);
- fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr);
+ fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
+ ip6_frag_ecn(hdr));
if (fq != NULL) {
int ret;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e5fe0041adfa..ad0aa6b0b86a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2355,7 +2355,7 @@ beginning:
return last_err;
}
-static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct fib6_config cfg;
int err;
@@ -2370,7 +2370,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
return ip6_route_del(&cfg);
}
-static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct fib6_config cfg;
int err;
@@ -2562,7 +2562,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
prefix, 0, NLM_F_MULTI);
}
-static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[RTA_MAX+1];
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 02f96dcbcf02..335363478bbf 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -49,7 +49,7 @@
#include <net/ip.h>
#include <net/udp.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
@@ -87,41 +87,6 @@ struct sit_net {
struct net_device *fb_tunnel_dev;
};
-static struct rtnl_link_stats64 *ipip6_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
-{
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin_bh(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
-
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
- }
-
- tot->rx_errors = dev->stats.rx_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
-
- return tot;
-}
-
/*
* Must be invoked with rcu_read_lock
*/
@@ -899,6 +864,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
if ((iph->ttl = tiph->ttl) == 0)
iph->ttl = iph6->hop_limit;
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_select_ident(iph, skb_dst(skb), NULL);
iptunnel_xmit(skb, dev);
return NETDEV_TX_OK;
@@ -1200,7 +1167,7 @@ static const struct net_device_ops ipip6_netdev_ops = {
.ndo_start_xmit = ipip6_tunnel_xmit,
.ndo_do_ioctl = ipip6_tunnel_ioctl,
.ndo_change_mtu = ipip6_tunnel_change_mtu,
- .ndo_get_stats64= ipip6_get_stats64,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void ipip6_dev_free(struct net_device *dev)
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 8a0848b60b35..d5dda20bd717 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -149,7 +149,6 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie)
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
{
struct tcp_options_received tcp_opt;
- const u8 *hash_location;
struct inet_request_sock *ireq;
struct inet6_request_sock *ireq6;
struct tcp_request_sock *treq;
@@ -177,7 +176,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
goto out;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9b6460055df5..e51bd1a58264 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -386,9 +386,17 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (dst)
dst->ops->redirect(dst, sk, skb);
+ goto out;
}
if (type == ICMPV6_PKT_TOOBIG) {
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
tp->mtu_info = ntohl(info);
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
@@ -454,7 +462,6 @@ out:
static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6,
struct request_sock *req,
- struct request_values *rvp,
u16 queue_mapping)
{
struct inet6_request_sock *treq = inet6_rsk(req);
@@ -466,7 +473,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, rvp, NULL);
+ skb = tcp_make_synack(sk, dst, req, NULL);
if (skb) {
__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
@@ -481,13 +488,12 @@ done:
return err;
}
-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
- struct request_values *rvp)
+static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
{
struct flowi6 fl6;
int res;
- res = tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0);
+ res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0);
if (!res)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
return res;
@@ -940,9 +946,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
*/
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
- const u8 *hash_location;
struct request_sock *req;
struct inet6_request_sock *treq;
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -980,50 +984,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
-
- if (tmp_opt.cookie_plus > 0 &&
- tmp_opt.saw_tstamp &&
- !tp->rx_opt.cookie_out_never &&
- (sysctl_tcp_cookie_size > 0 ||
- (tp->cookie_values != NULL &&
- tp->cookie_values->cookie_desired > 0))) {
- u8 *c;
- u32 *d;
- u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
- int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
-
- if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
- goto drop_and_free;
-
- /* Secret recipe starts with IP addresses */
- d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0];
- *mess++ ^= *d++;
- *mess++ ^= *d++;
- *mess++ ^= *d++;
- *mess++ ^= *d++;
- d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0];
- *mess++ ^= *d++;
- *mess++ ^= *d++;
- *mess++ ^= *d++;
- *mess++ ^= *d++;
-
- /* plus variable length Initiator Cookie */
- c = (u8 *)mess;
- while (l-- > 0)
- *c++ ^= *hash_location++;
-
- want_cookie = false; /* not our kind of cookie */
- tmp_ext.cookie_out_never = 0; /* false */
- tmp_ext.cookie_plus = tmp_opt.cookie_plus;
- } else if (!tp->rx_opt.cookie_in_always) {
- /* redundant indications, but ensure initialization. */
- tmp_ext.cookie_out_never = 1; /* true */
- tmp_ext.cookie_plus = 0;
- } else {
- goto drop_and_free;
- }
- tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
@@ -1101,7 +1062,6 @@ have_isn:
goto drop_and_release;
if (tcp_v6_send_synack(sk, dst, &fl6, req,
- (struct request_values *)&tmp_ext,
skb_get_queue_mapping(skb)) ||
want_cookie)
goto drop_and_free;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 599e1ba6d1ce..da6019b63730 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -450,15 +450,16 @@ try_again:
sin6->sin6_family = AF_INET6;
sin6->sin6_port = udp_hdr(skb)->source;
sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- if (is_udp4)
+ if (is_udp4) {
ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
&sin6->sin6_addr);
- else {
+ sin6->sin6_scope_id = 0;
+ } else {
sin6->sin6_addr = ipv6_hdr(skb)->saddr;
- if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- sin6->sin6_scope_id = IP6CB(skb)->iif;
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ IP6CB(skb)->iif);
}
}
@@ -1118,7 +1119,7 @@ do_udp_sendmsg:
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
- ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
+ __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
fl6.flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
@@ -1285,10 +1286,18 @@ do_confirm:
void udpv6_destroy_sock(struct sock *sk)
{
+ struct udp_sock *up = udp_sk(sk);
lock_sock(sk);
udp_v6_flush_pending_frames(sk);
release_sock(sk);
+ if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+
inet6_destroy_sock(sk);
}
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index cf05cf073c51..3bb3a891a424 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -21,6 +21,10 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
const struct ipv6hdr *ipv6h;
struct udphdr *uh;
+ /* UDP Tunnel offload on ipv6 is not yet supported. */
+ if (skb->encapsulation)
+ return -EINVAL;
+
if (!pskb_may_pull(skb, sizeof(*uh)))
return -EINVAL;
@@ -56,7 +60,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ if (unlikely(type & ~(SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index d07e3a626446..0578d4fa00a9 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -305,8 +305,7 @@ static void irda_connect_response(struct irda_sock *self)
IRDA_DEBUG(2, "%s()\n", __func__);
- skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER,
- GFP_ATOMIC);
+ skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, GFP_KERNEL);
if (skb == NULL) {
IRDA_DEBUG(0, "%s() Unable to allocate sk_buff!\n",
__func__);
@@ -1120,7 +1119,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
}
/* Allocate networking socket */
- sk = sk_alloc(net, PF_IRDA, GFP_ATOMIC, &irda_proto);
+ sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto);
if (sk == NULL)
return -ENOMEM;
@@ -1386,6 +1385,8 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
IRDA_DEBUG(4, "%s()\n", __func__);
+ msg->msg_namelen = 0;
+
skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
flags & MSG_DONTWAIT, &err);
if (!skb)
@@ -2583,8 +2584,10 @@ bed:
NULL, NULL, NULL);
/* Check if the we got some results */
- if (!self->cachedaddr)
- return -EAGAIN; /* Didn't find any devices */
+ if (!self->cachedaddr) {
+ err = -EAGAIN; /* Didn't find any devices */
+ goto out;
+ }
daddr = self->cachedaddr;
/* Cleanup */
self->cachedaddr = 0;
diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c
index 52079f19bbbe..b797daac063c 100644
--- a/net/irda/ircomm/ircomm_core.c
+++ b/net/irda/ircomm/ircomm_core.c
@@ -117,7 +117,7 @@ struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line)
IRDA_ASSERT(ircomm != NULL, return NULL;);
- self = kzalloc(sizeof(struct ircomm_cb), GFP_ATOMIC);
+ self = kzalloc(sizeof(struct ircomm_cb), GFP_KERNEL);
if (self == NULL)
return NULL;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a7d11ffe4284..e165e8dc962e 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1328,6 +1328,8 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb, *rskb, *cskb;
int err = 0;
+ msg->msg_namelen = 0;
+
if ((sk->sk_state == IUCV_DISCONN) &&
skb_queue_empty(&iucv->backlog_skb_q) &&
skb_queue_empty(&sk->sk_receive_queue) &&
@@ -1461,7 +1463,8 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
return iucv_accept_poll(sk);
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 556fdafdd1ea..5b1e5af25713 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2201,7 +2201,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_
XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
xp->priority = pol->sadb_x_policy_priority;
- sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr);
if (!xp->family) {
err = -EINVAL;
@@ -2214,7 +2214,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_
if (xp->selector.sport)
xp->selector.sport_mask = htons(0xffff);
- sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1],
+ sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr);
xp->selector.prefixlen_d = sa->sadb_address_prefixlen;
@@ -2315,7 +2315,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
memset(&sel, 0, sizeof(sel));
- sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
sel.prefixlen_s = sa->sadb_address_prefixlen;
sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
@@ -2323,7 +2323,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
if (sel.sport)
sel.sport_mask = htons(0xffff);
- sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1],
+ sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
sel.prefixlen_d = sa->sadb_address_prefixlen;
sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
@@ -2693,6 +2693,7 @@ static int key_notify_policy_flush(const struct km_event *c)
hdr->sadb_msg_pid = c->portid;
hdr->sadb_msg_version = PF_KEY_V2;
hdr->sadb_msg_errno = (uint8_t) 0;
+ hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
return 0;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index d36875f3427e..6984c3a353cd 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -114,7 +114,6 @@ struct l2tp_net {
static void l2tp_session_set_header_len(struct l2tp_session *session, int version);
static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
-static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
static inline struct l2tp_net *l2tp_pernet(struct net *net)
{
@@ -192,6 +191,7 @@ struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel)
} else {
/* Socket is owned by kernelspace */
sk = tunnel->sock;
+ sock_hold(sk);
}
out:
@@ -210,6 +210,7 @@ void l2tp_tunnel_sock_put(struct sock *sk)
}
sock_put(sk);
}
+ sock_put(sk);
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_sock_put);
@@ -373,10 +374,8 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk
struct sk_buff *skbp;
struct sk_buff *tmp;
u32 ns = L2TP_SKB_CB(skb)->ns;
- struct l2tp_stats *sstats;
spin_lock_bh(&session->reorder_q.lock);
- sstats = &session->stats;
skb_queue_walk_safe(&session->reorder_q, skbp, tmp) {
if (L2TP_SKB_CB(skbp)->ns > ns) {
__skb_queue_before(&session->reorder_q, skbp, skb);
@@ -384,9 +383,7 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk
"%s: pkt %hu, inserted before %hu, reorder_q len=%d\n",
session->name, ns, L2TP_SKB_CB(skbp)->ns,
skb_queue_len(&session->reorder_q));
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_oos_packets++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_oos_packets);
goto out;
}
}
@@ -403,23 +400,16 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *
{
struct l2tp_tunnel *tunnel = session->tunnel;
int length = L2TP_SKB_CB(skb)->length;
- struct l2tp_stats *tstats, *sstats;
/* We're about to requeue the skb, so return resources
* to its current owner (a socket receive buffer).
*/
skb_orphan(skb);
- tstats = &tunnel->stats;
- u64_stats_update_begin(&tstats->syncp);
- sstats = &session->stats;
- u64_stats_update_begin(&sstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += length;
- sstats->rx_packets++;
- sstats->rx_bytes += length;
- u64_stats_update_end(&tstats->syncp);
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&tunnel->stats.rx_packets);
+ atomic_long_add(length, &tunnel->stats.rx_bytes);
+ atomic_long_inc(&session->stats.rx_packets);
+ atomic_long_add(length, &session->stats.rx_bytes);
if (L2TP_SKB_CB(skb)->has_seq) {
/* Bump our Nr */
@@ -450,7 +440,6 @@ static void l2tp_recv_dequeue(struct l2tp_session *session)
{
struct sk_buff *skb;
struct sk_buff *tmp;
- struct l2tp_stats *sstats;
/* If the pkt at the head of the queue has the nr that we
* expect to send up next, dequeue it and any other
@@ -458,13 +447,10 @@ static void l2tp_recv_dequeue(struct l2tp_session *session)
*/
start:
spin_lock_bh(&session->reorder_q.lock);
- sstats = &session->stats;
skb_queue_walk_safe(&session->reorder_q, skb, tmp) {
if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) {
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_seq_discards++;
- sstats->rx_errors++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_seq_discards);
+ atomic_long_inc(&session->stats.rx_errors);
l2tp_dbg(session, L2TP_MSG_SEQ,
"%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n",
session->name, L2TP_SKB_CB(skb)->ns,
@@ -623,7 +609,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
struct l2tp_tunnel *tunnel = session->tunnel;
int offset;
u32 ns, nr;
- struct l2tp_stats *sstats = &session->stats;
/* The ref count is increased since we now hold a pointer to
* the session. Take care to decrement the refcnt when exiting
@@ -640,9 +625,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
"%s: cookie mismatch (%u/%u). Discarding.\n",
tunnel->name, tunnel->tunnel_id,
session->session_id);
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_cookie_discards++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_cookie_discards);
goto discard;
}
ptr += session->peer_cookie_len;
@@ -711,9 +694,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
l2tp_warn(session, L2TP_MSG_SEQ,
"%s: recv data has no seq numbers when required. Discarding.\n",
session->name);
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_seq_discards++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -732,9 +713,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
l2tp_warn(session, L2TP_MSG_SEQ,
"%s: recv data has no seq numbers when required. Discarding.\n",
session->name);
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_seq_discards++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
}
@@ -788,9 +767,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* packets
*/
if (L2TP_SKB_CB(skb)->ns != session->nr) {
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_seq_discards++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_seq_discards);
l2tp_dbg(session, L2TP_MSG_SEQ,
"%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n",
session->name, L2TP_SKB_CB(skb)->ns,
@@ -816,9 +793,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
return;
discard:
- u64_stats_update_begin(&sstats->syncp);
- sstats->rx_errors++;
- u64_stats_update_end(&sstats->syncp);
+ atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
if (session->deref)
@@ -828,6 +803,23 @@ discard:
}
EXPORT_SYMBOL(l2tp_recv_common);
+/* Drop skbs from the session's reorder_q
+ */
+int l2tp_session_queue_purge(struct l2tp_session *session)
+{
+ struct sk_buff *skb = NULL;
+ BUG_ON(!session);
+ BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+ while ((skb = skb_dequeue(&session->reorder_q))) {
+ atomic_long_inc(&session->stats.rx_errors);
+ kfree_skb(skb);
+ if (session->deref)
+ (*session->deref)(session);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
+
/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
* here. The skb is not on a list when we get here.
* Returns 0 if the packet was a data packet and was successfully passed on.
@@ -843,7 +835,6 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
u32 tunnel_id, session_id;
u16 version;
int length;
- struct l2tp_stats *tstats;
if (tunnel->sock && l2tp_verify_udp_checksum(tunnel->sock, skb))
goto discard_bad_csum;
@@ -932,10 +923,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
discard_bad_csum:
LIMIT_NETDEBUG("%s: UDP: bad checksum\n", tunnel->name);
UDP_INC_STATS_USER(tunnel->l2tp_net, UDP_MIB_INERRORS, 0);
- tstats = &tunnel->stats;
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_errors++;
- u64_stats_update_end(&tstats->syncp);
+ atomic_long_inc(&tunnel->stats.rx_errors);
kfree_skb(skb);
return 0;
@@ -1062,7 +1050,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
struct l2tp_tunnel *tunnel = session->tunnel;
unsigned int len = skb->len;
int error;
- struct l2tp_stats *tstats, *sstats;
/* Debug */
if (session->send_seq)
@@ -1091,21 +1078,15 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
error = ip_queue_xmit(skb, fl);
/* Update stats */
- tstats = &tunnel->stats;
- u64_stats_update_begin(&tstats->syncp);
- sstats = &session->stats;
- u64_stats_update_begin(&sstats->syncp);
if (error >= 0) {
- tstats->tx_packets++;
- tstats->tx_bytes += len;
- sstats->tx_packets++;
- sstats->tx_bytes += len;
+ atomic_long_inc(&tunnel->stats.tx_packets);
+ atomic_long_add(len, &tunnel->stats.tx_bytes);
+ atomic_long_inc(&session->stats.tx_packets);
+ atomic_long_add(len, &session->stats.tx_bytes);
} else {
- tstats->tx_errors++;
- sstats->tx_errors++;
+ atomic_long_inc(&tunnel->stats.tx_errors);
+ atomic_long_inc(&session->stats.tx_errors);
}
- u64_stats_update_end(&tstats->syncp);
- u64_stats_update_end(&sstats->syncp);
return 0;
}
@@ -1282,6 +1263,7 @@ static void l2tp_tunnel_destruct(struct sock *sk)
/* No longer an encapsulation socket. See net/ipv4/udp.c */
(udp_sk(sk))->encap_type = 0;
(udp_sk(sk))->encap_rcv = NULL;
+ (udp_sk(sk))->encap_destroy = NULL;
break;
case L2TP_ENCAPTYPE_IP:
break;
@@ -1311,7 +1293,7 @@ end:
/* When the tunnel is closed, all the attached sessions need to go too.
*/
-static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
+void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
{
int hash;
struct hlist_node *walk;
@@ -1334,25 +1316,13 @@ again:
hlist_del_init(&session->hlist);
- /* Since we should hold the sock lock while
- * doing any unbinding, we need to release the
- * lock we're holding before taking that lock.
- * Hold a reference to the sock so it doesn't
- * disappear as we're jumping between locks.
- */
if (session->ref != NULL)
(*session->ref)(session);
write_unlock_bh(&tunnel->hlist_lock);
- if (tunnel->version != L2TP_HDR_VER_2) {
- struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
-
- spin_lock_bh(&pn->l2tp_session_hlist_lock);
- hlist_del_init_rcu(&session->global_hlist);
- spin_unlock_bh(&pn->l2tp_session_hlist_lock);
- synchronize_rcu();
- }
+ __l2tp_session_unhash(session);
+ l2tp_session_queue_purge(session);
if (session->session_close != NULL)
(*session->session_close)(session);
@@ -1360,6 +1330,8 @@ again:
if (session->deref != NULL)
(*session->deref)(session);
+ l2tp_session_dec_refcount(session);
+
write_lock_bh(&tunnel->hlist_lock);
/* Now restart from the beginning of this hash
@@ -1372,6 +1344,17 @@ again:
}
write_unlock_bh(&tunnel->hlist_lock);
}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
+
+/* Tunnel socket destroy hook for UDP encapsulation */
+static void l2tp_udp_encap_destroy(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+ if (tunnel) {
+ l2tp_tunnel_closeall(tunnel);
+ sock_put(sk);
+ }
+}
/* Really kill the tunnel.
* Come here only when all sessions have been cleared from the tunnel.
@@ -1397,19 +1380,21 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
return;
sock = sk->sk_socket;
- BUG_ON(!sock);
- /* If the tunnel socket was created directly by the kernel, use the
- * sk_* API to release the socket now. Otherwise go through the
- * inet_* layer to shut the socket down, and let userspace close it.
+ /* If the tunnel socket was created by userspace, then go through the
+ * inet layer to shut the socket down, and let userspace close it.
+ * Otherwise, if we created the socket directly within the kernel, use
+ * the sk API to release it here.
* In either case the tunnel resources are freed in the socket
* destructor when the tunnel socket goes away.
*/
- if (sock->file == NULL) {
- kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sk);
+ if (tunnel->fd >= 0) {
+ if (sock)
+ inet_shutdown(sock, 2);
} else {
- inet_shutdown(sock, 2);
+ if (sock)
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sk);
}
l2tp_tunnel_sock_put(sk);
@@ -1668,6 +1653,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv;
+ udp_sk(sk)->encap_destroy = l2tp_udp_encap_destroy;
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == PF_INET6)
udpv6_encap_enable();
@@ -1723,6 +1709,7 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
*/
int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
{
+ l2tp_tunnel_closeall(tunnel);
return (false == queue_work(l2tp_wq, &tunnel->del_work));
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
@@ -1731,62 +1718,71 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
*/
void l2tp_session_free(struct l2tp_session *session)
{
- struct l2tp_tunnel *tunnel;
+ struct l2tp_tunnel *tunnel = session->tunnel;
BUG_ON(atomic_read(&session->ref_count) != 0);
- tunnel = session->tunnel;
- if (tunnel != NULL) {
+ if (tunnel) {
BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+ if (session->session_id != 0)
+ atomic_dec(&l2tp_session_count);
+ sock_put(tunnel->sock);
+ session->tunnel = NULL;
+ l2tp_tunnel_dec_refcount(tunnel);
+ }
+
+ kfree(session);
- /* Delete the session from the hash */
+ return;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_free);
+
+/* Remove an l2tp session from l2tp_core's hash lists.
+ * Provides a tidyup interface for pseudowire code which can't just route all
+ * shutdown via. l2tp_session_delete and a pseudowire-specific session_close
+ * callback.
+ */
+void __l2tp_session_unhash(struct l2tp_session *session)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+
+ /* Remove the session from core hashes */
+ if (tunnel) {
+ /* Remove from the per-tunnel hash */
write_lock_bh(&tunnel->hlist_lock);
hlist_del_init(&session->hlist);
write_unlock_bh(&tunnel->hlist_lock);
- /* Unlink from the global hash if not L2TPv2 */
+ /* For L2TPv3 we have a per-net hash: remove from there, too */
if (tunnel->version != L2TP_HDR_VER_2) {
struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
-
spin_lock_bh(&pn->l2tp_session_hlist_lock);
hlist_del_init_rcu(&session->global_hlist);
spin_unlock_bh(&pn->l2tp_session_hlist_lock);
synchronize_rcu();
}
-
- if (session->session_id != 0)
- atomic_dec(&l2tp_session_count);
-
- sock_put(tunnel->sock);
-
- /* This will delete the tunnel context if this
- * is the last session on the tunnel.
- */
- session->tunnel = NULL;
- l2tp_tunnel_dec_refcount(tunnel);
}
-
- kfree(session);
-
- return;
}
-EXPORT_SYMBOL_GPL(l2tp_session_free);
+EXPORT_SYMBOL_GPL(__l2tp_session_unhash);
/* This function is used by the netlink SESSION_DELETE command and by
pseudowire modules.
*/
int l2tp_session_delete(struct l2tp_session *session)
{
+ if (session->ref)
+ (*session->ref)(session);
+ __l2tp_session_unhash(session);
+ l2tp_session_queue_purge(session);
if (session->session_close != NULL)
(*session->session_close)(session);
-
+ if (session->deref)
+ (*session->deref)(session);
l2tp_session_dec_refcount(session);
-
return 0;
}
EXPORT_SYMBOL_GPL(l2tp_session_delete);
-
/* We come here whenever a session's send_seq, cookie_len or
* l2specific_len parameters are set.
*/
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 8eb8f1d47f3a..485a490fd990 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -36,16 +36,15 @@ enum {
struct sk_buff;
struct l2tp_stats {
- u64 tx_packets;
- u64 tx_bytes;
- u64 tx_errors;
- u64 rx_packets;
- u64 rx_bytes;
- u64 rx_seq_discards;
- u64 rx_oos_packets;
- u64 rx_errors;
- u64 rx_cookie_discards;
- struct u64_stats_sync syncp;
+ atomic_long_t tx_packets;
+ atomic_long_t tx_bytes;
+ atomic_long_t tx_errors;
+ atomic_long_t rx_packets;
+ atomic_long_t rx_bytes;
+ atomic_long_t rx_seq_discards;
+ atomic_long_t rx_oos_packets;
+ atomic_long_t rx_errors;
+ atomic_long_t rx_cookie_discards;
};
struct l2tp_tunnel;
@@ -240,11 +239,14 @@ extern struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
extern struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp);
+extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg);
+extern void __l2tp_session_unhash(struct l2tp_session *session);
extern int l2tp_session_delete(struct l2tp_session *session);
extern void l2tp_session_free(struct l2tp_session *session);
extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb));
+extern int l2tp_session_queue_purge(struct l2tp_session *session);
extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len);
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index c3813bc84552..072d7202e182 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -146,14 +146,14 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
tunnel->sock ? atomic_read(&tunnel->sock->sk_refcnt) : 0,
atomic_read(&tunnel->ref_count));
- seq_printf(m, " %08x rx %llu/%llu/%llu rx %llu/%llu/%llu\n",
+ seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n",
tunnel->debug,
- (unsigned long long)tunnel->stats.tx_packets,
- (unsigned long long)tunnel->stats.tx_bytes,
- (unsigned long long)tunnel->stats.tx_errors,
- (unsigned long long)tunnel->stats.rx_packets,
- (unsigned long long)tunnel->stats.rx_bytes,
- (unsigned long long)tunnel->stats.rx_errors);
+ atomic_long_read(&tunnel->stats.tx_packets),
+ atomic_long_read(&tunnel->stats.tx_bytes),
+ atomic_long_read(&tunnel->stats.tx_errors),
+ atomic_long_read(&tunnel->stats.rx_packets),
+ atomic_long_read(&tunnel->stats.rx_bytes),
+ atomic_long_read(&tunnel->stats.rx_errors));
if (tunnel->show != NULL)
tunnel->show(m, tunnel);
@@ -203,14 +203,14 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
seq_printf(m, "\n");
}
- seq_printf(m, " %hu/%hu tx %llu/%llu/%llu rx %llu/%llu/%llu\n",
+ seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
session->nr, session->ns,
- (unsigned long long)session->stats.tx_packets,
- (unsigned long long)session->stats.tx_bytes,
- (unsigned long long)session->stats.tx_errors,
- (unsigned long long)session->stats.rx_packets,
- (unsigned long long)session->stats.rx_bytes,
- (unsigned long long)session->stats.rx_errors);
+ atomic_long_read(&session->stats.tx_packets),
+ atomic_long_read(&session->stats.tx_bytes),
+ atomic_long_read(&session->stats.tx_errors),
+ atomic_long_read(&session->stats.rx_packets),
+ atomic_long_read(&session->stats.rx_bytes),
+ atomic_long_read(&session->stats.rx_errors));
if (session->show != NULL)
session->show(m, session);
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 7f41b7051269..571db8dd2292 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -228,10 +228,16 @@ static void l2tp_ip_close(struct sock *sk, long timeout)
static void l2tp_ip_destroy_sock(struct sock *sk)
{
struct sk_buff *skb;
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
kfree_skb(skb);
+ if (tunnel) {
+ l2tp_tunnel_closeall(tunnel);
+ sock_put(sk);
+ }
+
sk_refcnt_debug_dec(sk);
}
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 41f2f8126ebc..b8a6039314e8 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -241,10 +241,17 @@ static void l2tp_ip6_close(struct sock *sk, long timeout)
static void l2tp_ip6_destroy_sock(struct sock *sk)
{
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+
lock_sock(sk);
ip6_flush_pending_frames(sk);
release_sock(sk);
+ if (tunnel) {
+ l2tp_tunnel_closeall(tunnel);
+ sock_put(sk);
+ }
+
inet6_destroy_sock(sk);
}
@@ -683,6 +690,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
lsa->l2tp_addr = ipv6_hdr(skb)->saddr;
lsa->l2tp_flowinfo = 0;
lsa->l2tp_scope_id = 0;
+ lsa->l2tp_conn_id = 0;
if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
lsa->l2tp_scope_id = IP6CB(skb)->iif;
}
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index c1bab22db85e..0825ff26e113 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -246,8 +246,6 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
#if IS_ENABLED(CONFIG_IPV6)
struct ipv6_pinfo *np = NULL;
#endif
- struct l2tp_stats stats;
- unsigned int start;
hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags,
L2TP_CMD_TUNNEL_GET);
@@ -265,28 +263,22 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
if (nest == NULL)
goto nla_put_failure;
- do {
- start = u64_stats_fetch_begin(&tunnel->stats.syncp);
- stats.tx_packets = tunnel->stats.tx_packets;
- stats.tx_bytes = tunnel->stats.tx_bytes;
- stats.tx_errors = tunnel->stats.tx_errors;
- stats.rx_packets = tunnel->stats.rx_packets;
- stats.rx_bytes = tunnel->stats.rx_bytes;
- stats.rx_errors = tunnel->stats.rx_errors;
- stats.rx_seq_discards = tunnel->stats.rx_seq_discards;
- stats.rx_oos_packets = tunnel->stats.rx_oos_packets;
- } while (u64_stats_fetch_retry(&tunnel->stats.syncp, start));
-
- if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, stats.tx_packets) ||
- nla_put_u64(skb, L2TP_ATTR_TX_BYTES, stats.tx_bytes) ||
- nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, stats.tx_errors) ||
- nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, stats.rx_packets) ||
- nla_put_u64(skb, L2TP_ATTR_RX_BYTES, stats.rx_bytes) ||
+ if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS,
+ atomic_long_read(&tunnel->stats.tx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_BYTES,
+ atomic_long_read(&tunnel->stats.tx_bytes)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_ERRORS,
+ atomic_long_read(&tunnel->stats.tx_errors)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_PACKETS,
+ atomic_long_read(&tunnel->stats.rx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_BYTES,
+ atomic_long_read(&tunnel->stats.rx_bytes)) ||
nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
- stats.rx_seq_discards) ||
+ atomic_long_read(&tunnel->stats.rx_seq_discards)) ||
nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS,
- stats.rx_oos_packets) ||
- nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, stats.rx_errors))
+ atomic_long_read(&tunnel->stats.rx_oos_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_ERRORS,
+ atomic_long_read(&tunnel->stats.rx_errors)))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -612,8 +604,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
struct nlattr *nest;
struct l2tp_tunnel *tunnel = session->tunnel;
struct sock *sk = NULL;
- struct l2tp_stats stats;
- unsigned int start;
sk = tunnel->sock;
@@ -656,28 +646,22 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
if (nest == NULL)
goto nla_put_failure;
- do {
- start = u64_stats_fetch_begin(&session->stats.syncp);
- stats.tx_packets = session->stats.tx_packets;
- stats.tx_bytes = session->stats.tx_bytes;
- stats.tx_errors = session->stats.tx_errors;
- stats.rx_packets = session->stats.rx_packets;
- stats.rx_bytes = session->stats.rx_bytes;
- stats.rx_errors = session->stats.rx_errors;
- stats.rx_seq_discards = session->stats.rx_seq_discards;
- stats.rx_oos_packets = session->stats.rx_oos_packets;
- } while (u64_stats_fetch_retry(&session->stats.syncp, start));
-
- if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, stats.tx_packets) ||
- nla_put_u64(skb, L2TP_ATTR_TX_BYTES, stats.tx_bytes) ||
- nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, stats.tx_errors) ||
- nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, stats.rx_packets) ||
- nla_put_u64(skb, L2TP_ATTR_RX_BYTES, stats.rx_bytes) ||
+ if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS,
+ atomic_long_read(&session->stats.tx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_BYTES,
+ atomic_long_read(&session->stats.tx_bytes)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_ERRORS,
+ atomic_long_read(&session->stats.tx_errors)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_PACKETS,
+ atomic_long_read(&session->stats.rx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_BYTES,
+ atomic_long_read(&session->stats.rx_bytes)) ||
nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
- stats.rx_seq_discards) ||
+ atomic_long_read(&session->stats.rx_seq_discards)) ||
nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS,
- stats.rx_oos_packets) ||
- nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, stats.rx_errors))
+ atomic_long_read(&session->stats.rx_oos_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_ERRORS,
+ atomic_long_read(&session->stats.rx_errors)))
goto nla_put_failure;
nla_nest_end(skb, nest);
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 6a53371dba1f..637a341c1e2d 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -97,6 +97,7 @@
#include <net/ip.h>
#include <net/udp.h>
#include <net/xfrm.h>
+#include <net/inet_common.h>
#include <asm/byteorder.h>
#include <linux/atomic.h>
@@ -259,7 +260,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
session->name);
/* Not bound. Nothing we can do, so discard. */
- session->stats.rx_errors++;
+ atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
}
@@ -447,34 +448,16 @@ static void pppol2tp_session_close(struct l2tp_session *session)
{
struct pppol2tp_session *ps = l2tp_session_priv(session);
struct sock *sk = ps->sock;
- struct sk_buff *skb;
+ struct socket *sock = sk->sk_socket;
BUG_ON(session->magic != L2TP_SESSION_MAGIC);
- if (session->session_id == 0)
- goto out;
-
- if (sk != NULL) {
- lock_sock(sk);
-
- if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) {
- pppox_unbind_sock(sk);
- sk->sk_state = PPPOX_DEAD;
- sk->sk_state_change(sk);
- }
-
- /* Purge any queued data */
- skb_queue_purge(&sk->sk_receive_queue);
- skb_queue_purge(&sk->sk_write_queue);
- while ((skb = skb_dequeue(&session->reorder_q))) {
- kfree_skb(skb);
- sock_put(sk);
- }
- release_sock(sk);
+ if (sock) {
+ inet_shutdown(sock, 2);
+ /* Don't let the session go away before our socket does */
+ l2tp_session_inc_refcount(session);
}
-
-out:
return;
}
@@ -483,19 +466,12 @@ out:
*/
static void pppol2tp_session_destruct(struct sock *sk)
{
- struct l2tp_session *session;
-
- if (sk->sk_user_data != NULL) {
- session = sk->sk_user_data;
- if (session == NULL)
- goto out;
-
+ struct l2tp_session *session = sk->sk_user_data;
+ if (session) {
sk->sk_user_data = NULL;
BUG_ON(session->magic != L2TP_SESSION_MAGIC);
l2tp_session_dec_refcount(session);
}
-
-out:
return;
}
@@ -525,16 +501,13 @@ static int pppol2tp_release(struct socket *sock)
session = pppol2tp_sock_to_session(sk);
/* Purge any queued data */
- skb_queue_purge(&sk->sk_receive_queue);
- skb_queue_purge(&sk->sk_write_queue);
if (session != NULL) {
- struct sk_buff *skb;
- while ((skb = skb_dequeue(&session->reorder_q))) {
- kfree_skb(skb);
- sock_put(sk);
- }
+ __l2tp_session_unhash(session);
+ l2tp_session_queue_purge(session);
sock_put(sk);
}
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
release_sock(sk);
@@ -880,18 +853,6 @@ out:
return error;
}
-/* Called when deleting sessions via the netlink interface.
- */
-static int pppol2tp_session_delete(struct l2tp_session *session)
-{
- struct pppol2tp_session *ps = l2tp_session_priv(session);
-
- if (ps->sock == NULL)
- l2tp_session_dec_refcount(session);
-
- return 0;
-}
-
#endif /* CONFIG_L2TP_V3 */
/* getname() support.
@@ -1025,14 +986,14 @@ end:
static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
struct l2tp_stats *stats)
{
- dest->tx_packets = stats->tx_packets;
- dest->tx_bytes = stats->tx_bytes;
- dest->tx_errors = stats->tx_errors;
- dest->rx_packets = stats->rx_packets;
- dest->rx_bytes = stats->rx_bytes;
- dest->rx_seq_discards = stats->rx_seq_discards;
- dest->rx_oos_packets = stats->rx_oos_packets;
- dest->rx_errors = stats->rx_errors;
+ dest->tx_packets = atomic_long_read(&stats->tx_packets);
+ dest->tx_bytes = atomic_long_read(&stats->tx_bytes);
+ dest->tx_errors = atomic_long_read(&stats->tx_errors);
+ dest->rx_packets = atomic_long_read(&stats->rx_packets);
+ dest->rx_bytes = atomic_long_read(&stats->rx_bytes);
+ dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards);
+ dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets);
+ dest->rx_errors = atomic_long_read(&stats->rx_errors);
}
/* Session ioctl helper.
@@ -1666,14 +1627,14 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
tunnel->name,
(tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N',
atomic_read(&tunnel->ref_count) - 1);
- seq_printf(m, " %08x %llu/%llu/%llu %llu/%llu/%llu\n",
+ seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n",
tunnel->debug,
- (unsigned long long)tunnel->stats.tx_packets,
- (unsigned long long)tunnel->stats.tx_bytes,
- (unsigned long long)tunnel->stats.tx_errors,
- (unsigned long long)tunnel->stats.rx_packets,
- (unsigned long long)tunnel->stats.rx_bytes,
- (unsigned long long)tunnel->stats.rx_errors);
+ atomic_long_read(&tunnel->stats.tx_packets),
+ atomic_long_read(&tunnel->stats.tx_bytes),
+ atomic_long_read(&tunnel->stats.tx_errors),
+ atomic_long_read(&tunnel->stats.rx_packets),
+ atomic_long_read(&tunnel->stats.rx_bytes),
+ atomic_long_read(&tunnel->stats.rx_errors));
}
static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
@@ -1708,14 +1669,14 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
session->lns_mode ? "LNS" : "LAC",
session->debug,
jiffies_to_msecs(session->reorder_timeout));
- seq_printf(m, " %hu/%hu %llu/%llu/%llu %llu/%llu/%llu\n",
+ seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n",
session->nr, session->ns,
- (unsigned long long)session->stats.tx_packets,
- (unsigned long long)session->stats.tx_bytes,
- (unsigned long long)session->stats.tx_errors,
- (unsigned long long)session->stats.rx_packets,
- (unsigned long long)session->stats.rx_bytes,
- (unsigned long long)session->stats.rx_errors);
+ atomic_long_read(&session->stats.tx_packets),
+ atomic_long_read(&session->stats.tx_bytes),
+ atomic_long_read(&session->stats.tx_errors),
+ atomic_long_read(&session->stats.rx_packets),
+ atomic_long_read(&session->stats.rx_bytes),
+ atomic_long_read(&session->stats.rx_errors));
if (po)
seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
@@ -1839,7 +1800,7 @@ static const struct pppox_proto pppol2tp_proto = {
static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = {
.session_create = pppol2tp_session_create,
- .session_delete = pppol2tp_session_delete,
+ .session_delete = l2tp_session_delete,
};
#endif /* CONFIG_L2TP_V3 */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 88709882c464..48aaa89253e0 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -720,6 +720,8 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
int target; /* Read at least this many bytes */
long timeo;
+ msg->msg_namelen = 0;
+
lock_sock(sk);
copied = -ENOTCONN;
if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN))
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 2a6ae8030bd9..9e67cc97b87b 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2089,7 +2089,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
encaps_data = bridge_tunnel_header;
encaps_len = sizeof(bridge_tunnel_header);
skip_header_bytes -= 2;
- } else if (ethertype >= 0x600) {
+ } else if (ethertype >= ETH_P_802_3_MIN) {
encaps_data = rfc1042_header;
encaps_len = sizeof(rfc1042_header);
skip_header_bytes -= 2;
diff --git a/net/mac802154/mac802154.h b/net/mac802154/mac802154.h
index a4dcaf1dd4b6..5c9e021994ba 100644
--- a/net/mac802154/mac802154.h
+++ b/net/mac802154/mac802154.h
@@ -88,8 +88,6 @@ struct mac802154_sub_if_data {
#define mac802154_to_priv(_hw) container_of(_hw, struct mac802154_priv, hw)
-#define MAC802154_MAX_XMIT_ATTEMPTS 3
-
#define MAC802154_CHAN_NONE (~(u8)0) /* No channel is assigned */
extern struct ieee802154_reduced_mlme_ops mac802154_mlme_reduced;
@@ -114,5 +112,6 @@ void mac802154_dev_set_ieee_addr(struct net_device *dev);
u16 mac802154_dev_get_pan_id(const struct net_device *dev);
void mac802154_dev_set_pan_id(struct net_device *dev, u16 val);
void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);
+u8 mac802154_dev_get_dsn(const struct net_device *dev);
#endif /* MAC802154_H */
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index d8d277006089..a99910d4d52f 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -73,4 +73,5 @@ struct ieee802154_mlme_ops mac802154_mlme_wpan = {
.start_req = mac802154_mlme_start_req,
.get_pan_id = mac802154_dev_get_pan_id,
.get_short_addr = mac802154_dev_get_short_addr,
+ .get_dsn = mac802154_dev_get_dsn,
};
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index f47781ab0ccc..8ded97cf1c33 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -159,6 +159,15 @@ void mac802154_dev_set_pan_id(struct net_device *dev, u16 val)
}
}
+u8 mac802154_dev_get_dsn(const struct net_device *dev)
+{
+ struct mac802154_sub_if_data *priv = netdev_priv(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ return priv->dsn++;
+}
+
static void phy_chan_notify(struct work_struct *work)
{
struct phy_chan_notify_work *nw = container_of(work,
@@ -167,9 +176,15 @@ static void phy_chan_notify(struct work_struct *work)
struct mac802154_sub_if_data *priv = netdev_priv(nw->dev);
int res;
+ mutex_lock(&priv->hw->phy->pib_lock);
res = hw->ops->set_channel(&hw->hw, priv->page, priv->chan);
if (res)
pr_debug("set_channel failed\n");
+ else {
+ priv->hw->phy->current_channel = priv->chan;
+ priv->hw->phy->current_page = priv->page;
+ }
+ mutex_unlock(&priv->hw->phy->pib_lock);
kfree(nw);
}
@@ -186,8 +201,11 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
priv->chan = chan;
spin_unlock_bh(&priv->mib_lock);
+ mutex_lock(&priv->hw->phy->pib_lock);
if (priv->hw->phy->current_channel != priv->chan ||
priv->hw->phy->current_page != priv->page) {
+ mutex_unlock(&priv->hw->phy->pib_lock);
+
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
return;
@@ -195,5 +213,6 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
INIT_WORK(&work->work, phy_chan_notify);
work->dev = dev;
queue_work(priv->hw->dev_workqueue, &work->work);
- }
+ } else
+ mutex_unlock(&priv->hw->phy->pib_lock);
}
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 4e09d070995a..6d1647399d4f 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -25,6 +25,7 @@
#include <linux/if_arp.h>
#include <linux/crc-ccitt.h>
+#include <net/ieee802154_netdev.h>
#include <net/mac802154.h>
#include <net/wpan-phy.h>
@@ -39,12 +40,12 @@ struct xmit_work {
struct mac802154_priv *priv;
u8 chan;
u8 page;
- u8 xmit_attempts;
};
static void mac802154_xmit_worker(struct work_struct *work)
{
struct xmit_work *xw = container_of(work, struct xmit_work, work);
+ struct mac802154_sub_if_data *sdata;
int res;
mutex_lock(&xw->priv->phy->pib_lock);
@@ -57,21 +58,23 @@ static void mac802154_xmit_worker(struct work_struct *work)
pr_debug("set_channel failed\n");
goto out;
}
+
+ xw->priv->phy->current_channel = xw->chan;
+ xw->priv->phy->current_page = xw->page;
}
res = xw->priv->ops->xmit(&xw->priv->hw, xw->skb);
+ if (res)
+ pr_debug("transmission failed\n");
out:
mutex_unlock(&xw->priv->phy->pib_lock);
- if (res) {
- if (xw->xmit_attempts++ < MAC802154_MAX_XMIT_ATTEMPTS) {
- queue_work(xw->priv->dev_workqueue, &xw->work);
- return;
- } else
- pr_debug("transmission failed for %d times",
- MAC802154_MAX_XMIT_ATTEMPTS);
- }
+ /* Restart the netif queue on each sub_if_data object. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &xw->priv->slaves, list)
+ netif_wake_queue(sdata->dev);
+ rcu_read_unlock();
dev_kfree_skb(xw->skb);
@@ -82,6 +85,7 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb,
u8 page, u8 chan)
{
struct xmit_work *work;
+ struct mac802154_sub_if_data *sdata;
if (!(priv->phy->channels_supported[page] & (1 << chan))) {
WARN_ON(1);
@@ -109,12 +113,17 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb,
return NETDEV_TX_BUSY;
}
+ /* Stop the netif queue on each sub_if_data object. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &priv->slaves, list)
+ netif_stop_queue(sdata->dev);
+ rcu_read_unlock();
+
INIT_WORK(&work->work, mac802154_xmit_worker);
work->skb = skb;
work->priv = priv;
work->page = page;
work->chan = chan;
- work->xmit_attempts = 0;
queue_work(priv->dev_workqueue, &work->work);
diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c
index d20c6d3c247d..2ca2f4dceab7 100644
--- a/net/mac802154/wpan.c
+++ b/net/mac802154/wpan.c
@@ -145,6 +145,8 @@ static int mac802154_header_create(struct sk_buff *skb,
head[pos++] = mac_cb(skb)->seq; /* DSN/BSN */
fc = mac_cb_type(skb);
+ if (mac_cb_is_ackreq(skb))
+ fc |= IEEE802154_FC_ACK_REQ;
if (!saddr) {
spin_lock_bh(&priv->mib_lock);
@@ -358,7 +360,7 @@ void mac802154_wpan_setup(struct net_device *dev)
dev->header_ops = &mac802154_header_ops;
dev->needed_tailroom = 2; /* FCS */
dev->mtu = IEEE802154_MTU;
- dev->tx_queue_len = 10;
+ dev->tx_queue_len = 300;
dev->type = ARPHRD_IEEE802154;
dev->flags = IFF_NOARP | IFF_BROADCAST;
dev->watchdog_timeo = 0;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a9c488b6c50d..7d97302f7c07 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -276,10 +276,30 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
EXPORT_SYMBOL(nf_nat_decode_session_hook);
#endif
+static int __net_init netfilter_net_init(struct net *net)
+{
#ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_net_netfilter;
-EXPORT_SYMBOL(proc_net_netfilter);
+ net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
+ net->proc_net);
+ if (!net->nf.proc_netfilter) {
+ if (!net_eq(net, &init_net))
+ pr_err("cannot create netfilter proc entry");
+
+ return -ENOMEM;
+ }
#endif
+ return 0;
+}
+
+static void __net_exit netfilter_net_exit(struct net *net)
+{
+ remove_proc_entry("netfilter", net->proc_net);
+}
+
+static struct pernet_operations netfilter_net_ops = {
+ .init = netfilter_net_init,
+ .exit = netfilter_net_exit,
+};
void __init netfilter_init(void)
{
@@ -289,11 +309,8 @@ void __init netfilter_init(void)
INIT_LIST_HEAD(&nf_hooks[i][h]);
}
-#ifdef CONFIG_PROC_FS
- proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net);
- if (!proc_net_netfilter)
+ if (register_pernet_subsys(&netfilter_net_ops) < 0)
panic("cannot create netfilter proc entry");
-#endif
if (netfilter_log_init() < 0)
panic("cannot initialize nf_log");
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 1ba9dbc0e107..86f5e26f39d3 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -15,7 +15,6 @@
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
-#include <linux/netlink.h>
#include <linux/rculist.h>
#include <net/netlink.h>
@@ -1085,7 +1084,7 @@ static int
dump_init(struct netlink_callback *cb)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
@@ -1301,7 +1300,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
struct sk_buff *skb2;
struct nlmsgerr *errmsg;
size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
struct nlattr *cmdattr;
u32 *errline;
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0b779d7df881..dfd7b65b3d2a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -58,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
module_put(app->module);
}
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
/*
* Allocate/initialize app incarnation and register it in proto apps.
@@ -106,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
return 0;
out:
- kfree(inc->timeout_table);
- kfree(inc);
+ ip_vs_app_inc_destroy(inc);
return ret;
}
@@ -131,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
list_del(&inc->a_list);
- kfree(inc->timeout_table);
- kfree(inc);
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
}
@@ -144,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
- atomic_inc(&inc->usecnt);
- if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
- atomic_dec(&inc->usecnt);
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
return result;
}
@@ -156,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
- ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
}
@@ -218,6 +228,7 @@ out_unlock:
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
*/
void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
@@ -341,14 +352,14 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
unsigned int flag, __u32 seq, int diff)
{
/* spinlock is to keep updating cp->flags atomic */
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
vseq->previous_delta = vseq->delta;
vseq->delta += diff;
vseq->init_seq = seq;
cp->flags |= flag;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 704e514e02ab..de6475894a39 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
struct ip_vs_aligned_lock
{
- rwlock_t l;
+ spinlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-static inline void ct_read_lock(unsigned int key)
-{
- read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned int key)
-{
- read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock(unsigned int key)
-{
- write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock(unsigned int key)
-{
- write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned int key)
-{
- read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned int key)
-{
- read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
static inline void ct_write_lock_bh(unsigned int key)
{
- write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock_bh(unsigned int key)
{
- write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
@@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/* Hash by protocol, client address and port */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
ret = 1;
} else {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
}
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
- * returns bool success.
+ * returns bool success. Caller should hold conn reference.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
@@ -230,11 +200,11 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
/* unhash it and decrease its reference counter */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- hlist_del(&cp->c_list);
+ hlist_del_rcu(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
ret = 0;
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -262,24 +262,25 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
return cp;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
return NULL;
}
@@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (!ip_vs_conn_net_eq(cp, p->net))
- continue;
- if (p->pe_data && p->pe->ct_match) {
- if (p->pe == cp->pe && p->pe->ct_match(p, cp))
- goto out;
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
continue;
}
@@ -363,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
* p->vaddr is a fwmark */
ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ p->vport == cp->vport && p->cport == cp->cport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- p->protocol == cp->protocol)
- goto out;
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
}
cp = NULL;
out:
- if (cp)
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -398,23 +402,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
*/
hash = ip_vs_conn_hashkey_param(p, true);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->vport == cp->cport && p->cport == cp->dport &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -457,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
if (ip_vs_conn_unhash(cp)) {
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
atomic_dec(&ip_vs_conn_no_cport_cnt);
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
cp->cport = cport;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
/* hash on new dport */
ip_vs_conn_hash(cp);
@@ -549,7 +554,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
return;
/* Increase the refcnt counter of the dest */
- atomic_inc(&dest->refcnt);
+ ip_vs_dest_hold(dest);
conn_flags = atomic_read(&dest->conn_flags);
if (cp->protocol != IPPROTO_UDP)
@@ -606,20 +611,22 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
* Check if there is a destination for the connection, if so
* bind the connection to the destination.
*/
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest;
+ rcu_read_lock();
dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
cp->dport, &cp->vaddr, cp->vport,
cp->protocol, cp->fwmark, cp->flags);
if (dest) {
struct ip_vs_proto_data *pd;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (cp->dest) {
- spin_unlock(&cp->lock);
- return dest;
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
}
/* Applications work depending on the forwarding method
@@ -628,7 +635,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
ip_vs_unbind_app(cp);
ip_vs_bind_dest(cp, dest);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
/* Update its packet transmitter */
cp->packet_xmit = NULL;
@@ -643,7 +650,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
if (pd && atomic_read(&pd->appcnt))
ip_vs_bind_app(cp, pd->pp);
}
- return dest;
+ rcu_read_unlock();
}
@@ -695,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
}
- /*
- * Simply decrease the refcnt of the dest, because the
- * dest will be either in service's destination list
- * or in the trash.
- */
- atomic_dec(&dest->refcnt);
+ ip_vs_dest_put(dest);
}
static int expire_quiescent_template(struct netns_ipvs *ipvs,
@@ -757,41 +759,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
- atomic_dec(&ct->refcnt);
+ __ip_vs_conn_put(ct);
return 0;
}
return 1;
}
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
+
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
struct net *net = ip_vs_conn_net(cp);
struct netns_ipvs *ipvs = net_ipvs(net);
- cp->timeout = 60*HZ;
-
- /*
- * hey, I'm using it
- */
- atomic_inc(&cp->refcnt);
-
/*
* do I control anybody?
*/
if (atomic_read(&cp->n_control))
goto expire_later;
- /*
- * unhash it if it is hashed in the conn table
- */
- if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
- goto expire_later;
-
- /*
- * refcnt==1 implies I'm the only one referrer
- */
- if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
/* delete the timer if it is activated by other users */
del_timer(&cp->timer);
@@ -810,38 +807,41 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_conn_drop_conntrack(cp);
}
- ip_vs_pe_put(cp->pe);
- kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
-
- kmem_cache_free(ip_vs_conn_cachep, cp);
return;
}
- /* hash it back to the table */
- ip_vs_conn_hash(cp);
-
expire_later:
- IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
- atomic_read(&cp->refcnt)-1,
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
atomic_read(&cp->n_control));
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp);
}
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer))
- mod_timer(&cp->timer, jiffies);
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
}
@@ -858,7 +858,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
- cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NULL;
@@ -869,13 +869,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
- ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
- ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+ ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr);
cp->vport = p->vport;
/* proto should only be IPPROTO_IP if d_addr is a fwmark */
- ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
- &cp->daddr, daddr);
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
@@ -884,6 +884,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
@@ -894,18 +898,28 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
*/
atomic_set(&cp->refcnt, 1);
+ cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
+ cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
+ cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;
@@ -952,14 +966,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
struct ip_vs_iter_state *iter = seq->private;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
if (pos-- == 0) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
}
- ct_read_unlock_bh(idx);
+ rcu_read_unlock();
}
return NULL;
@@ -977,6 +994,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
struct hlist_head *l = iter->l;
int idx;
@@ -985,19 +1003,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if (cp->c_list.next)
- return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list);
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_conn, c_list);
+ rcu_read_unlock();
idx = l - ip_vs_conn_tab;
- ct_read_unlock_bh(idx);
-
while (++idx < ip_vs_conn_tab_size) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
- ct_read_unlock_bh(idx);
+ rcu_read_unlock();
}
iter->l = NULL;
return NULL;
@@ -1009,7 +1027,7 @@ static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
struct hlist_head *l = iter->l;
if (l)
- ct_read_unlock_bh(l - ip_vs_conn_tab);
+ rcu_read_unlock();
}
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1188,7 +1206,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
void ip_vs_random_dropentry(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
/*
* Randomly scan 1/32 of the whole table every second
@@ -1199,9 +1217,9 @@ void ip_vs_random_dropentry(struct net *net)
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock_bh(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
@@ -1228,12 +1246,15 @@ void ip_vs_random_dropentry(struct net *net)
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(hash);
+ rcu_read_unlock();
}
}
@@ -1244,7 +1265,7 @@ void ip_vs_random_dropentry(struct net *net)
static void ip_vs_conn_flush(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
struct netns_ipvs *ipvs = net_ipvs(net);
flush_again:
@@ -1252,19 +1273,22 @@ flush_again:
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock_bh(idx);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (!ip_vs_conn_net_eq(cp, net))
continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(idx);
+ rcu_read_unlock();
}
/* the counter may be not NULL, because maybe some conn entries
@@ -1331,7 +1355,7 @@ int __init ip_vs_conn_init(void)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
- rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
@@ -1342,6 +1366,8 @@ int __init ip_vs_conn_init(void)
void ip_vs_conn_cleanup(void)
{
+ /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
vfree(ip_vs_conn_tab);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 47edf5a40a59..f26fe3353a30 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
-int ip_vs_net_id __read_mostly;
-#ifdef IP_VS_GENERIC_NETNS
-EXPORT_SYMBOL(ip_vs_net_id);
-#endif
+static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
@@ -206,7 +203,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
{
ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
vport, p);
- p->pe = svc->pe;
+ p->pe = rcu_dereference(svc->pe);
if (p->pe && p->pe->fill_param)
return p->pe->fill_param(p, skb);
@@ -299,12 +296,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
+ struct ip_vs_scheduler *sched;
+
/*
* No template found or the dest of the connection
* template is not available.
* return *ignored=0 i.e. ICMP and NF_DROP
*/
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
@@ -394,6 +394,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
+ struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
@@ -449,7 +450,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
@@ -507,7 +509,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL) {
- ip_vs_service_put(svc);
return NF_DROP;
}
@@ -533,8 +534,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
- ip_vs_service_put(svc);
-
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
@@ -571,12 +570,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
- ip_vs_service_put(svc);
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
return NF_ACCEPT;
- }
-
- ip_vs_service_put(svc);
/*
* Notify the client that the destination is unreachable, and
@@ -643,8 +638,11 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
- int err = ip_defrag(skb, user);
+ int err;
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
if (!err)
ip_send_check(ip_hdr(skb));
@@ -1164,9 +1162,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(net, af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
/*
* Notify the real server: there is no
* existing entry if it is not RST
@@ -1181,9 +1178,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- struct net *net =
- dev_net(skb_dst(skb)->dev);
-
if (!skb->dev)
skb->dev = net->loopback_dev;
icmpv6_send(skb,
@@ -1226,13 +1220,7 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1259,13 +1247,7 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(hooknum, skb, AF_INET6);
}
#endif
@@ -1394,19 +1376,20 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
skb_reset_network_header(skb);
IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
- rcu_read_lock();
ipv4_update_pmtu(skb, dev_net(skb->dev),
mtu, 0, 0, 0, 0);
- rcu_read_unlock();
/* Client uses PMTUD? */
if (!(cih->frag_off & htons(IP_DF)))
goto ignore_ipip;
/* Prefer the resulting PMTU */
if (dest) {
- spin_lock(&dest->dst_lock);
- if (dest->dst_cache)
- mtu = dst_mtu(dest->dst_cache);
- spin_unlock(&dest->dst_lock);
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
}
if (mtu > 68 + sizeof(struct iphdr))
mtu -= sizeof(struct iphdr);
@@ -1577,7 +1560,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
/* ipvs enabled in this netns ? */
net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, &iph);
@@ -1654,7 +1638,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
- ipvs = net_ipvs(net);
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
@@ -1722,13 +1705,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1787,13 +1764,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(hooknum, skb, AF_INET6);
}
#endif
@@ -1815,13 +1786,15 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
{
int r;
struct net *net;
+ struct netns_ipvs *ipvs;
if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
return ip_vs_in_icmp(skb, &r, hooknum);
@@ -1835,6 +1808,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
{
int r;
struct net *net;
+ struct netns_ipvs *ipvs;
struct ip_vs_iphdr iphdr;
ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
@@ -1843,7 +1817,8 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
/* ipvs enabled in this netns ? */
net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c68198bf9128..9e4074c26dc2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -55,9 +55,6 @@
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
/* sysctl variables */
#ifdef CONFIG_IP_VS_DEBUG
@@ -71,7 +68,7 @@ int ip_vs_get_debug_level(void)
/* Protos */
-static void __ip_vs_del_service(struct ip_vs_service *svc);
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
#ifdef CONFIG_IP_VS_IPV6
@@ -257,9 +254,9 @@ ip_vs_use_count_dec(void)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
@@ -271,16 +268,18 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
{
register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
+ __u32 ahash;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- addr_fold ^= ((size_t)net>>8);
+ ahash = ntohl(addr_fold);
+ ahash ^= ((size_t) net >> 8);
- return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
- & IP_VS_SVC_TAB_MASK;
+ return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+ IP_VS_SVC_TAB_MASK;
}
/*
@@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
*/
hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
&svc->addr, svc->port);
- list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+ hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in svc_fwm_table
*/
hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
- list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
svc->flags |= IP_VS_SVC_F_HASHED;
@@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/* Remove it from the svc_table table */
- list_del(&svc->s_list);
+ hlist_del_rcu(&svc->s_list);
} else {
/* Remove it from the svc_fwm_table table */
- list_del(&svc->f_list);
+ hlist_del_rcu(&svc->f_list);
}
svc->flags &= ~IP_VS_SVC_F_HASHED;
@@ -367,7 +366,7 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol,
/* Check for "full" addressed entries */
hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
- list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
@@ -394,7 +393,7 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
/* Check for fwmark addressed entries */
hash = ip_vs_svc_fwm_hashkey(net, fwmark);
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af
&& net_eq(svc->net, net)) {
/* HIT */
@@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
return NULL;
}
+/* Find service, called under RCU lock */
struct ip_vs_service *
-ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
- const union nf_inet_addr *vaddr, __be16 vport)
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
struct netns_ipvs *ipvs = net_ipvs(net);
- read_lock(&__ip_vs_svc_lock);
-
/*
* Check the table hashed by fwmark first
*/
@@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
}
out:
- if (svc)
- atomic_inc(&svc->usecnt);
- read_unlock(&__ip_vs_svc_lock);
-
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
@@ -469,6 +463,13 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
dest->svc = svc;
}
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{
+ if (svc->stats.cpustats)
+ free_percpu(svc->stats.cpustats);
+ kfree(svc);
+}
+
static void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
@@ -476,12 +477,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
dest->svc = NULL;
if (atomic_dec_and_test(&svc->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ntohs(svc->port));
+ ip_vs_service_free(svc);
}
}
@@ -506,17 +506,13 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
& IP_VS_RTAB_MASK;
}
-/*
- * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
- * should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
- if (!list_empty(&dest->d_list)) {
- return 0;
- }
+ if (dest->in_rs_table)
+ return;
/*
* Hash by proto,addr,port,
@@ -524,64 +520,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ipvs->rs_table[hash]);
-
- return 1;
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
}
-/*
- * UNhashes ip_vs_dest from rs_table.
- * should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
* Remove it from the rs_table table.
*/
- if (!list_empty(&dest->d_list)) {
- list_del_init(&dest->d_list);
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
}
-
- return 1;
}
-/*
- * Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
- const union nf_inet_addr *daddr,
- __be16 dport)
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
{
struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash;
struct ip_vs_dest *dest;
- /*
- * Check for "full" addressed entries
- * Return the first found entry
- */
+ /* Check for "full" addressed entries */
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&ipvs->rs_lock);
- list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
- if ((dest->af == af)
- && ip_vs_addr_equal(af, &dest->addr, daddr)
- && (dest->port == dport)
- && ((dest->protocol == protocol) ||
- dest->vfwmark)) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
/* HIT */
- read_unlock(&ipvs->rs_lock);
- return dest;
+ rcu_read_unlock();
+ return true;
}
}
- read_unlock(&ipvs->rs_lock);
+ rcu_read_unlock();
- return NULL;
+ return false;
}
-/*
- * Lookup destination by {addr,port} in the given service
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
*/
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
@@ -592,7 +575,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find the destination for the given service
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->af == svc->af)
&& ip_vs_addr_equal(svc->af, &dest->addr, daddr)
&& (dest->port == dport)) {
@@ -606,13 +589,11 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
+ * Created to be used in ip_vs_process_message() in
* the backup synchronization daemon. It finds the
* destination to be bound to the received connection
* on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
+ * Called under RCU lock, no refcnt is returned.
*/
struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
const union nf_inet_addr *daddr,
@@ -625,7 +606,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
struct ip_vs_service *svc;
__be16 port = dport;
- svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
+ svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
@@ -633,12 +614,31 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
dest = ip_vs_lookup_dest(svc, daddr, port);
if (!dest)
dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
- if (dest)
- atomic_inc(&dest->refcnt);
- ip_vs_service_put(svc);
return dest;
}
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_dst *dest_dst = container_of(head,
+ struct ip_vs_dest_dst,
+ rcu_head);
+
+ dst_release(dest_dst->dst_cache);
+ kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst, 1);
+ if (old) {
+ RCU_INIT_POINTER(dest->dest_dst, NULL);
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+ }
+}
+
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
@@ -653,19 +653,25 @@ static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
- struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_dest *dest;
struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
+ /* We can not reuse dest while in grace period
+ * because conns still can use dest->svc
+ */
+ if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
+ continue;
if (dest->af == svc->af &&
ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
dest->port == dport &&
@@ -675,29 +681,27 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
(ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
dest->vport == svc->port))) {
/* HIT */
- return dest;
- }
-
- /*
- * Try to purge the destination from trash if not referenced
- */
- if (atomic_read(&dest->refcnt) == 1) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
- "from trash\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(svc->af, &dest->addr),
- ntohs(dest->port));
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
+ list_del(&dest->t_list);
+ ip_vs_dest_hold(dest);
+ goto out;
}
}
- return NULL;
+ dest = NULL;
+
+out:
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ return dest;
}
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+ __ip_vs_dst_cache_reset(dest);
+ __ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
+ kfree(dest);
+}
/*
* Clean up all the destinations in the trash
@@ -706,19 +710,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* When the ip_vs_control_clearup is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
- * be 1, so we simply release them here.
+ * be 0, so we simply release them here.
*/
static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
+ del_timer_sync(&ipvs->dest_trash_timer);
+ /* No need to use dest_trash_lock */
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
}
}
@@ -768,6 +771,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_scheduler *sched;
int conn_flags;
/* set the weight and the flags */
@@ -783,9 +787,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_hash(ipvs, dest);
- write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
@@ -809,27 +811,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->l_threshold = udest->l_threshold;
spin_lock_bh(&dest->dst_lock);
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
- if (add)
- ip_vs_start_estimator(svc->net, &dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
+ sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
- list_add(&dest->n_list, &svc->destinations);
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
}
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -881,7 +876,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 1);
- INIT_LIST_HEAD(&dest->d_list);
+ INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
@@ -923,10 +918,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Check if the dest already exists in the list
- */
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
@@ -948,11 +943,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- /*
- * Get the destination from the trash
- */
- list_del(&dest->n_list);
-
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
@@ -992,10 +982,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Lookup the destination list
- */
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
@@ -1008,11 +998,21 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return 0;
}
+static void ip_vs_dest_wait_readers(struct rcu_head *head)
+{
+ struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
+ rcu_head);
+
+ /* End of grace period after unlinking */
+ clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
+}
+
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+ bool cleanup)
{
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -1021,38 +1021,24 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&ipvs->rs_lock);
- /*
- * Decrease the refcnt of the dest, and free the dest
- * if nobody refers to it (refcnt=0). Otherwise, throw
- * the destination into the trash.
- */
- if (atomic_dec_and_test(&dest->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port));
- ip_vs_dst_reset(dest);
- /* simply decrease svc->refcnt here, let the caller check
- and release the service if nobody refers to it.
- Only user context can release destination and service,
- and only one user context can update virtual service at a
- time, so the operation here is OK */
- atomic_dec(&dest->svc->refcnt);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
- } else {
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
- "dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port),
- atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ipvs->dest_trash);
- atomic_inc(&dest->refcnt);
+ if (!cleanup) {
+ set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
+ call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
}
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + IP_VS_DEST_TRASH_PERIOD);
+ /* dest lives in trash without reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_dest_put(dest);
}
@@ -1068,14 +1054,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
/*
* Remove it from the d-linked destination list.
*/
- list_del(&dest->n_list);
+ list_del_rcu(&dest->n_list);
svc->num_dests--;
- /*
- * Call the update_service function of its scheduler
- */
- if (svcupd && svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
+ if (svcupd) {
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched->del_dest)
+ sched->del_dest(svc, dest);
+ }
}
@@ -1090,37 +1078,56 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
EnterFunction(2);
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
return -ENOENT;
}
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
/*
* Unlink dest from the service
*/
__ip_vs_unlink_dest(svc, dest, 1);
- write_unlock_bh(&__ip_vs_svc_lock);
-
/*
* Delete the destination
*/
- __ip_vs_del_dest(svc->net, dest);
+ __ip_vs_del_dest(svc->net, dest, false);
LeaveFunction(2);
return 0;
}
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+ struct net *net = (struct net *) data;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest *dest, *next;
+
+ spin_lock(&ipvs->dest_trash_lock);
+ list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+ /* Skip if dest is in grace period */
+ if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
+ continue;
+ if (atomic_read(&dest->refcnt) > 0)
+ continue;
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
+ ntohs(dest->port));
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+ if (!list_empty(&ipvs->dest_trash))
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + IP_VS_DEST_TRASH_PERIOD);
+ spin_unlock(&ipvs->dest_trash_lock);
+}
/*
* Add a service into the service hash table
@@ -1176,7 +1183,6 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
}
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1190,7 +1196,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
- rwlock_init(&svc->sched_lock);
+ spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
@@ -1200,7 +1206,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
sched = NULL;
/* Bind the ct retriever */
- ip_vs_bind_pe(svc, pe);
+ RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
@@ -1216,9 +1222,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
ipvs->num_services++;
/* Hash the service into the service table */
- write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_hash(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
*svc_p = svc;
/* Now there is a service - full throttle */
@@ -1228,15 +1232,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
out_err:
if (svc != NULL) {
- ip_vs_unbind_scheduler(svc);
- if (svc->inc) {
- local_bh_disable();
- ip_vs_app_inc_put(svc->inc);
- local_bh_enable();
- }
- if (svc->stats.cpustats)
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
@@ -1286,12 +1283,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
#endif
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched;
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
/*
* Set the flags and timeout value
@@ -1300,57 +1302,30 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
- old_sched = svc->scheduler;
- if (sched != old_sched) {
- /*
- * Unbind the old scheduler
- */
- if ((ret = ip_vs_unbind_scheduler(svc))) {
- old_sched = sched;
- goto out_unlock;
- }
-
- /*
- * Bind the new scheduler
- */
- if ((ret = ip_vs_bind_scheduler(svc, sched))) {
- /*
- * If ip_vs_bind_scheduler fails, restore the old
- * scheduler.
- * The main reason of failure is out of memory.
- *
- * The question is if the old scheduler can be
- * restored all the time. TODO: if it cannot be
- * restored some time, we must delete the service,
- * otherwise the system may crash.
- */
- ip_vs_bind_scheduler(svc, old_sched);
- old_sched = sched;
- goto out_unlock;
- }
- }
-
- old_pe = svc->pe;
- if (pe != old_pe) {
- ip_vs_unbind_pe(svc);
- ip_vs_bind_pe(svc, pe);
- }
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
-out_unlock:
- write_unlock_bh(&__ip_vs_svc_lock);
out:
ip_vs_scheduler_put(old_sched);
ip_vs_pe_put(old_pe);
return ret;
}
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_service *svc;
+
+ svc = container_of(head, struct ip_vs_service, rcu_head);
+ ip_vs_service_free(svc);
+}
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
* - We are called under _bh lock
*/
-static void __ip_vs_del_service(struct ip_vs_service *svc)
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
@@ -1366,27 +1341,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
ip_vs_stop_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
- old_sched = svc->scheduler;
- ip_vs_unbind_scheduler(svc);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ ip_vs_unbind_scheduler(svc, old_sched);
ip_vs_scheduler_put(old_sched);
- /* Unbind persistence engine */
- old_pe = svc->pe;
- ip_vs_unbind_pe(svc);
+ /* Unbind persistence engine, keep svc->pe */
+ old_pe = rcu_dereference_protected(svc->pe, 1);
ip_vs_pe_put(old_pe);
- /* Unbind app inc */
- if (svc->inc) {
- ip_vs_app_inc_put(svc->inc);
- svc->inc = NULL;
- }
-
/*
* Unlink the whole destination list
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(svc->net, dest);
+ __ip_vs_del_dest(svc->net, dest, cleanup);
}
/*
@@ -1400,13 +1368,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ntohs(svc->port));
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
/* decrease the module use count */
@@ -1416,23 +1383,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static void ip_vs_unlink_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
+ /* Hold svc to avoid double release from dest_trash */
+ atomic_inc(&svc->refcnt);
/*
* Unhash it from the service table
*/
- write_lock_bh(&__ip_vs_svc_lock);
-
ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
- __ip_vs_del_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
+ __ip_vs_del_service(svc, cleanup);
}
/*
@@ -1442,7 +1402,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
{
if (svc == NULL)
return -EEXIST;
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, false);
return 0;
}
@@ -1451,19 +1411,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(struct net *net)
+static int ip_vs_flush(struct net *net, bool cleanup)
{
int idx;
- struct ip_vs_service *svc, *nxt;
+ struct ip_vs_service *svc;
+ struct hlist_node *n;
/*
* Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
- s_list) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+ s_list) {
if (net_eq(svc->net, net))
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1471,10 +1432,10 @@ static int ip_vs_flush(struct net *net)
* Flush the service table hashed by fwmark
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt,
- &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+ f_list) {
if (net_eq(svc->net, net))
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1490,32 +1451,29 @@ void ip_vs_service_net_cleanup(struct net *net)
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(net);
+ ip_vs_flush(net, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
-/*
- * Release dst hold by dst_cache
- */
+
+/* Put all references for device (dst_cache) */
static inline void
-__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
spin_lock_bh(&dest->dst_lock);
- if (dest->dst_cache && dest->dst_cache->dev == dev) {
+ if (dest->dest_dst && dest->dest_dst->dst_cache->dev == dev) {
IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
dev->name,
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
}
spin_unlock_bh(&dest->dst_lock);
}
-/*
- * Netdev event receiver
- * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
- * a device that is "unregister" it must be released.
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
*/
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
void *ptr)
@@ -1527,35 +1485,37 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
struct ip_vs_dest *dest;
unsigned int idx;
- if (event != NETDEV_UNREGISTER || !ipvs)
+ if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
}
- list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
- __ip_vs_dev_reset(dest, dev);
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ ip_vs_forget_dev(dest, dev);
}
+ spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
return NOTIFY_DONE;
@@ -1568,12 +1528,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
- write_lock_bh(&__ip_vs_svc_lock);
list_for_each_entry(dest, &svc->destinations, n_list) {
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
- write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
@@ -1583,14 +1541,14 @@ static int ip_vs_zero_all(struct net *net)
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
@@ -1808,6 +1766,12 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "backup_only",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -1912,7 +1876,7 @@ static struct ctl_table vs_vars[] = {
struct ip_vs_iter {
struct seq_net_private p; /* Do not move this, netns depends upon it*/
- struct list_head *table;
+ struct hlist_head *table;
int bucket;
};
@@ -1945,7 +1909,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
@@ -1956,7 +1920,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+ f_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
@@ -1969,17 +1934,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
}
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
{
- read_lock_bh(&__ip_vs_svc_lock);
+ rcu_read_lock();
return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *e;
+ struct hlist_node *e;
struct ip_vs_iter *iter;
struct ip_vs_service *svc;
@@ -1992,13 +1956,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (iter->table == ip_vs_svc_table) {
/* next service in table hashed by protocol */
- if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, s_list);
-
+ e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, s_list);
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
- s_list) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_table[iter->bucket],
+ s_list) {
return svc;
}
}
@@ -2009,13 +1974,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* next service in hashed by fwmark */
- if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, f_list);
+ e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, f_list);
scan_fwmark:
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
- f_list)
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
return svc;
}
@@ -2023,9 +1990,8 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
{
- read_unlock_bh(&__ip_vs_svc_lock);
+ rcu_read_unlock();
}
@@ -2043,6 +2009,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_service *svc = v;
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
@@ -2051,18 +2018,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
- svc->scheduler->name);
+ sched->name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- svc->scheduler->name,
+ sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
- svc->fwmark, svc->scheduler->name,
+ svc->fwmark, sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
@@ -2073,7 +2040,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
else
seq_putc(seq, '\n');
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
if (dest->af == AF_INET6)
seq_printf(seq,
@@ -2383,7 +2350,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush(net);
+ ret = ip_vs_flush(net, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
@@ -2418,11 +2385,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
}
/* Lookup the exact service by <protocol, addr, port> or fwmark */
+ rcu_read_lock();
if (usvc.fwmark == 0)
svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2474,11 +2443,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(src->scheduler, 1);
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+ strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2497,7 +2469,7 @@ __ip_vs_get_service_entries(struct net *net,
int ret = 0;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
@@ -2516,7 +2488,7 @@ __ip_vs_get_service_entries(struct net *net,
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
@@ -2545,11 +2517,13 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
union nf_inet_addr addr = { .ip = get->addr };
int ret = 0;
+ rcu_read_lock();
if (get->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
+ rcu_read_unlock();
if (svc) {
int count = 0;
@@ -2732,12 +2706,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
+ rcu_read_lock();
if (entry->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET,
entry->protocol, &addr,
entry->port);
+ rcu_read_unlock();
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2894,6 +2870,7 @@ nla_put_failure:
static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_service *svc)
{
+ struct ip_vs_scheduler *sched;
struct nlattr *nl_service;
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
@@ -2914,7 +2891,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
goto nla_put_failure;
}
- if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
(svc->pe &&
nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
@@ -2965,7 +2943,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -2976,7 +2954,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
}
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -3036,11 +3014,13 @@ static int ip_vs_genl_parse_service(struct net *net,
usvc->fwmark = 0;
}
+ rcu_read_lock();
if (usvc->fwmark)
svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
+ rcu_read_unlock();
*ret_svc = svc;
/* If a full entry was requested, check for the additional fields */
@@ -3392,7 +3372,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush(net);
+ ret = ip_vs_flush(net, false);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
ret = ip_vs_genl_set_config(net, info->attrs);
@@ -3741,6 +3721,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
ipvs->sysctl_pmtu_disc = 1;
tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+ tbl[idx++].data = &ipvs->sysctl_backup_only;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
@@ -3783,13 +3764,14 @@ int __net_init ip_vs_control_net_init(struct net *net)
int idx;
struct netns_ipvs *ipvs = net_ipvs(net);
- rwlock_init(&ipvs->rs_lock);
-
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
- INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+ INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
INIT_LIST_HEAD(&ipvs->dest_trash);
+ spin_lock_init(&ipvs->dest_trash_lock);
+ setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+ (unsigned long) net);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
@@ -3819,6 +3801,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* Some dest can be in grace period even before cleanup, we have to
+ * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
+ */
+ rcu_barrier();
ip_vs_trash_cleanup(net);
ip_vs_stop_estimator(net, &ipvs->tot_stats);
ip_vs_control_net_cleanup_sysctl(net);
@@ -3864,10 +3850,10 @@ int __init ip_vs_control_init(void)
EnterFunction(2);
- /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
+ /* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
smp_wmb(); /* Do we really need it now ? */
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 7f3b0cc00b7a..ccab120df45e 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -51,7 +51,7 @@
* IPVS DH bucket
*/
struct ip_vs_dh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -64,6 +64,10 @@ struct ip_vs_dh_bucket {
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
/*
* Returns hash value for IPVS DH entry
@@ -85,10 +89,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
{
- return (tbl[ip_vs_dh_hashkey(af, addr)]).dest;
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
}
@@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_dh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
p = p->next;
}
@@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
{
int i;
struct ip_vs_dh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -145,51 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl;
+ struct ip_vs_dh_state *s;
/* allocate the DH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
- GFP_KERNEL);
- if (tbl == NULL)
+ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- svc->sched_data = tbl;
+ svc->sched_data = s;
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_dh_reassign(s, svc);
return 0;
}
-static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
+ struct ip_vs_dh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ ip_vs_dh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ struct ip_vs_dh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ ip_vs_dh_reassign(s, svc);
return 0;
}
@@ -212,19 +217,20 @@ static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
- struct ip_vs_dh_bucket *tbl;
+ struct ip_vs_dh_state *s;
struct ip_vs_iphdr iph;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
- dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr);
+ s = (struct ip_vs_dh_state *) svc->sched_data;
+ dest = ip_vs_dh_get(svc->af, s, &iph.daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
@@ -248,7 +254,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
.init_service = ip_vs_dh_init_svc,
.done_service = ip_vs_dh_done_svc,
- .update_service = ip_vs_dh_update_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
.schedule = ip_vs_dh_schedule,
};
@@ -262,6 +269,7 @@ static int __init ip_vs_dh_init(void)
static void __exit ip_vs_dh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 0fac6017b6fb..6bee6d0c73a5 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -56,7 +56,7 @@
* Make a summary from each cpu
*/
static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
- struct ip_vs_cpu_stats *stats)
+ struct ip_vs_cpu_stats __percpu *stats)
{
int i;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 4f53a5f04437..77c173282f38 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -267,10 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* hopefully it will succeed on the retransmitted
* packet.
*/
+ rcu_read_lock();
ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
iph->ihl * 4,
start-data, end-start,
buf, buf_len);
+ rcu_read_unlock();
if (ret) {
ip_vs_nfct_expect_related(skb, ct, n_cp,
IPPROTO_TCP, 0, 0);
@@ -480,6 +482,7 @@ static int __init ip_vs_ftp_init(void)
int rv;
rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
return rv;
}
@@ -489,6 +492,7 @@ static int __init ip_vs_ftp_init(void)
static void __exit ip_vs_ftp_exit(void)
{
unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index fdd89b9564ea..b2cc2528a4df 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -90,11 +90,12 @@
* IP address and its destination server
*/
struct ip_vs_lblc_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -102,12 +103,14 @@ struct ip_vs_lblc_entry {
* IPVS lblc hash table
*/
struct ip_vs_lblc_table {
- struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head __rcu bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
- struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
@@ -129,13 +132,16 @@ static ctl_table vs_vars_table[] = {
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
- list_del(&en->list);
+ struct ip_vs_dest *dest;
+
+ hlist_del_rcu(&en->list);
/*
* We don't kfree dest because it is referred either by its service
* or the trash dest list.
*/
- atomic_dec(&en->dest->refcnt);
- kfree(en);
+ dest = rcu_dereference_protected(en->dest, 1);
+ ip_vs_dest_put(dest);
+ kfree_rcu(en, rcu_head);
}
@@ -165,15 +171,12 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- * lock
- */
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
const union nf_inet_addr *addr)
@@ -181,7 +184,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
unsigned int hash = ip_vs_lblc_hashkey(af, addr);
struct ip_vs_lblc_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -191,7 +194,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
/*
* Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
+ * address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
@@ -209,14 +212,20 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
ip_vs_addr_copy(dest->af, &en->addr, daddr);
en->lastuse = jiffies;
- atomic_inc(&dest->refcnt);
- en->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(en->dest, dest);
ip_vs_lblc_hash(tbl, en);
- } else if (en->dest != dest) {
- atomic_dec(&en->dest->refcnt);
- atomic_inc(&dest->refcnt);
- en->dest = dest;
+ } else {
+ struct ip_vs_dest *old_dest;
+
+ old_dest = rcu_dereference_protected(en->dest, 1);
+ if (old_dest != dest) {
+ ip_vs_dest_put(old_dest);
+ ip_vs_dest_hold(dest);
+ /* No ordering constraints for refcnt */
+ RCU_INIT_POINTER(en->dest, dest);
+ }
}
return en;
@@ -226,17 +235,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
int i;
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
@@ -252,15 +266,16 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc)
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
unsigned long now = jiffies;
int i, j;
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now,
en->lastuse +
sysctl_lblc_expiration(svc)))
@@ -269,7 +284,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -293,7 +308,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -314,8 +330,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
continue;
@@ -323,7 +339,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -354,11 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -371,7 +388,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
@@ -379,14 +396,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblc_flush(tbl);
+ ip_vs_lblc_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -408,7 +423,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
@@ -423,7 +438,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -457,7 +472,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -484,7 +499,6 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
@@ -499,14 +513,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* free up entries from the trash at any time.
*/
- if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
- dest = en->dest;
+ dest = rcu_dereference(en->dest);
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
}
- read_unlock(&svc->sched_lock);
-
- /* If the destination has a weight and is not overloaded, use it */
- if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
- goto out;
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc);
@@ -516,9 +527,10 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblc_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph.daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
@@ -621,6 +633,7 @@ static void __exit ip_vs_lblc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
unregister_pernet_subsys(&ip_vs_lblc_ops);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index c03b6a3ade2f..feb9656eac58 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -89,40 +89,44 @@
*/
struct ip_vs_dest_set_elem {
struct list_head list; /* list link */
- struct ip_vs_dest *dest; /* destination server */
+ struct ip_vs_dest __rcu *dest; /* destination server */
+ struct rcu_head rcu_head;
};
struct ip_vs_dest_set {
atomic_t size; /* set size */
unsigned long lastmod; /* last modified time */
struct list_head list; /* destination list */
- rwlock_t lock; /* lock for this list */
};
-static struct ip_vs_dest_set_elem *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
{
struct ip_vs_dest_set_elem *e;
- list_for_each_entry(e, &set->list, list) {
- if (e->dest == dest)
- /* already existed */
- return NULL;
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ struct ip_vs_dest *d;
+
+ d = rcu_dereference_protected(e->dest, 1);
+ if (d == dest)
+ /* already existed */
+ return;
+ }
}
e = kmalloc(sizeof(*e), GFP_ATOMIC);
if (e == NULL)
- return NULL;
+ return;
- atomic_inc(&dest->refcnt);
- e->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(e->dest, dest);
- list_add(&e->list, &set->list);
+ list_add_rcu(&e->list, &set->list);
atomic_inc(&set->size);
set->lastmod = jiffies;
- return e;
}
static void
@@ -131,13 +135,16 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
struct ip_vs_dest_set_elem *e;
list_for_each_entry(e, &set->list, list) {
- if (e->dest == dest) {
+ struct ip_vs_dest *d;
+
+ d = rcu_dereference_protected(e->dest, 1);
+ if (d == dest) {
/* HIT */
atomic_dec(&set->size);
set->lastmod = jiffies;
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ ip_vs_dest_put(dest);
+ list_del_rcu(&e->list);
+ kfree_rcu(e, rcu_head);
break;
}
}
@@ -147,17 +154,18 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
struct ip_vs_dest_set_elem *e, *ep;
- write_lock(&set->lock);
list_for_each_entry_safe(e, ep, &set->list, list) {
+ struct ip_vs_dest *d;
+
+ d = rcu_dereference_protected(e->dest, 1);
/*
* We don't kfree dest because it is referred either
* by its service or by the trash dest list.
*/
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ ip_vs_dest_put(d);
+ list_del_rcu(&e->list);
+ kfree_rcu(e, rcu_head);
}
- write_unlock(&set->lock);
}
/* get weighted least-connection node in the destination set */
@@ -171,8 +179,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
return NULL;
/* select the first destination server, whose weight > 0 */
- list_for_each_entry(e, &set->list, list) {
- least = e->dest;
+ list_for_each_entry_rcu(e, &set->list, list) {
+ least = rcu_dereference(e->dest);
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -186,8 +194,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* find the destination with the weighted least load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
- dest = e->dest;
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
+ dest = rcu_dereference(e->dest);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -224,7 +232,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* select the first destination server, whose weight > 0 */
list_for_each_entry(e, &set->list, list) {
- most = e->dest;
+ most = rcu_dereference_protected(e->dest, 1);
if (atomic_read(&most->weight) > 0) {
moh = ip_vs_dest_conn_overhead(most);
goto nextstage;
@@ -234,8 +242,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* find the destination with the weighted most load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
- dest = e->dest;
+ list_for_each_entry_continue(e, &set->list, list) {
+ dest = rcu_dereference_protected(e->dest, 1);
doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
@@ -262,11 +270,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
* IP address and its destination server set
*/
struct ip_vs_lblcr_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
struct ip_vs_dest_set set; /* destination server set */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -274,12 +283,14 @@ struct ip_vs_lblcr_entry {
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
- struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head __rcu bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
@@ -302,9 +313,9 @@ static ctl_table vs_vars_table[] = {
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
- list_del(&en->list);
+ hlist_del_rcu(&en->list);
ip_vs_dest_set_eraseall(&en->set);
- kfree(en);
+ kfree_rcu(en, rcu_head);
}
@@ -334,15 +345,12 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- * read lock.
- */
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
const union nf_inet_addr *addr)
@@ -350,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
struct ip_vs_lblcr_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -360,7 +368,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
/*
* Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
+ * IP address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
@@ -381,14 +389,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/* initialize its dest set */
atomic_set(&(en->set.size), 0);
INIT_LIST_HEAD(&en->set.list);
- rwlock_init(&en->set.lock);
+
+ ip_vs_dest_set_insert(&en->set, dest, false);
ip_vs_lblcr_hash(tbl, en);
+ return en;
}
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest, true);
return en;
}
@@ -397,17 +405,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
int i;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
- /* No locking required, only called during cleanup. */
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
@@ -425,13 +437,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_after(en->lastuse +
sysctl_lblcr_expiration(svc), now))
continue;
@@ -439,7 +452,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -463,7 +476,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -484,8 +498,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
@@ -493,7 +507,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -523,11 +537,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -540,7 +555,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
@@ -548,14 +563,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblcr_flush(tbl);
+ ip_vs_lblcr_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -577,7 +590,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -593,7 +606,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -627,7 +640,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -646,7 +659,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
struct ip_vs_iphdr iph;
- struct ip_vs_dest *dest = NULL;
+ struct ip_vs_dest *dest;
struct ip_vs_lblcr_entry *en;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
@@ -654,53 +667,46 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
if (en) {
- /* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
/* Get the least loaded destination */
- read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
- read_unlock(&en->set.lock);
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
- time_after(jiffies, en->set.lastmod +
+ time_after(jiffies, en->set.lastmod +
sysctl_lblcr_expiration(svc))) {
- struct ip_vs_dest *m;
+ spin_lock_bh(&svc->sched_lock);
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
- write_lock(&en->set.lock);
- m = ip_vs_dest_set_max(&en->set);
- if (m)
- ip_vs_dest_set_erase(&en->set, m);
- write_unlock(&en->set.lock);
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
}
/* If the destination is not overloaded, use it */
- if (dest && !is_overloaded(dest, svc)) {
- read_unlock(&svc->sched_lock);
+ if (dest && !is_overloaded(dest, svc))
goto out;
- }
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
- read_unlock(&svc->sched_lock);
return NULL;
}
/* Update our cache entry */
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
- }
- read_unlock(&svc->sched_lock);
-
- if (dest)
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
goto out;
+ }
/* No cache entry, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
@@ -710,9 +716,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblcr_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph.daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
@@ -814,6 +821,7 @@ static void __exit ip_vs_lblcr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index f391819c0cca..5128e338a749 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -42,7 +42,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* served, but no new connection is assigned to the server.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
@@ -84,6 +84,7 @@ static int __init ip_vs_lc_init(void)
static void __exit ip_vs_lc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_lc_init);
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index 984d9c137d84..646cfd4baa73 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -75,7 +75,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
!atomic_read(&dest->weight))
@@ -133,6 +133,7 @@ static int __init ip_vs_nq_init(void)
static void __exit ip_vs_nq_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_nq_init);
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 5cf859ccb31b..1a82b29ce8ea 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -13,20 +13,8 @@
/* IPVS pe list */
static LIST_HEAD(ip_vs_pe);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_pe_lock);
-
-/* Bind a service with a pe */
-void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
-{
- svc->pe = pe;
-}
-
-/* Unbind a service from its pe */
-void ip_vs_unbind_pe(struct ip_vs_service *svc)
-{
- svc->pe = NULL;
-}
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
/* Get pe in the pe list by name */
struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
@@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
- spin_lock_bh(&ip_vs_pe_lock);
-
- list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
/* Test and get the modules atomically */
if (pe->module &&
!try_module_get(pe->module)) {
@@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
}
if (strcmp(pe_name, pe->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_pe_lock);
+ rcu_read_unlock();
return pe;
}
if (pe->module)
module_put(pe->module);
}
+ rcu_read_unlock();
- spin_unlock_bh(&ip_vs_pe_lock);
return NULL;
}
@@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_pe_lock);
-
- if (!list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- ip_vs_use_count_dec();
- pr_err("%s(): [%s] pe already linked\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
* in the pe list.
*/
list_for_each_entry(tmp, &ip_vs_pe, n_list) {
if (strcmp(tmp->name, pe->name) == 0) {
- spin_unlock_bh(&ip_vs_pe_lock);
+ mutex_unlock(&ip_vs_pe_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] pe already existed "
"in the system\n", __func__, pe->name);
@@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
}
}
/* Add it into the d-linked pe list */
- list_add(&pe->n_list, &ip_vs_pe);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
pr_info("[%s] pe registered.\n", pe->name);
@@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);
/* Unregister a pe from the pe list */
int unregister_ip_vs_pe(struct ip_vs_pe *pe)
{
- spin_lock_bh(&ip_vs_pe_lock);
- if (list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- pr_err("%s(): [%s] pe is not in the list. failed\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Remove it from the d-linked pe list */
- list_del(&pe->n_list);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 12475ef88daf..00cc0241ed87 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -172,6 +172,7 @@ static int __init ip_vs_sip_init(void)
static void __exit ip_vs_sip_cleanup(void)
{
unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
}
module_init(ip_vs_sip_init);
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index ae8ec6f27688..6e14a7b5602f 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -27,9 +27,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (sch == NULL)
return 0;
net = skb_net(skb);
+ rcu_read_lock();
if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
- &iph->daddr, sh->dest))) {
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
int ignored;
if (ip_vs_todrop(net_ipvs(net))) {
@@ -37,7 +38,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -49,14 +50,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
- else {
- ip_vs_service_put(svc);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -906,7 +906,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
sctp_chunkhdr_t _sctpch, *sch;
unsigned char chunk_type;
int event, next_state;
- int ihl;
+ int ihl, cofs;
#ifdef CONFIG_IP_VS_IPV6
ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -914,8 +914,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
ihl = ip_hdrlen(skb);
#endif
- sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t),
- sizeof(_sctpch), &_sctpch);
+ cofs = ihl + sizeof(sctp_sctphdr_t);
+ sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
if (sch == NULL)
return;
@@ -933,10 +933,12 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
*/
if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
(sch->type == SCTP_CID_COOKIE_ACK)) {
- sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) +
- sch->length), sizeof(_sctpch), &_sctpch);
- if (sch) {
- if (sch->type == SCTP_CID_ABORT)
+ int clen = ntohs(sch->length);
+
+ if (clen >= sizeof(sctp_chunkhdr_t)) {
+ sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+ sizeof(_sctpch), &_sctpch);
+ if (sch && sch->type == SCTP_CID_ABORT)
chunk_type = sch->type;
}
}
@@ -992,9 +994,9 @@ static void
sctp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
set_sctp_state(pd, cp, direction, skb);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline __u16 sctp_app_hashkey(__be16 port)
@@ -1014,30 +1016,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
hash = sctp_app_hashkey(port);
- spin_lock_bh(&ipvs->sctp_app_lock);
list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
- spin_lock_bh(&ipvs->sctp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->sctp_app_lock);
+ list_del_rcu(&inc->p_list);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
@@ -1053,12 +1050,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&ipvs->sctp_app_lock);
- list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1074,7 +1071,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
out:
return result;
}
@@ -1088,7 +1085,6 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->sctp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
sizeof(sctp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 9af653a75825..50a15944c6c1 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -47,9 +47,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
}
net = skb_net(skb);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+ rcu_read_lock();
if (th->syn &&
- (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
- &iph->daddr, th->dest))) {
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
int ignored;
if (ip_vs_todrop(net_ipvs(net))) {
@@ -57,7 +58,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -70,14 +71,13 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
- else {
- ip_vs_service_put(svc);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -557,9 +557,9 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
if (th == NULL)
return;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
set_tcp_state(pd, cp, direction, th);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline __u16 tcp_app_hashkey(__be16 port)
@@ -580,18 +580,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
hash = tcp_app_hashkey(port);
- spin_lock_bh(&ipvs->tcp_app_lock);
list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
@@ -599,13 +597,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
static void
tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock_bh(&ipvs->tcp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->tcp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -624,12 +619,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&ipvs->tcp_app_lock);
- list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -646,7 +641,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -660,11 +655,11 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
: tcp_timeouts[IP_VS_TCP_S_LISTEN]);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
/* ---------------------------------------------
@@ -676,7 +671,6 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->tcp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
sizeof(tcp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 503a842c90d2..b62a3c0ff9bf 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -44,8 +44,9 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
return 0;
}
net = skb_net(skb);
- svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
- &iph->daddr, uh->dest);
+ rcu_read_lock();
+ svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, uh->dest);
if (svc) {
int ignored;
@@ -54,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -67,14 +68,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
- else {
- ip_vs_service_put(svc);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -359,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
hash = udp_app_hashkey(port);
-
- spin_lock_bh(&ipvs->udp_app_lock);
list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
@@ -380,12 +377,9 @@ static void
udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
- struct netns_ipvs *ipvs = net_ipvs(net);
- spin_lock_bh(&ipvs->udp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->udp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -403,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&ipvs->udp_app_lock);
- list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -425,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -467,7 +461,6 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->udp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
sizeof(udp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index c49b388d1085..c35986c793d9 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
{
- svc->sched_data = &svc->destinations;
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -48,36 +57,41 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
- struct list_head *p, *q;
- struct ip_vs_dest *dest;
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- write_lock(&svc->sched_lock);
- p = (struct list_head *)svc->sched_data;
- p = p->next;
- q = p;
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
do {
- /* skip list head */
- if (q == &svc->destinations) {
- q = q->next;
- continue;
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop;
}
-
- dest = list_entry(q, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0)
- /* HIT */
- goto out;
- q = q->next;
- } while (q != p);
- write_unlock(&svc->sched_lock);
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
out:
- svc->sched_data = q;
- write_unlock(&svc->sched_lock);
+ svc->sched_data = &dest->n_list;
+ spin_unlock_bh(&svc->sched_lock);
IP_VS_DBG_BUF(6, "RR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
@@ -94,7 +108,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
.init_service = ip_vs_rr_init_svc,
- .update_service = ip_vs_rr_update_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
.schedule = ip_vs_rr_schedule,
};
@@ -106,6 +121,7 @@ static int __init ip_vs_rr_init(void)
static void __exit ip_vs_rr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_rr_init);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index d6bf20d6cdbe..4dbcda6258bc 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err);
*/
static LIST_HEAD(ip_vs_schedulers);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_sched_lock);
+/* semaphore for schedulers */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
/*
@@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
{
int ret;
- svc->scheduler = scheduler;
-
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
@@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
return ret;
}
}
-
+ rcu_assign_pointer(svc->scheduler, scheduler);
return 0;
}
@@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
/*
* Unbind a service with its scheduler
*/
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *sched)
{
- struct ip_vs_scheduler *sched = svc->scheduler;
+ struct ip_vs_scheduler *cur_sched;
- if (!sched)
- return 0;
+ cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+ /* This check proves that old 'sched' was installed */
+ if (!cur_sched)
+ return;
- if (sched->done_service) {
- if (sched->done_service(svc) != 0) {
- pr_err("%s(): done error\n", __func__);
- return -EINVAL;
- }
- }
-
- svc->scheduler = NULL;
- return 0;
+ if (sched->done_service)
+ sched->done_service(svc);
+ /* svc->scheduler can not be set to NULL */
}
@@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
@@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return sched;
}
if (sched->module)
module_put(sched->module);
}
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return NULL;
}
@@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
if (svc->fwmark) {
IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
- svc->scheduler->name, svc->fwmark,
- svc->fwmark, msg);
+ sched->name, svc->fwmark, svc->fwmark, msg);
#ifdef CONFIG_IP_VS_IPV6
} else if (svc->af == AF_INET6) {
IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
- svc->scheduler->name,
- ip_vs_proto_name(svc->protocol),
+ sched->name, ip_vs_proto_name(svc->protocol),
&svc->addr.in6, ntohs(svc->port), msg);
#endif
} else {
IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
- svc->scheduler->name,
- ip_vs_proto_name(svc->protocol),
+ sched->name, ip_vs_proto_name(svc->protocol),
&svc->addr.ip, ntohs(svc->port), msg);
}
}
@@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (!list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
@@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
@@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
@@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
return -EINVAL;
}
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_err("%s(): [%s] scheduler is not in the list. failed\n",
__func__, scheduler->name);
return -EINVAL;
@@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 89ead246ed3d..f3205925359a 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -79,7 +79,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -94,7 +94,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
@@ -134,6 +134,7 @@ static int __init ip_vs_sed_init(void)
static void __exit ip_vs_sed_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_sed_init);
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e33126994628..0df269d7c99f 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -53,7 +53,7 @@
* IPVS SH bucket
*/
struct ip_vs_sh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -66,6 +66,10 @@ struct ip_vs_sh_bucket {
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+struct ip_vs_sh_state {
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
/*
* Returns hash value for IPVS SH entry
@@ -87,10 +91,9 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
{
- return (tbl[ip_vs_sh_hashkey(af, addr)]).dest;
+ return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
}
@@ -98,27 +101,32 @@ ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
int d_count;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
i, IP_VS_DBG_ADDR(svc->af, &dest->addr),
@@ -140,16 +148,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
{
int i;
struct ip_vs_sh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -158,51 +168,46 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl;
+ struct ip_vs_sh_state *s;
/* allocate the SH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
- GFP_KERNEL);
- if (tbl == NULL)
+ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- svc->sched_data = tbl;
+ svc->sched_data = s;
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_sh_reassign(s, svc);
return 0;
}
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
+ struct ip_vs_sh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ ip_vs_sh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ ip_vs_sh_reassign(s, svc);
return 0;
}
@@ -225,15 +230,15 @@ static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
- struct ip_vs_sh_bucket *tbl;
+ struct ip_vs_sh_state *s;
struct ip_vs_iphdr iph;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
- tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
- dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
+ s = (struct ip_vs_sh_state *) svc->sched_data;
+ dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
@@ -262,7 +267,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
- .update_service = ip_vs_sh_update_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
@@ -276,6 +283,7 @@ static int __init ip_vs_sh_init(void)
static void __exit ip_vs_sh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 44fd10c539ac..8e57077e5540 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -531,9 +531,9 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
return;
- spin_lock(&ipvs->sync_buff_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return;
}
@@ -552,7 +552,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
if (!buff) {
buff = ip_vs_sync_buff_create_v0(ipvs);
if (!buff) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -590,7 +590,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
sb_queue_tail(ipvs, ms);
ms->sync_buff = NULL;
}
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
cp = cp->control;
@@ -641,9 +641,9 @@ sloop:
pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
}
- spin_lock(&ipvs->sync_buff_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return;
}
@@ -683,7 +683,7 @@ sloop:
if (!buff) {
buff = ip_vs_sync_buff_create(ipvs);
if (!buff) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -750,7 +750,7 @@ sloop:
}
}
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
control:
/* synchronize its controller if it has */
@@ -843,7 +843,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
kfree(param->pe_data);
dest = cp->dest;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
!(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
if (flags & IP_VS_CONN_F_INACTIVE) {
@@ -857,24 +857,21 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
cp->flags = flags;
- spin_unlock(&cp->lock);
- if (!dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- }
+ spin_unlock_bh(&cp->lock);
+ if (!dest)
+ ip_vs_try_bind_dest(cp);
} else {
/*
* Find the appropriate destination for the connection.
* If it is not found the connection will remain unbound
* but still handled.
*/
+ rcu_read_lock();
dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
param->vport, protocol, fwmark, flags);
cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
- if (dest)
- atomic_dec(&dest->refcnt);
+ rcu_read_unlock();
if (!cp) {
if (param->pe_data)
kfree(param->pe_data);
@@ -1692,11 +1689,7 @@ static int sync_thread_backup(void *data)
break;
}
- /* disable bottom half, because it accesses the data
- shared by softirq while getting/creating conns */
- local_bh_disable();
ip_vs_process_message(tinfo->net, tinfo->buf, len);
- local_bh_enable();
}
}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bc1bfc48a17f..c60a81c4ce9a 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -51,7 +51,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -66,7 +66,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
@@ -106,6 +106,7 @@ static int __init ip_vs_wlc_init(void)
static void __exit ip_vs_wlc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wlc_init);
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 231be7dd547a..0e68555bceb9 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -29,14 +29,45 @@
#include <net/ip_vs.h>
+/* The WRR algorithm depends on some caclulations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, it is possible some dest weight to be reduced
+ * below di, bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
/*
* current destination pointer for weighted round-robin scheduling
*/
struct ip_vs_wrr_mark {
- struct list_head *cl; /* current list head */
+ struct ip_vs_dest *cl; /* current dest or head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
+ struct rcu_head rcu_head;
};
@@ -88,36 +119,41 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
if (mark == NULL)
return -ENOMEM;
- mark->cl = &svc->destinations;
- mark->cw = 0;
- mark->mw = ip_vs_wrr_max_weight(svc);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ mark->cw = mark->mw;
svc->sched_data = mark;
return 0;
}
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
/*
* Release the mark variable
*/
- kfree(svc->sched_data);
-
- return 0;
+ kfree_rcu(mark, rcu_head);
}
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
struct ip_vs_wrr_mark *mark = svc->sched_data;
- mark->cl = &svc->destinations;
- mark->mw = ip_vs_wrr_max_weight(svc);
+ spin_lock_bh(&svc->sched_lock);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
- if (mark->cw > mark->mw)
- mark->cw = 0;
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ if (mark->cw > mark->mw || !mark->cw)
+ mark->cw = mark->mw;
+ else if (mark->di > 1)
+ mark->cw = (mark->cw / mark->di) * mark->di + 1;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -128,80 +164,79 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
- struct ip_vs_dest *dest;
+ struct ip_vs_dest *dest, *last, *stop = NULL;
struct ip_vs_wrr_mark *mark = svc->sched_data;
- struct list_head *p;
+ bool last_pass = false, restarted = false;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- /*
- * This loop will always terminate, because mark->cw in (0, max_weight]
- * and at least one server has its weight equal to max_weight.
- */
- write_lock(&svc->sched_lock);
- p = mark->cl;
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
while (1) {
- if (mark->cl == &svc->destinations) {
- /* it is at the head of the destination list */
-
- if (mark->cl == mark->cl->next) {
- /* no dest entry */
- ip_vs_scheduler_err(svc,
- "no destination available: "
- "no destinations present");
- dest = NULL;
- goto out;
- }
-
- mark->cl = svc->destinations.next;
- mark->cw -= mark->di;
- if (mark->cw <= 0) {
- mark->cw = mark->mw;
- /*
- * Still zero, which means no available servers.
- */
- if (mark->cw == 0) {
- mark->cl = &svc->destinations;
- ip_vs_scheduler_err(svc,
- "no destination available");
- dest = NULL;
- goto out;
- }
- }
- } else
- mark->cl = mark->cl->next;
-
- if (mark->cl != &svc->destinations) {
- /* not at the head of the list */
- dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) >= mark->cw) {
- /* got it */
- break;
- }
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ if (dest == stop)
+ goto err_over;
}
-
- if (mark->cl == p && mark->cw == mark->di) {
- /* back to the start, and no dest is found.
- It is only possible when all dests are OVERLOADED */
- dest = NULL;
- ip_vs_scheduler_err(svc,
- "no destination available: "
- "all destinations are overloaded");
- goto out;
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
}
}
+found:
IP_VS_DBG_BUF(6, "WRR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
+ mark->cl = dest;
out:
- write_unlock(&svc->sched_lock);
+ spin_unlock_bh(&svc->sched_lock);
return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
}
@@ -212,7 +247,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
.n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
.init_service = ip_vs_wrr_init_svc,
.done_service = ip_vs_wrr_done_svc,
- .update_service = ip_vs_wrr_update_svc,
+ .add_dest = ip_vs_wrr_dest_changed,
+ .del_dest = ip_vs_wrr_dest_changed,
+ .upd_dest = ip_vs_wrr_dest_changed,
.schedule = ip_vs_wrr_schedule,
};
@@ -224,6 +261,7 @@ static int __init ip_vs_wrr_init(void)
static void __exit ip_vs_wrr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wrr_init);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index ee6b7a9f1ec2..b75ff6429a04 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -17,6 +17,8 @@
* - not all connections have destination server, for example,
* connections in backup server when fwmark is used
* - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
* LOCAL_OUT rules:
* - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
* - skb->pkt_type is not set yet
@@ -51,39 +53,54 @@ enum {
*/
IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
};
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
/*
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
- u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
{
- struct dst_entry *old_dst;
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
- old_dst = dest->dst_cache;
- dest->dst_cache = dst;
- dest->dst_rtos = rtos;
- dest->dst_cookie = dst_cookie;
- dst_release(old_dst);
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
{
- struct dst_entry *dst = dest->dst_cache;
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
- if (!dst)
+ if (!dest_dst)
return NULL;
- if ((dst->obsolete || rtos != dest->dst_rtos) &&
- dst->ops->check(dst, dest->dst_cookie) == NULL) {
- dest->dst_cache = NULL;
- dst_release(dst);
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
- }
- dst_hold(dst);
- return dst;
+ return dest_dst;
}
static inline bool
@@ -104,7 +121,7 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
/* Get route to daddr, update *saddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
- u32 rtos, int rt_mode, __be32 *saddr)
+ int rt_mode, __be32 *saddr)
{
struct flowi4 fl4;
struct rtable *rt;
@@ -113,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
- fl4.flowi4_tos = rtos;
fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
FLOWI_FLAG_KNOWN_NH : 0;
@@ -124,7 +140,7 @@ retry:
if (PTR_ERR(rt) == -EINVAL && *saddr &&
rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
*saddr = 0;
- flowi4_update_output(&fl4, 0, rtos, daddr, 0);
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
goto retry;
}
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
@@ -132,7 +148,7 @@ retry:
} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
ip_rt_put(rt);
*saddr = fl4.saddr;
- flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
loop++;
goto retry;
}
@@ -141,113 +157,140 @@ retry:
}
/* Get route to destination or remote server */
-static struct rtable *
+static int
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
- __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr)
+ __be32 daddr, int rt_mode, __be32 *ret_saddr)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
struct rtable *ort; /* Original route */
- int local;
+ struct iphdr *iph;
+ __be16 df;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos))) {
- rt = do_output_route4(net, dest->addr.ip, rtos,
- rt_mode, &dest->dst_saddr.ip);
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
if (!rt) {
- spin_unlock(&dest->dst_lock);
- return NULL;
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
- IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
- "rtos=%X\n",
- &dest->addr.ip, &dest->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt), rtos);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
}
daddr = dest->addr.ip;
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.ip;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.ip;
} else {
__be32 saddr = htonl(INADDR_ANY);
+ noref = 0;
+
/* For such unconfigured boxes avoid many route lookups
* for performance reasons because we do not remember saddr
*/
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
- rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
if (!rt)
- return NULL;
+ goto err_unreach;
if (ret_saddr)
*ret_saddr = saddr;
}
- local = rt->rt_flags & RTCF_LOCAL;
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
"local":"non-local", &daddr);
- ip_rt_put(rt);
- return NULL;
- }
- if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
- !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
- IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
- "requires NAT method, dest: %pI4\n",
- &ip_hdr(skb)->daddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ goto err_put;
}
- if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
- "to non-local address, dest: %pI4\n",
- &ip_hdr(skb)->saddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ iph = ip_hdr(skb);
+ if (likely(!local)) {
+ if (unlikely(ipv4_is_loopback(iph->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI4 to non-local address, dest: %pI4\n",
+ &iph->saddr, &daddr);
+ goto err_put;
+ }
+ } else {
+ ort = skb_rtable(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !(ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
+ "local requires NAT method, dest: %pI4\n",
+ &iph->daddr, &daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
}
- return rt;
-}
-
-/* Reroute packet to local IPv4 stack after DNAT */
-static int
-__ip_vs_reroute_locally(struct sk_buff *skb)
-{
- struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->dst.dev;
- struct net *net = dev_net(dev);
- struct iphdr *iph = ip_hdr(skb);
-
- if (rt_is_input_route(rt)) {
- unsigned long orefdst = skb->_skb_refdst;
-
- if (ip_route_input(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev))
- return 0;
- refdst_drop(orefdst);
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ df = iph->frag_off & htons(IP_DF);
} else {
- struct flowi4 fl4 = {
- .daddr = iph->daddr,
- .saddr = iph->saddr,
- .flowi4_tos = RT_TOS(iph->tos),
- .flowi4_mark = skb->mark,
- };
-
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt))
- return 0;
- if (!(rt->rt_flags & RTCF_LOCAL)) {
- ip_rt_put(rt);
- return 0;
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
}
- /* Drop old route. */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ort = skb_rtable(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ /* MTU check allowed? */
+ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0;
}
- return 1;
+
+ /* MTU checking */
+ if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -294,44 +337,57 @@ out_err:
/*
* Get route to destination or remote server
*/
-static struct rt6_info *
+static int
__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
- int do_xfrm, int rt_mode)
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct rt6_info *ort; /* Original route */
struct dst_entry *dst;
- int local;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
- if (!rt) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
u32 cookie;
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
- &dest->dst_saddr.in6,
+ &dest_dst->dst_saddr.in6,
do_xfrm);
if (!dst) {
- spin_unlock(&dest->dst_lock);
- return NULL;
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
rt = (struct rt6_info *) dst;
cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest->dst_saddr.in6,
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
atomic_read(&rt->dst.__refcnt));
}
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.in6;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.in6;
} else {
+ noref = 0;
dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
if (!dst)
- return NULL;
+ goto err_unreach;
rt = (struct rt6_info *) dst;
}
@@ -340,86 +396,137 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
local ? "local":"non-local", daddr);
- dst_release(&rt->dst);
- return NULL;
+ goto err_put;
}
- if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
- !((ort = (struct rt6_info *) skb_dst(skb)) &&
- __ip_vs_is_local_route6(ort))) {
- IP_VS_DBG_RL("Redirect from non-local address %pI6c to local "
- "requires NAT method, dest: %pI6c\n",
- &ipv6_hdr(skb)->daddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+ if (likely(!local)) {
+ if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI6c to non-local address, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ goto err_put;
+ }
+ } else {
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !__ip_vs_is_local_route6(ort)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6c "
+ "to local requires NAT method, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
}
- if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
- IPV6_ADDR_LOOPBACK)) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c "
- "to non-local address, dest: %pI6c\n",
- &ipv6_hdr(skb)->saddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
- return rt;
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#endif
-/*
- * Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
{
- struct dst_entry *old_dst;
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
- dest->dst_saddr.ip = 0;
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
}
-#define IP_VS_XMIT_TUNNEL(skb, cp) \
-({ \
- int __ret = NF_ACCEPT; \
- \
- (skb)->ipvs_property = 1; \
- if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
- __ret = ip_vs_confirm_conntrack(skb); \
- if (__ret == NF_ACCEPT) { \
- nf_reset(skb); \
- skb_forward_csum(skb); \
- } \
- __ret; \
-})
-
-#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- else \
- ip_vs_update_conntrack(skb, cp, 1); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
-
-#define IP_VS_XMIT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
/*
@@ -430,7 +537,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
/* we do not touch skb and do not need pskb ptr */
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -443,52 +550,29 @@ int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
struct iphdr *iph = ip_hdr(skb);
- int mtu;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
- IP_VS_RT_MODE_NON_LOCAL, NULL)))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+ NULL) < 0)
goto tx_error;
- }
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(ip_hdr(skb));
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ip_send_check(iph);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -496,60 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- int mtu;
-
EnterFunction(10);
- rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0,
- IP_VS_RT_MODE_NON_LOCAL);
- if (!rt)
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
+ ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -564,29 +615,30 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
- struct iphdr *iph = ip_hdr(skb);
- int local;
+ int local, rc, was_input;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR, NULL)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -600,57 +652,31 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(cp->daddr.ip) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
- "ip_vs_nat_xmit(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct iphdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
- goto tx_error_put;
+ goto tx_error;
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- if (!local) {
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
@@ -660,49 +686,48 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
- int local;
+ int local, rc;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph->fragoffs)) {
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, iph->len, sizeof(_pt), &_pt);
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR))))
- goto tx_error_icmp;
- local = __ip_vs_is_local_route6(rt);
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -716,7 +741,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -727,46 +752,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
- "ip_vs_nat_xmit_v6(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, iph))
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
ipv6_hdr(skb)->daddr = cp->daddr.in6;
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
-
IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
@@ -776,20 +776,17 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
LeaveFunction(10);
kfree_skb(skb);
+ rcu_read_unlock();
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -826,56 +823,40 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
__be16 df;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_CONNECT,
- &saddr)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
+ rt = skb_rtable(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
- if (mtu < 68) {
- IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
- goto tx_error_put;
- }
- if (rt_is_output_route(skb_rtable(skb)))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
/* Copy DF, reset fragment offset and MF */
df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
- if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
-
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
+
+ if (!new_skb)
+ goto tx_error;
consume_skb(skb);
skb = new_skb;
old_iph = ip_hdr(skb);
@@ -890,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -911,25 +888,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -943,60 +917,37 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *old_iph = ipv6_hdr(skb);
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
- &saddr, 1, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
+ rt = (struct rt6_info *) skb_dst(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
- if (mtu < IPV6_MIN_MTU) {
- IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
- IPV6_MIN_MTU);
- goto tx_error_put;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
- /* MTU checking: Notice that 'mtu' have been adjusted before hand */
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!ipvsh->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
-
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- dst_release(&rt->dst);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
+
+ if (!new_skb)
+ goto tx_error;
consume_skb(skb);
skb = new_skb;
old_iph = ipv6_hdr(skb);
@@ -1008,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -1029,25 +976,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip6_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -1060,59 +1004,36 @@ int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
- struct iphdr *iph = ip_hdr(skb);
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_KNOWN_NH, NULL)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
ip_send_check(ip_hdr(skb));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1120,64 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1194,10 +1087,9 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_iphdr *iph)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
- int rt_mode;
+ int rt_mode, was_input;
EnterFunction(10);
@@ -1217,16 +1109,17 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/*
* mangle and send the packet here (only for VS/NAT)
*/
+ was_input = rt_is_input_route(skb_rtable(skb));
/* LOCALNODE from FORWARD hook is not supported */
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos),
- rt_mode, NULL)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
@@ -1241,82 +1134,51 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(cp->daddr.ip) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp(skb, pp, cp, 0);
- if (!local) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
goto out;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
- struct ip_vs_iphdr *iph)
+ struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
int rt_mode;
@@ -1328,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp, iph);
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
@@ -1344,11 +1206,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, rt_mode)))
- goto tx_error_icmp;
-
- local = __ip_vs_is_local_route6(rt);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -1362,7 +1225,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -1373,60 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp_v6(skb, pp, cp, 0);
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
goto out;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c8e001a9c45b..007e8c43d19a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -48,6 +48,7 @@
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -1364,30 +1365,48 @@ void nf_conntrack_cleanup_end(void)
*/
void nf_conntrack_cleanup_net(struct net *net)
{
+ LIST_HEAD(single);
+
+ list_add(&net->exit_list, &single);
+ nf_conntrack_cleanup_net_list(&single);
+}
+
+void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
+{
+ int busy;
+ struct net *net;
+
/*
* This makes sure all current packets have passed through
* netfilter framework. Roll on, two-stage module
* delete...
*/
synchronize_net();
- i_see_dead_people:
- nf_ct_iterate_cleanup(net, kill_all, NULL);
- nf_ct_release_dying_list(net);
- if (atomic_read(&net->ct.count) != 0) {
+i_see_dead_people:
+ busy = 0;
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_iterate_cleanup(net, kill_all, NULL);
+ nf_ct_release_dying_list(net);
+ if (atomic_read(&net->ct.count) != 0)
+ busy = 1;
+ }
+ if (busy) {
schedule();
goto i_see_dead_people;
}
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
- nf_conntrack_proto_pernet_fini(net);
- nf_conntrack_helper_pernet_fini(net);
- nf_conntrack_ecache_pernet_fini(net);
- nf_conntrack_tstamp_pernet_fini(net);
- nf_conntrack_acct_pernet_fini(net);
- nf_conntrack_expect_pernet_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
- free_percpu(net->ct.stat);
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+ nf_conntrack_proto_pernet_fini(net);
+ nf_conntrack_helper_pernet_fini(net);
+ nf_conntrack_ecache_pernet_fini(net);
+ nf_conntrack_tstamp_pernet_fini(net);
+ nf_conntrack_acct_pernet_fini(net);
+ nf_conntrack_expect_pernet_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
+ free_percpu(net->ct.stat);
+ }
}
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a9740bd6fe54..a0b1c5c23d1c 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -339,6 +339,13 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
{
const struct nf_conn_help *help;
const struct nf_conntrack_helper *helper;
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
/* Called from the helper function, this call never fails */
help = nfct_help(ct);
@@ -346,8 +353,10 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
/* rcu_read_lock()ed by nf_hook_slow */
helper = rcu_dereference(help->helper);
- nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
- "nf_ct_%s: dropping packet: %s ", helper->name, fmt);
+ nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
+ "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
+
+ va_end(args);
}
EXPORT_SYMBOL_GPL(nf_ct_helper_log);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9904b15f600e..6d0f8a17c5b7 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2409,6 +2409,92 @@ out:
return skb->len;
}
+static int
+ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nf_conn *ct = cb->data;
+ struct nf_conn_help *help = nfct_help(ct);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+
+ if (cb->args[0])
+ return 0;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_expect *)cb->args[1];
+restart:
+ hlist_for_each_entry(exp, &help->expectations, lnode) {
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ cb->args[0] = 1;
+out:
+ rcu_read_unlock();
+ if (last)
+ nf_ct_expect_put(last);
+
+ return skb->len;
+}
+
+static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ int err;
+ struct net *net = sock_net(ctnl);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ u16 zone = 0;
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_ct_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+ }
+
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return -ENOENT;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ c.data = ct;
+
+ err = netlink_dump_start(ctnl, skb, nlh, &c);
+ nf_ct_put(ct);
+
+ return err;
+}
+
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
[CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
[CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
@@ -2439,11 +2525,15 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .dump = ctnetlink_exp_dump_table,
- .done = ctnetlink_exp_done,
- };
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ if (cda[CTA_EXPECT_MASTER])
+ return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
+ else {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
}
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 432f95780003..a99b6c3427b0 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -456,7 +456,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
out_invalid:
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
+ nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+ NULL, msg);
return false;
}
@@ -542,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_dccp: invalid packet ignored ");
return NF_ACCEPT;
case CT_DCCP_INVALID:
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_dccp: invalid state transition ");
return -NF_ACCEPT;
}
@@ -613,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
out_invalid:
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg);
return -NF_ACCEPT;
}
@@ -969,6 +970,10 @@ static int __init nf_conntrack_proto_dccp_init(void)
{
int ret;
+ ret = register_pernet_subsys(&dccp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
ret = nf_ct_l4proto_register(&dccp_proto4);
if (ret < 0)
goto out_dccp4;
@@ -977,16 +982,12 @@ static int __init nf_conntrack_proto_dccp_init(void)
if (ret < 0)
goto out_dccp6;
- ret = register_pernet_subsys(&dccp_net_ops);
- if (ret < 0)
- goto out_pernet;
-
return 0;
-out_pernet:
- nf_ct_l4proto_unregister(&dccp_proto6);
out_dccp6:
nf_ct_l4proto_unregister(&dccp_proto4);
out_dccp4:
+ unregister_pernet_subsys(&dccp_net_ops);
+out_pernet:
return ret;
}
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index bd7d01d9c7e7..155ce9f8a0db 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -420,18 +420,18 @@ static int __init nf_ct_proto_gre_init(void)
{
int ret;
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
- if (ret < 0)
- goto out_gre4;
-
ret = register_pernet_subsys(&proto_gre_net_ops);
if (ret < 0)
goto out_pernet;
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
+ if (ret < 0)
+ goto out_gre4;
+
return 0;
-out_pernet:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
out_gre4:
+ unregister_pernet_subsys(&proto_gre_net_ops);
+out_pernet:
return ret;
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 480f616d5936..ec83536def9a 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -888,6 +888,10 @@ static int __init nf_conntrack_proto_sctp_init(void)
{
int ret;
+ ret = register_pernet_subsys(&sctp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
if (ret < 0)
goto out_sctp4;
@@ -896,16 +900,12 @@ static int __init nf_conntrack_proto_sctp_init(void)
if (ret < 0)
goto out_sctp6;
- ret = register_pernet_subsys(&sctp_net_ops);
- if (ret < 0)
- goto out_pernet;
-
return 0;
-out_pernet:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
out_sctp6:
nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
out_sctp4:
+ unregister_pernet_subsys(&sctp_net_ops);
+out_pernet:
return ret;
}
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 83876e9877f1..f021a2076c87 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -720,7 +720,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
tn->tcp_be_liberal)
res = true;
if (!res && LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -772,7 +772,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: short packet ");
return -NF_ACCEPT;
}
@@ -780,7 +780,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -793,7 +793,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
@@ -802,7 +802,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -949,7 +949,7 @@ static int tcp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid packet ignored in "
"state %s ", tcp_conntrack_names[old_state]);
return NF_ACCEPT;
@@ -959,7 +959,7 @@ static int tcp_packet(struct nf_conn *ct,
dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_CLOSE:
@@ -969,8 +969,8 @@ static int tcp_packet(struct nf_conn *ct,
/* Invalid RST */
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_tcp: invalid RST ");
+ nf_log_packet(net, pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid RST ");
return -NF_ACCEPT;
}
if (index == TCP_RST_SET
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 59623cc56e8d..fee43228e115 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -119,7 +119,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: short packet ");
return -NF_ACCEPT;
}
@@ -127,7 +127,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -143,7 +143,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 157489581c31..2750e6c69f82 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -131,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: short packet ");
return -NF_ACCEPT;
}
@@ -141,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
cscov = udplen;
else if (cscov < sizeof(*hdr) || cscov > udplen) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: invalid checksum coverage ");
return -NF_ACCEPT;
}
@@ -149,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
/* UDPLITE mandates checksums */
if (!hdr->check) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: checksum missing ");
return -NF_ACCEPT;
}
@@ -159,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
pf)) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: bad UDPLite checksum ");
return -NF_ACCEPT;
}
@@ -371,6 +371,10 @@ static int __init nf_conntrack_proto_udplite_init(void)
{
int ret;
+ ret = register_pernet_subsys(&udplite_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
if (ret < 0)
goto out_udplite4;
@@ -379,16 +383,12 @@ static int __init nf_conntrack_proto_udplite_init(void)
if (ret < 0)
goto out_udplite6;
- ret = register_pernet_subsys(&udplite_net_ops);
- if (ret < 0)
- goto out_pernet;
-
return 0;
-out_pernet:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
out_udplite6:
nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
out_udplite4:
+ unregister_pernet_subsys(&udplite_net_ops);
+out_pernet:
return ret;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 6bcce401fd1c..ebb67d33bd63 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -545,16 +545,20 @@ out_init:
return ret;
}
-static void nf_conntrack_pernet_exit(struct net *net)
+static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
{
- nf_conntrack_standalone_fini_sysctl(net);
- nf_conntrack_standalone_fini_proc(net);
- nf_conntrack_cleanup_net(net);
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_conntrack_standalone_fini_sysctl(net);
+ nf_conntrack_standalone_fini_proc(net);
+ }
+ nf_conntrack_cleanup_net_list(net_exit_list);
}
static struct pernet_operations nf_conntrack_net_ops = {
- .init = nf_conntrack_pernet_init,
- .exit = nf_conntrack_pernet_exit,
+ .init = nf_conntrack_pernet_init,
+ .exit_batch = nf_conntrack_pernet_exit,
};
static int __init nf_conntrack_standalone_init(void)
@@ -568,6 +572,7 @@ static int __init nf_conntrack_standalone_init(void)
register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
if (!nf_ct_netfilter_header) {
pr_err("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
goto out_sysctl;
}
#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 9e312695c818..388656d5a9ec 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,6 @@
#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
-static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
@@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
return NULL;
}
+void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+{
+ const struct nf_logger *log;
+
+ if (pf == NFPROTO_UNSPEC)
+ return;
+
+ mutex_lock(&nf_log_mutex);
+ log = rcu_dereference_protected(net->nf.nf_loggers[pf],
+ lockdep_is_held(&nf_log_mutex));
+ if (log == NULL)
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_set);
+
+void nf_log_unset(struct net *net, const struct nf_logger *logger)
+{
+ int i;
+ const struct nf_logger *log;
+
+ mutex_lock(&nf_log_mutex);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ log = rcu_dereference_protected(net->nf.nf_loggers[i],
+ lockdep_is_held(&nf_log_mutex));
+ if (log == logger)
+ RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL);
+ }
+ mutex_unlock(&nf_log_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unset);
+
/* return EEXIST if the same logger is registered, 0 on success. */
int nf_log_register(u_int8_t pf, struct nf_logger *logger)
{
- const struct nf_logger *llog;
int i;
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(logger->list); i++)
@@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
} else {
/* register at end of list to honor first register win */
list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
- llog = rcu_dereference_protected(nf_loggers[pf],
- lockdep_is_held(&nf_log_mutex));
- if (llog == NULL)
- rcu_assign_pointer(nf_loggers[pf], logger);
}
mutex_unlock(&nf_log_mutex);
@@ -66,49 +94,43 @@ EXPORT_SYMBOL(nf_log_register);
void nf_log_unregister(struct nf_logger *logger)
{
- const struct nf_logger *c_logger;
int i;
mutex_lock(&nf_log_mutex);
- for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
- c_logger = rcu_dereference_protected(nf_loggers[i],
- lockdep_is_held(&nf_log_mutex));
- if (c_logger == logger)
- RCU_INIT_POINTER(nf_loggers[i], NULL);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
list_del(&logger->list[i]);
- }
mutex_unlock(&nf_log_mutex);
-
- synchronize_rcu();
}
EXPORT_SYMBOL(nf_log_unregister);
-int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+ const struct nf_logger *logger)
{
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
return -EINVAL;
mutex_lock(&nf_log_mutex);
if (__find_logger(pf, logger->name) == NULL) {
mutex_unlock(&nf_log_mutex);
return -ENOENT;
}
- rcu_assign_pointer(nf_loggers[pf], logger);
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
mutex_unlock(&nf_log_mutex);
return 0;
}
EXPORT_SYMBOL(nf_log_bind_pf);
-void nf_log_unbind_pf(u_int8_t pf)
+void nf_log_unbind_pf(struct net *net, u_int8_t pf)
{
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
return;
mutex_lock(&nf_log_mutex);
- RCU_INIT_POINTER(nf_loggers[pf], NULL);
+ RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
mutex_unlock(&nf_log_mutex);
}
EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_log_packet(u_int8_t pf,
+void nf_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -121,7 +143,7 @@ void nf_log_packet(u_int8_t pf,
const struct nf_logger *logger;
rcu_read_lock();
- logger = rcu_dereference(nf_loggers[pf]);
+ logger = rcu_dereference(net->nf.nf_loggers[pf]);
if (logger) {
va_start(args, fmt);
vsnprintf(prefix, sizeof(prefix), fmt, args);
@@ -135,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet);
#ifdef CONFIG_PROC_FS
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
+
mutex_lock(&nf_log_mutex);
- if (*pos >= ARRAY_SIZE(nf_loggers))
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -145,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- if (*pos >= ARRAY_SIZE(nf_loggers))
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -164,8 +190,9 @@ static int seq_show(struct seq_file *s, void *v)
const struct nf_logger *logger;
struct nf_logger *t;
int ret;
+ struct net *net = seq_file_net(s);
- logger = rcu_dereference_protected(nf_loggers[*pos],
+ logger = rcu_dereference_protected(net->nf.nf_loggers[*pos],
lockdep_is_held(&nf_log_mutex));
if (!logger)
@@ -199,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = {
static int nflog_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &nflog_seq_ops);
+ return seq_open_net(inode, file, &nflog_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations nflog_file_ops = {
@@ -207,7 +235,7 @@ static const struct file_operations nflog_file_ops = {
.open = nflog_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
@@ -216,7 +244,6 @@ static const struct file_operations nflog_file_ops = {
#ifdef CONFIG_SYSCTL
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
-static struct ctl_table_header *nf_log_dir_header;
static int nf_log_proc_dostring(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -226,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
size_t size = *lenp;
int r = 0;
int tindex = (unsigned long)table->extra1;
+ struct net *net = current->nsproxy->net_ns;
if (write) {
if (size > sizeof(buf))
@@ -234,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
return -EFAULT;
if (!strcmp(buf, "NONE")) {
- nf_log_unbind_pf(tindex);
+ nf_log_unbind_pf(net, tindex);
return 0;
}
mutex_lock(&nf_log_mutex);
@@ -243,11 +271,11 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
mutex_unlock(&nf_log_mutex);
return -ENOENT;
}
- rcu_assign_pointer(nf_loggers[tindex], logger);
+ rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
mutex_unlock(&nf_log_mutex);
} else {
mutex_lock(&nf_log_mutex);
- logger = rcu_dereference_protected(nf_loggers[tindex],
+ logger = rcu_dereference_protected(net->nf.nf_loggers[tindex],
lockdep_is_held(&nf_log_mutex));
if (!logger)
table->data = "NONE";
@@ -260,49 +288,111 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
return r;
}
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
{
int i;
-
- for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
- snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i);
- nf_log_sysctl_table[i].procname =
- nf_log_sysctl_fnames[i-NFPROTO_UNSPEC];
- nf_log_sysctl_table[i].data = NULL;
- nf_log_sysctl_table[i].maxlen =
- NFLOGGER_NAME_LEN * sizeof(char);
- nf_log_sysctl_table[i].mode = 0644;
- nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring;
- nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i;
+ struct ctl_table *table;
+
+ table = nf_log_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_log_sysctl_table,
+ sizeof(nf_log_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ } else {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+ snprintf(nf_log_sysctl_fnames[i],
+ 3, "%d", i);
+ nf_log_sysctl_table[i].procname =
+ nf_log_sysctl_fnames[i];
+ nf_log_sysctl_table[i].data = NULL;
+ nf_log_sysctl_table[i].maxlen =
+ NFLOGGER_NAME_LEN * sizeof(char);
+ nf_log_sysctl_table[i].mode = 0644;
+ nf_log_sysctl_table[i].proc_handler =
+ nf_log_proc_dostring;
+ nf_log_sysctl_table[i].extra1 =
+ (void *)(unsigned long) i;
+ }
}
- nf_log_dir_header = register_net_sysctl(&init_net, "net/netfilter/nf_log",
- nf_log_sysctl_table);
- if (!nf_log_dir_header)
- return -ENOMEM;
+ net->nf.nf_log_dir_header = register_net_sysctl(net,
+ "net/netfilter/nf_log",
+ table);
+ if (!net->nf.nf_log_dir_header)
+ goto err_reg;
return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->nf.nf_log_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_log_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
}
#else
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
{
return 0;
}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+}
#endif /* CONFIG_SYSCTL */
-int __init netfilter_log_init(void)
+static int __net_init nf_log_net_init(struct net *net)
{
- int i, r;
+ int ret = -ENOMEM;
+
#ifdef CONFIG_PROC_FS
if (!proc_create("nf_log", S_IRUGO,
- proc_net_netfilter, &nflog_file_ops))
- return -1;
+ net->nf.proc_netfilter, &nflog_file_ops))
+ return ret;
#endif
+ ret = netfilter_log_sysctl_init(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
- /* Errors will trigger panic, unroll on error is unnecessary. */
- r = netfilter_log_sysctl_init();
- if (r < 0)
- return r;
+out_sysctl:
+ /* For init_net: errors will trigger panic, don't unroll on error. */
+ if (!net_eq(net, &init_net))
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+
+ return ret;
+}
+
+static void __net_exit nf_log_net_exit(struct net *net)
+{
+ netfilter_log_sysctl_exit(net);
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+}
+
+static struct pernet_operations nf_log_net_ops = {
+ .init = nf_log_net_init,
+ .exit = nf_log_net_exit,
+};
+
+int __init netfilter_log_init(void)
+{
+ int i, ret;
+
+ ret = register_pernet_subsys(&nf_log_net_ops);
+ if (ret < 0)
+ return ret;
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
INIT_LIST_HEAD(&(nf_loggers_l[i]));
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index d578ec251712..bc4c499adb13 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -24,10 +24,9 @@
#include <linux/skbuff.h>
#include <asm/uaccess.h>
#include <net/sock.h>
-#include <net/netlink.h>
#include <linux/init.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
@@ -62,11 +61,6 @@ void nfnl_unlock(__u8 subsys_id)
}
EXPORT_SYMBOL_GPL(nfnl_unlock);
-static struct mutex *nfnl_get_lock(__u8 subsys_id)
-{
- return &table[subsys_id].mutex;
-}
-
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
{
nfnl_lock(n->subsys_id);
@@ -149,7 +143,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EPERM;
/* All the messages must at least contain nfgenmsg */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg)))
+ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
return 0;
type = nlh->nlmsg_type;
@@ -177,7 +171,7 @@ replay:
}
{
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
@@ -199,7 +193,7 @@ replay:
rcu_read_unlock();
nfnl_lock(subsys_id);
if (rcu_dereference_protected(table[subsys_id].subsys,
- lockdep_is_held(nfnl_get_lock(subsys_id))) != ss ||
+ lockdep_is_held(&table[subsys_id].mutex)) != ss ||
nfnetlink_find_client(type, ss) != nc)
err = -EAGAIN;
else if (nc->call)
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 589d686f0b4c..dc3fd5d44464 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -49,6 +49,8 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
+ if (strlen(acct_name) == 0)
+ return -EINVAL;
list_for_each_entry(nfacct, &nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index f248db572972..1a0be2af1dd8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -19,7 +19,7 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
#include <linux/spinlock.h>
@@ -32,6 +32,7 @@
#include <linux/slab.h>
#include <net/sock.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/netfilter/nfnetlink_log.h>
#include <linux/atomic.h>
@@ -56,6 +57,7 @@ struct nfulnl_instance {
unsigned int qlen; /* number of nlmsgs in skb */
struct sk_buff *skb; /* pre-allocatd skb */
struct timer_list timer;
+ struct net *net;
struct user_namespace *peer_user_ns; /* User namespace of the peer process */
int peer_portid; /* PORTID of the peer process */
@@ -71,25 +73,34 @@ struct nfulnl_instance {
struct rcu_head rcu;
};
-static DEFINE_SPINLOCK(instances_lock);
-static atomic_t global_seq;
-
#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS];
static unsigned int hash_init;
+static int nfnl_log_net_id __read_mostly;
+
+struct nfnl_log_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+ atomic_t global_seq;
+};
+
+static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_log_net_id);
+}
+
static inline u_int8_t instance_hashfn(u_int16_t group_num)
{
return ((group_num & 0xff) % INSTANCE_BUCKETS);
}
static struct nfulnl_instance *
-__instance_lookup(u_int16_t group_num)
+__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
{
struct hlist_head *head;
struct nfulnl_instance *inst;
- head = &instance_table[instance_hashfn(group_num)];
+ head = &log->instance_table[instance_hashfn(group_num)];
hlist_for_each_entry_rcu(inst, head, hlist) {
if (inst->group_num == group_num)
return inst;
@@ -104,12 +115,12 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(u_int16_t group_num)
+instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
{
struct nfulnl_instance *inst;
rcu_read_lock_bh();
- inst = __instance_lookup(group_num);
+ inst = __instance_lookup(log, group_num);
if (inst && !atomic_inc_not_zero(&inst->use))
inst = NULL;
rcu_read_unlock_bh();
@@ -119,7 +130,11 @@ instance_lookup_get(u_int16_t group_num)
static void nfulnl_instance_free_rcu(struct rcu_head *head)
{
- kfree(container_of(head, struct nfulnl_instance, rcu));
+ struct nfulnl_instance *inst =
+ container_of(head, struct nfulnl_instance, rcu);
+
+ put_net(inst->net);
+ kfree(inst);
module_put(THIS_MODULE);
}
@@ -133,13 +148,15 @@ instance_put(struct nfulnl_instance *inst)
static void nfulnl_timer(unsigned long data);
static struct nfulnl_instance *
-instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
+instance_create(struct net *net, u_int16_t group_num,
+ int portid, struct user_namespace *user_ns)
{
struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int err;
- spin_lock_bh(&instances_lock);
- if (__instance_lookup(group_num)) {
+ spin_lock_bh(&log->instances_lock);
+ if (__instance_lookup(log, group_num)) {
err = -EEXIST;
goto out_unlock;
}
@@ -163,6 +180,7 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+ inst->net = get_net(net);
inst->peer_user_ns = user_ns;
inst->peer_portid = portid;
inst->group_num = group_num;
@@ -174,14 +192,15 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
inst->copy_range = NFULNL_COPY_RANGE_MAX;
hlist_add_head_rcu(&inst->hlist,
- &instance_table[instance_hashfn(group_num)]);
+ &log->instance_table[instance_hashfn(group_num)]);
+
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
return inst;
out_unlock:
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
return ERR_PTR(err);
}
@@ -210,11 +229,12 @@ __instance_destroy(struct nfulnl_instance *inst)
}
static inline void
-instance_destroy(struct nfulnl_instance *inst)
+instance_destroy(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst)
{
- spin_lock_bh(&instances_lock);
+ spin_lock_bh(&log->instances_lock);
__instance_destroy(inst);
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
static int
@@ -336,7 +356,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
if (!nlh)
goto out;
}
- status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_portid,
+ status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
MSG_DONTWAIT);
inst->qlen = 0;
@@ -370,7 +390,8 @@ nfulnl_timer(unsigned long data)
/* This is an inline function, we don't really care about a long
* list of arguments */
static inline int
-__build_packet_message(struct nfulnl_instance *inst,
+__build_packet_message(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst,
const struct sk_buff *skb,
unsigned int data_len,
u_int8_t pf,
@@ -536,7 +557,7 @@ __build_packet_message(struct nfulnl_instance *inst,
/* global sequence number */
if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
- htonl(atomic_inc_return(&global_seq))))
+ htonl(atomic_inc_return(&log->global_seq))))
goto nla_put_failure;
if (data_len) {
@@ -592,13 +613,15 @@ nfulnl_log_packet(u_int8_t pf,
const struct nf_loginfo *li;
unsigned int qthreshold;
unsigned int plen;
+ struct net *net = dev_net(in ? in : out);
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(li->u.ulog.group);
+ inst = instance_lookup_get(log, li->u.ulog.group);
if (!inst)
return;
@@ -609,7 +632,7 @@ nfulnl_log_packet(u_int8_t pf,
/* FIXME: do we want to make the size calculation conditional based on
* what is actually present? way more branches and checks, but more
* memory efficient... */
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -680,7 +703,7 @@ nfulnl_log_packet(u_int8_t pf,
inst->qlen++;
- __build_packet_message(inst, skb, data_len, pf,
+ __build_packet_message(log, inst, skb, data_len, pf,
hooknum, in, out, prefix, plen);
if (inst->qlen >= qthreshold)
@@ -709,24 +732,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct netlink_notify *n = ptr;
+ struct nfnl_log_net *log = nfnl_log_pernet(n->net);
if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
int i;
/* destroy all instances for this portid */
- spin_lock_bh(&instances_lock);
+ spin_lock_bh(&log->instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct hlist_node *t2;
struct nfulnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &log->instance_table[i];
hlist_for_each_entry_safe(inst, t2, head, hlist) {
- if ((net_eq(n->net, &init_net)) &&
- (n->portid == inst->peer_portid))
+ if (n->portid == inst->peer_portid)
__instance_destroy(inst);
}
}
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
return NOTIFY_DONE;
}
@@ -767,6 +790,8 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
u_int16_t group_num = ntohs(nfmsg->res_id);
struct nfulnl_instance *inst;
struct nfulnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
if (nfula[NFULA_CFG_CMD]) {
@@ -776,14 +801,14 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
/* Commands without queue context */
switch (cmd->command) {
case NFULNL_CFG_CMD_PF_BIND:
- return nf_log_bind_pf(pf, &nfulnl_logger);
+ return nf_log_bind_pf(net, pf, &nfulnl_logger);
case NFULNL_CFG_CMD_PF_UNBIND:
- nf_log_unbind_pf(pf);
+ nf_log_unbind_pf(net, pf);
return 0;
}
}
- inst = instance_lookup_get(group_num);
+ inst = instance_lookup_get(log, group_num);
if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
ret = -EPERM;
goto out_put;
@@ -797,7 +822,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
}
- inst = instance_create(group_num,
+ inst = instance_create(net, group_num,
NETLINK_CB(skb).portid,
sk_user_ns(NETLINK_CB(skb).ssk));
if (IS_ERR(inst)) {
@@ -811,7 +836,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out;
}
- instance_destroy(inst);
+ instance_destroy(log, inst);
goto out_put;
default:
ret = -ENOTSUPP;
@@ -894,55 +919,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = {
#ifdef CONFIG_PROC_FS
struct iter_state {
+ struct seq_net_private p;
unsigned int bucket;
};
-static struct hlist_node *get_first(struct iter_state *st)
+static struct hlist_node *get_first(struct net *net, struct iter_state *st)
{
+ struct nfnl_log_net *log;
if (!st)
return NULL;
+ log = nfnl_log_pernet(net);
+
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+ struct hlist_head *head = &log->instance_table[st->bucket];
+
+ if (!hlist_empty(head))
+ return rcu_dereference_bh(hlist_first_rcu(head));
}
return NULL;
}
-static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
+static struct hlist_node *get_next(struct net *net, struct iter_state *st,
+ struct hlist_node *h)
{
h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
+ struct nfnl_log_net *log;
+ struct hlist_head *head;
+
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+ log = nfnl_log_pernet(net);
+ head = &log->instance_table[st->bucket];
+ h = rcu_dereference_bh(hlist_first_rcu(head));
}
return h;
}
-static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
+static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
+ loff_t pos)
{
struct hlist_node *head;
- head = get_first(st);
+ head = get_first(net, st);
if (head)
- while (pos && (head = get_next(st, head)))
+ while (pos && (head = get_next(net, st, head)))
pos--;
return pos ? NULL : head;
}
-static void *seq_start(struct seq_file *seq, loff_t *pos)
+static void *seq_start(struct seq_file *s, loff_t *pos)
__acquires(rcu_bh)
{
rcu_read_lock_bh();
- return get_idx(seq->private, *pos);
+ return get_idx(seq_file_net(s), s->private, *pos);
}
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
(*pos)++;
- return get_next(s->private, v);
+ return get_next(seq_file_net(s), s->private, v);
}
static void seq_stop(struct seq_file *s, void *v)
@@ -971,8 +1009,8 @@ static const struct seq_operations nful_seq_ops = {
static int nful_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &nful_seq_ops,
- sizeof(struct iter_state));
+ return seq_open_net(inode, file, &nful_seq_ops,
+ sizeof(struct iter_state));
}
static const struct file_operations nful_file_ops = {
@@ -980,17 +1018,43 @@ static const struct file_operations nful_file_ops = {
.open = nful_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
-static int __init nfnetlink_log_init(void)
+static int __net_init nfnl_log_net_init(struct net *net)
{
- int i, status = -ENOMEM;
+ unsigned int i;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
+ INIT_HLIST_HEAD(&log->instance_table[i]);
+ spin_lock_init(&log->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_log", 0440,
+ net->nf.proc_netfilter, &nful_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_log_net_exit(struct net *net)
+{
+ remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
+}
+
+static struct pernet_operations nfnl_log_net_ops = {
+ .init = nfnl_log_net_init,
+ .exit = nfnl_log_net_exit,
+ .id = &nfnl_log_net_id,
+ .size = sizeof(struct nfnl_log_net),
+};
+
+static int __init nfnetlink_log_init(void)
+{
+ int status = -ENOMEM;
/* it's not really all that important to have a random value, so
* we can do this from the init function, even if there hasn't
@@ -1000,29 +1064,25 @@ static int __init nfnetlink_log_init(void)
netlink_register_notifier(&nfulnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfulnl_subsys);
if (status < 0) {
- printk(KERN_ERR "log: failed to create netlink socket\n");
+ pr_err("log: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
if (status < 0) {
- printk(KERN_ERR "log: failed to register logger\n");
+ pr_err("log: failed to register logger\n");
goto cleanup_subsys;
}
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_log", 0440,
- proc_net_netfilter, &nful_file_ops)) {
- status = -ENOMEM;
+ status = register_pernet_subsys(&nfnl_log_net_ops);
+ if (status < 0) {
+ pr_err("log: failed to register pernet ops\n");
goto cleanup_logger;
}
-#endif
return status;
-#ifdef CONFIG_PROC_FS
cleanup_logger:
nf_log_unregister(&nfulnl_logger);
-#endif
cleanup_subsys:
nfnetlink_subsys_unregister(&nfulnl_subsys);
cleanup_netlink_notifier:
@@ -1032,10 +1092,8 @@ cleanup_netlink_notifier:
static void __exit nfnetlink_log_fini(void)
{
+ unregister_pernet_subsys(&nfnl_log_net_ops);
nf_log_unregister(&nfulnl_logger);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_log", proc_net_netfilter);
-#endif
nfnetlink_subsys_unregister(&nfulnl_subsys);
netlink_unregister_notifier(&nfulnl_rtnl_notifier);
}
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 858fd52c1040..5e280b3e154f 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -30,6 +30,7 @@
#include <linux/list.h>
#include <net/sock.h>
#include <net/netfilter/nf_queue.h>
+#include <net/netns/generic.h>
#include <net/netfilter/nfnetlink_queue.h>
#include <linux/atomic.h>
@@ -66,23 +67,31 @@ struct nfqnl_instance {
typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
-static DEFINE_SPINLOCK(instances_lock);
+static int nfnl_queue_net_id __read_mostly;
#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly;
+struct nfnl_queue_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+};
+
+static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_queue_net_id);
+}
static inline u_int8_t instance_hashfn(u_int16_t queue_num)
{
- return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
+ return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
}
static struct nfqnl_instance *
-instance_lookup(u_int16_t queue_num)
+instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
struct hlist_head *head;
struct nfqnl_instance *inst;
- head = &instance_table[instance_hashfn(queue_num)];
+ head = &q->instance_table[instance_hashfn(queue_num)];
hlist_for_each_entry_rcu(inst, head, hlist) {
if (inst->queue_num == queue_num)
return inst;
@@ -91,14 +100,15 @@ instance_lookup(u_int16_t queue_num)
}
static struct nfqnl_instance *
-instance_create(u_int16_t queue_num, int portid)
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num,
+ int portid)
{
struct nfqnl_instance *inst;
unsigned int h;
int err;
- spin_lock(&instances_lock);
- if (instance_lookup(queue_num)) {
+ spin_lock(&q->instances_lock);
+ if (instance_lookup(q, queue_num)) {
err = -EEXIST;
goto out_unlock;
}
@@ -112,7 +122,7 @@ instance_create(u_int16_t queue_num, int portid)
inst->queue_num = queue_num;
inst->peer_portid = portid;
inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
- inst->copy_range = 0xfffff;
+ inst->copy_range = 0xffff;
inst->copy_mode = NFQNL_COPY_NONE;
spin_lock_init(&inst->lock);
INIT_LIST_HEAD(&inst->queue_list);
@@ -123,16 +133,16 @@ instance_create(u_int16_t queue_num, int portid)
}
h = instance_hashfn(queue_num);
- hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
+ hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
return inst;
out_free:
kfree(inst);
out_unlock:
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
return ERR_PTR(err);
}
@@ -158,11 +168,11 @@ __instance_destroy(struct nfqnl_instance *inst)
}
static void
-instance_destroy(struct nfqnl_instance *inst)
+instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
{
- spin_lock(&instances_lock);
+ spin_lock(&q->instances_lock);
__instance_destroy(inst);
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
}
static inline void
@@ -217,14 +227,59 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_unlock_bh(&queue->lock);
}
+static void
+nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ struct page *page;
+ unsigned int offset;
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to)) {
+ skb_copy_bits(from, 0, skb_put(to, len), len);
+ return;
+ }
+
+ if (hlen) {
+ skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
__be32 **packet_id_ptr)
{
- sk_buff_data_t old_tail;
size_t size;
size_t data_len = 0, cap_len = 0;
+ int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
@@ -236,7 +291,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
struct nf_conn *ct = NULL;
enum ip_conntrack_info uninitialized_var(ctinfo);
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -246,8 +301,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
- + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)
- + nla_total_size(sizeof(u_int32_t))); /* cap_len */
+ + nla_total_size(sizeof(u_int32_t)); /* cap_len */
+
+ if (entskb->tstamp.tv64)
+ size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
outdev = entry->outdev;
@@ -265,7 +322,16 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
if (data_len == 0 || data_len > entskb->len)
data_len = entskb->len;
- size += nla_total_size(data_len);
+
+ if (!entskb->head_frag ||
+ skb_headlen(entskb) < L1_CACHE_BYTES ||
+ skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(entskb);
+
+ if (skb_has_frag_list(entskb))
+ hlen = entskb->len;
+ hlen = min_t(int, data_len, hlen);
+ size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
break;
}
@@ -277,7 +343,6 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
if (!skb)
return NULL;
- old_tail = skb->tail;
nlh = nlmsg_put(skb, 0, 0,
NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
sizeof(struct nfgenmsg), 0);
@@ -382,31 +447,26 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
goto nla_put_failure;
}
+ if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ goto nla_put_failure;
+
+ if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
+ goto nla_put_failure;
+
if (data_len) {
struct nlattr *nla;
- int sz = nla_attr_size(data_len);
- if (skb_tailroom(skb) < nla_total_size(data_len)) {
- printk(KERN_WARNING "nf_queue: no tailroom!\n");
- kfree_skb(skb);
- return NULL;
- }
+ if (skb_tailroom(skb) < sizeof(*nla) + hlen)
+ goto nla_put_failure;
- nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len));
+ nla = (struct nlattr *)skb_put(skb, sizeof(*nla));
nla->nla_type = NFQA_PAYLOAD;
- nla->nla_len = sz;
+ nla->nla_len = nla_attr_size(data_len);
- if (skb_copy_bits(entskb, 0, nla_data(nla), data_len))
- BUG();
+ nfqnl_zcopy(skb, entskb, data_len, hlen);
}
- if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
- goto nla_put_failure;
-
- if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
- goto nla_put_failure;
-
- nlh->nlmsg_len = skb->tail - old_tail;
+ nlh->nlmsg_len = skb->len;
return skb;
nla_put_failure:
@@ -423,9 +483,12 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
int err = -ENOBUFS;
__be32 *packet_id_ptr;
int failopen = 0;
+ struct net *net = dev_net(entry->indev ?
+ entry->indev : entry->outdev);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
/* rcu_read_lock()ed by nf_hook_slow() */
- queue = instance_lookup(queuenum);
+ queue = instance_lookup(q, queuenum);
if (!queue) {
err = -ESRCH;
goto err_out;
@@ -462,7 +525,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
*packet_id_ptr = htonl(entry->id);
/* nfnetlink_unicast will either free the nskb or add it to a socket */
- err = nfnetlink_unicast(nskb, &init_net, queue->peer_portid, MSG_DONTWAIT);
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
if (err < 0) {
queue->queue_user_dropped++;
goto err_out_unlock;
@@ -575,15 +638,16 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
/* drop all packets with either indev or outdev == ifindex from all queue
* instances */
static void
-nfqnl_dev_drop(int ifindex)
+nfqnl_dev_drop(struct net *net, int ifindex)
{
int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
rcu_read_lock();
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &q->instance_table[i];
hlist_for_each_entry_rcu(inst, head, hlist)
nfqnl_flush(inst, dev_cmp, ifindex);
@@ -600,12 +664,9 @@ nfqnl_rcv_dev_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
/* Drop any packets associated with the downed device */
if (event == NETDEV_DOWN)
- nfqnl_dev_drop(dev->ifindex);
+ nfqnl_dev_drop(dev_net(dev), dev->ifindex);
return NOTIFY_DONE;
}
@@ -618,24 +679,24 @@ nfqnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct netlink_notify *n = ptr;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
int i;
/* destroy all instances for this portid */
- spin_lock(&instances_lock);
+ spin_lock(&q->instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct hlist_node *t2;
struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &q->instance_table[i];
hlist_for_each_entry_safe(inst, t2, head, hlist) {
- if ((n->net == &init_net) &&
- (n->portid == inst->peer_portid))
+ if (n->portid == inst->peer_portid)
__instance_destroy(inst);
}
}
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
}
return NOTIFY_DONE;
}
@@ -656,11 +717,12 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_MARK] = { .type = NLA_U32 },
};
-static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlportid)
+static struct nfqnl_instance *
+verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid)
{
struct nfqnl_instance *queue;
- queue = instance_lookup(queue_num);
+ queue = instance_lookup(q, queue_num);
if (!queue)
return ERR_PTR(-ENODEV);
@@ -704,7 +766,11 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
LIST_HEAD(batch_list);
u16 queue_num = ntohs(nfmsg->res_id);
- queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid);
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue))
return PTR_ERR(queue);
@@ -752,10 +818,13 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
enum ip_conntrack_info uninitialized_var(ctinfo);
struct nf_conn *ct = NULL;
- queue = instance_lookup(queue_num);
- if (!queue)
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
- queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid);
+ queue = instance_lookup(q, queue_num);
+ if (!queue)
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue))
return PTR_ERR(queue);
@@ -819,6 +888,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
u_int16_t queue_num = ntohs(nfmsg->res_id);
struct nfqnl_instance *queue;
struct nfqnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int ret = 0;
if (nfqa[NFQA_CFG_CMD]) {
@@ -832,7 +903,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
}
rcu_read_lock();
- queue = instance_lookup(queue_num);
+ queue = instance_lookup(q, queue_num);
if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
ret = -EPERM;
goto err_out_unlock;
@@ -845,7 +916,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -EBUSY;
goto err_out_unlock;
}
- queue = instance_create(queue_num, NETLINK_CB(skb).portid);
+ queue = instance_create(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue)) {
ret = PTR_ERR(queue);
goto err_out_unlock;
@@ -856,7 +928,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -ENODEV;
goto err_out_unlock;
}
- instance_destroy(queue);
+ instance_destroy(q, queue);
break;
case NFQNL_CFG_CMD_PF_BIND:
case NFQNL_CFG_CMD_PF_UNBIND:
@@ -950,19 +1022,24 @@ static const struct nfnetlink_subsystem nfqnl_subsys = {
#ifdef CONFIG_PROC_FS
struct iter_state {
+ struct seq_net_private p;
unsigned int bucket;
};
static struct hlist_node *get_first(struct seq_file *seq)
{
struct iter_state *st = seq->private;
+ struct net *net;
+ struct nfnl_queue_net *q;
if (!st)
return NULL;
+ net = seq_file_net(seq);
+ q = nfnl_queue_pernet(net);
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return instance_table[st->bucket].first;
+ if (!hlist_empty(&q->instance_table[st->bucket]))
+ return q->instance_table[st->bucket].first;
}
return NULL;
}
@@ -970,13 +1047,17 @@ static struct hlist_node *get_first(struct seq_file *seq)
static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
{
struct iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
h = h->next;
while (!h) {
+ struct nfnl_queue_net *q;
+
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = instance_table[st->bucket].first;
+ q = nfnl_queue_pernet(net);
+ h = q->instance_table[st->bucket].first;
}
return h;
}
@@ -992,11 +1073,11 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
return pos ? NULL : head;
}
-static void *seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(instances_lock)
+static void *seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
- spin_lock(&instances_lock);
- return get_idx(seq, *pos);
+ spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+ return get_idx(s, *pos);
}
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
@@ -1006,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void seq_stop(struct seq_file *s, void *v)
- __releases(instances_lock)
+ __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
- spin_unlock(&instances_lock);
+ spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
}
static int seq_show(struct seq_file *s, void *v)
@@ -1032,7 +1113,7 @@ static const struct seq_operations nfqnl_seq_ops = {
static int nfqnl_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &nfqnl_seq_ops,
+ return seq_open_net(inode, file, &nfqnl_seq_ops,
sizeof(struct iter_state));
}
@@ -1041,39 +1122,63 @@ static const struct file_operations nfqnl_file_ops = {
.open = nfqnl_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
-static int __init nfnetlink_queue_init(void)
+static int __net_init nfnl_queue_net_init(struct net *net)
{
- int i, status = -ENOMEM;
+ unsigned int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
+ INIT_HLIST_HEAD(&q->instance_table[i]);
+
+ spin_lock_init(&q->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_queue", 0440,
+ net->nf.proc_netfilter, &nfqnl_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_queue_net_exit(struct net *net)
+{
+ remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
+}
+
+static struct pernet_operations nfnl_queue_net_ops = {
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
+};
+
+static int __init nfnetlink_queue_init(void)
+{
+ int status = -ENOMEM;
netlink_register_notifier(&nfqnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfqnl_subsys);
if (status < 0) {
- printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+ pr_err("nf_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_queue", 0440,
- proc_net_netfilter, &nfqnl_file_ops))
+ status = register_pernet_subsys(&nfnl_queue_net_ops);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register pernet ops\n");
goto cleanup_subsys;
-#endif
-
+ }
register_netdevice_notifier(&nfqnl_dev_notifier);
nf_register_queue_handler(&nfqh);
return status;
-#ifdef CONFIG_PROC_FS
cleanup_subsys:
nfnetlink_subsys_unregister(&nfqnl_subsys);
-#endif
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
return status;
@@ -1083,9 +1188,7 @@ static void __exit nfnetlink_queue_fini(void)
{
nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
-#endif
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index ba92824086f3..3228d7f24eb4 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -124,6 +124,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_audit_info *info = par->targinfo;
struct audit_buffer *ab;
+ if (audit_enabled == 0)
+ goto errout;
+
ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
if (ab == NULL)
goto errout;
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index fa40096940a1..fe573f6c9e91 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -474,7 +474,14 @@ ipt_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- struct sbuff *m = sb_open();
+ struct sbuff *m;
+ struct net *net = dev_net(in ? in : out);
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = sb_open();
if (!loginfo)
loginfo = &default_loginfo;
@@ -798,7 +805,14 @@ ip6t_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- struct sbuff *m = sb_open();
+ struct sbuff *m;
+ struct net *net = dev_net(in ? in : out);
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = sb_open();
if (!loginfo)
loginfo = &default_loginfo;
@@ -893,23 +907,55 @@ static struct nf_logger ip6t_log_logger __read_mostly = {
};
#endif
+static int __net_init log_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger);
+#endif
+ return 0;
+}
+
+static void __net_exit log_net_exit(struct net *net)
+{
+ nf_log_unset(net, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_unset(net, &ip6t_log_logger);
+#endif
+}
+
+static struct pernet_operations log_net_ops = {
+ .init = log_net_init,
+ .exit = log_net_exit,
+};
+
static int __init log_tg_init(void)
{
int ret;
+ ret = register_pernet_subsys(&log_net_ops);
+ if (ret < 0)
+ goto err_pernet;
+
ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
if (ret < 0)
- return ret;
+ goto err_target;
nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
nf_log_register(NFPROTO_IPV6, &ip6t_log_logger);
#endif
return 0;
+
+err_target:
+ unregister_pernet_subsys(&log_net_ops);
+err_pernet:
+ return ret;
}
static void __exit log_tg_exit(void)
{
+ unregister_pernet_subsys(&log_net_ops);
nf_log_unregister(&ipt_log_logger);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
nf_log_unregister(&ip6t_log_logger);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 817f9e9f2b16..1e2fae32f81b 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -76,22 +76,31 @@ static u32 hash_v6(const struct sk_buff *skb)
}
#endif
-static unsigned int
-nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+static u32
+nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_NFQ_info_v1 *info = par->targinfo;
u32 queue = info->queuenum;
- if (info->queues_total > 1) {
- if (par->family == NFPROTO_IPV4)
- queue = (((u64) hash_v4(skb) * info->queues_total) >>
- 32) + queue;
+ if (par->family == NFPROTO_IPV4)
+ queue += ((u64) hash_v4(skb) * info->queues_total) >> 32;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
- else if (par->family == NFPROTO_IPV6)
- queue = (((u64) hash_v6(skb) * info->queues_total) >>
- 32) + queue;
+ else if (par->family == NFPROTO_IPV6)
+ queue += ((u64) hash_v6(skb) * info->queues_total) >> 32;
#endif
- }
+
+ return queue;
+}
+
+static unsigned int
+nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v1 *info = par->targinfo;
+ u32 queue = info->queuenum;
+
+ if (info->queues_total > 1)
+ queue = nfqueue_hash(skb, par);
+
return NF_QUEUE_NR(queue);
}
@@ -108,7 +117,7 @@ nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
static int nfqueue_tg_check(const struct xt_tgchk_param *par)
{
- const struct xt_NFQ_info_v2 *info = par->targinfo;
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
u32 maxid;
if (unlikely(!rnd_inited)) {
@@ -125,11 +134,32 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par)
info->queues_total, maxid);
return -ERANGE;
}
- if (par->target->revision == 2 && info->bypass > 1)
+ if (par->target->revision == 2 && info->flags > 1)
return -EINVAL;
+ if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK)
+ return -EINVAL;
+
return 0;
}
+static unsigned int
+nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
+ u32 queue = info->queuenum;
+
+ if (info->queues_total > 1) {
+ if (info->flags & NFQ_FLAG_CPU_FANOUT) {
+ int cpu = smp_processor_id();
+
+ queue = info->queuenum + cpu % info->queues_total;
+ } else
+ queue = nfqueue_hash(skb, par);
+ }
+
+ return NF_QUEUE_NR(queue);
+}
+
static struct xt_target nfqueue_tg_reg[] __read_mostly = {
{
.name = "NFQUEUE",
@@ -156,6 +186,15 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_NFQ_info_v2),
.me = THIS_MODULE,
},
+ {
+ .name = "NFQUEUE",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v3,
+ .targetsize = sizeof(struct xt_NFQ_info_v3),
+ .me = THIS_MODULE,
+ },
};
static int __init nfqueue_tg_init(void)
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a5e673d32bda..647d989a01e6 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,6 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
+ struct net *net = dev_net(p->in ? p->in : p->out);
if (!info)
return false;
@@ -325,7 +326,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
fcount++;
if (info->flags & XT_OSF_LOG)
- nf_log_packet(p->family, p->hooknum, skb,
+ nf_log_packet(net, p->family, p->hooknum, skb,
p->in, p->out, NULL,
"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
f->genre, f->version, f->subtype,
@@ -341,7 +342,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
rcu_read_unlock();
if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL,
+ nf_log_packet(net, p->family, p->hooknum, skb, p->in,
+ p->out, NULL,
"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
&ip->saddr, ntohs(tcp->source),
&ip->daddr, ntohs(tcp->dest));
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 847d495cd4de..8a6c6ea466d8 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1189,8 +1189,6 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
struct netlbl_unlhsh_walk_arg cb_arg;
u32 skip_bkt = cb->args[0];
u32 skip_chain = cb->args[1];
- u32 skip_addr4 = cb->args[2];
- u32 skip_addr6 = cb->args[3];
u32 iter_bkt;
u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
struct netlbl_unlhsh_iface *iface;
@@ -1215,7 +1213,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
continue;
netlbl_af4list_foreach_rcu(addr4,
&iface->addr4_list) {
- if (iter_addr4++ < skip_addr4)
+ if (iter_addr4++ < cb->args[2])
continue;
if (netlbl_unlabel_staticlist_gen(
NLBL_UNLABEL_C_STATICLIST,
@@ -1231,7 +1229,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(addr6,
&iface->addr6_list) {
- if (iter_addr6++ < skip_addr6)
+ if (iter_addr6++ < cb->args[3])
continue;
if (netlbl_unlabel_staticlist_gen(
NLBL_UNLABEL_C_STATICLIST,
@@ -1250,10 +1248,10 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb,
unlabel_staticlist_return:
rcu_read_unlock();
- cb->args[0] = skip_bkt;
- cb->args[1] = skip_chain;
- cb->args[2] = skip_addr4;
- cb->args[3] = skip_addr6;
+ cb->args[0] = iter_bkt;
+ cb->args[1] = iter_chain;
+ cb->args[2] = iter_addr4;
+ cb->args[3] = iter_addr6;
return skb->len;
}
@@ -1273,12 +1271,9 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
{
struct netlbl_unlhsh_walk_arg cb_arg;
struct netlbl_unlhsh_iface *iface;
- u32 skip_addr4 = cb->args[0];
- u32 skip_addr6 = cb->args[1];
- u32 iter_addr4 = 0;
+ u32 iter_addr4 = 0, iter_addr6 = 0;
struct netlbl_af4list *addr4;
#if IS_ENABLED(CONFIG_IPV6)
- u32 iter_addr6 = 0;
struct netlbl_af6list *addr6;
#endif
@@ -1292,7 +1287,7 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
goto unlabel_staticlistdef_return;
netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
- if (iter_addr4++ < skip_addr4)
+ if (iter_addr4++ < cb->args[0])
continue;
if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
iface,
@@ -1305,7 +1300,7 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
}
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
- if (iter_addr6++ < skip_addr6)
+ if (iter_addr6++ < cb->args[1])
continue;
if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
iface,
@@ -1320,8 +1315,8 @@ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
unlabel_staticlistdef_return:
rcu_read_unlock();
- cb->args[0] = skip_addr4;
- cb->args[1] = skip_addr6;
+ cb->args[0] = iter_addr4;
+ cb->args[1] = iter_addr6;
return skb->len;
}
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
new file mode 100644
index 000000000000..5d6e8c05b3d4
--- /dev/null
+++ b/net/netlink/Kconfig
@@ -0,0 +1,10 @@
+#
+# Netlink Sockets
+#
+
+config NETLINK_DIAG
+ tristate "NETLINK: socket monitoring interface"
+ default n
+ ---help---
+ Support for NETLINK socket monitoring interface used by the ss tool.
+ If unsure, say Y.
diff --git a/net/netlink/Makefile b/net/netlink/Makefile
index bdd6ddf4e95b..e837917f6c03 100644
--- a/net/netlink/Makefile
+++ b/net/netlink/Makefile
@@ -3,3 +3,6 @@
#
obj-y := af_netlink.o genetlink.o
+
+obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o
+netlink_diag-y := diag.o
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1e3fd5bfcd86..ce2e0064e7f6 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -61,28 +61,7 @@
#include <net/scm.h>
#include <net/netlink.h>
-#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
-#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
-
-struct netlink_sock {
- /* struct sock has to be the first member of netlink_sock */
- struct sock sk;
- u32 portid;
- u32 dst_portid;
- u32 dst_group;
- u32 flags;
- u32 subscriptions;
- u32 ngroups;
- unsigned long *groups;
- unsigned long state;
- wait_queue_head_t wait;
- struct netlink_callback *cb;
- struct mutex *cb_mutex;
- struct mutex cb_def_mutex;
- void (*netlink_rcv)(struct sk_buff *skb);
- void (*netlink_bind)(int group);
- struct module *module;
-};
+#include "af_netlink.h"
struct listeners {
struct rcu_head rcu;
@@ -94,48 +73,20 @@ struct listeners {
#define NETLINK_BROADCAST_SEND_ERROR 0x4
#define NETLINK_RECV_NO_ENOBUFS 0x8
-static inline struct netlink_sock *nlk_sk(struct sock *sk)
-{
- return container_of(sk, struct netlink_sock, sk);
-}
-
static inline int netlink_is_kernel(struct sock *sk)
{
return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
}
-struct nl_portid_hash {
- struct hlist_head *table;
- unsigned long rehash_time;
-
- unsigned int mask;
- unsigned int shift;
-
- unsigned int entries;
- unsigned int max_shift;
-
- u32 rnd;
-};
-
-struct netlink_table {
- struct nl_portid_hash hash;
- struct hlist_head mc_list;
- struct listeners __rcu *listeners;
- unsigned int flags;
- unsigned int groups;
- struct mutex *cb_mutex;
- struct module *module;
- void (*bind)(int group);
- int registered;
-};
-
-static struct netlink_table *nl_table;
+struct netlink_table *nl_table;
+EXPORT_SYMBOL_GPL(nl_table);
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
static int netlink_dump(struct sock *sk);
-static DEFINE_RWLOCK(nl_table_lock);
+DEFINE_RWLOCK(nl_table_lock);
+EXPORT_SYMBOL_GPL(nl_table_lock);
static atomic_t nl_table_users = ATOMIC_INIT(0);
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
@@ -1695,7 +1646,7 @@ struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
struct nlmsghdr *nlh;
- int size = NLMSG_LENGTH(len);
+ int size = nlmsg_msg_size(len);
nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size));
nlh->nlmsg_type = type;
@@ -1704,7 +1655,7 @@ __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int fla
nlh->nlmsg_pid = portid;
nlh->nlmsg_seq = seq;
if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
- memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size);
+ memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
new file mode 100644
index 000000000000..d9acb2a1d855
--- /dev/null
+++ b/net/netlink/af_netlink.h
@@ -0,0 +1,62 @@
+#ifndef _AF_NETLINK_H
+#define _AF_NETLINK_H
+
+#include <net/sock.h>
+
+#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
+#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
+
+struct netlink_sock {
+ /* struct sock has to be the first member of netlink_sock */
+ struct sock sk;
+ u32 portid;
+ u32 dst_portid;
+ u32 dst_group;
+ u32 flags;
+ u32 subscriptions;
+ u32 ngroups;
+ unsigned long *groups;
+ unsigned long state;
+ wait_queue_head_t wait;
+ struct netlink_callback *cb;
+ struct mutex *cb_mutex;
+ struct mutex cb_def_mutex;
+ void (*netlink_rcv)(struct sk_buff *skb);
+ void (*netlink_bind)(int group);
+ struct module *module;
+};
+
+static inline struct netlink_sock *nlk_sk(struct sock *sk)
+{
+ return container_of(sk, struct netlink_sock, sk);
+}
+
+struct nl_portid_hash {
+ struct hlist_head *table;
+ unsigned long rehash_time;
+
+ unsigned int mask;
+ unsigned int shift;
+
+ unsigned int entries;
+ unsigned int max_shift;
+
+ u32 rnd;
+};
+
+struct netlink_table {
+ struct nl_portid_hash hash;
+ struct hlist_head mc_list;
+ struct listeners __rcu *listeners;
+ unsigned int flags;
+ unsigned int groups;
+ struct mutex *cb_mutex;
+ struct module *module;
+ void (*bind)(int group);
+ int registered;
+};
+
+extern struct netlink_table *nl_table;
+extern rwlock_t nl_table_lock;
+
+#endif
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
new file mode 100644
index 000000000000..5ffb1d1cf402
--- /dev/null
+++ b/net/netlink/diag.c
@@ -0,0 +1,188 @@
+#include <linux/module.h>
+
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/sock_diag.h>
+#include <linux/netlink_diag.h>
+
+#include "af_netlink.h"
+
+static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (nlk->groups == NULL)
+ return 0;
+
+ return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups),
+ nlk->groups);
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_diag_req *req,
+ u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+ struct nlmsghdr *nlh;
+ struct netlink_diag_msg *rep;
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rep = nlmsg_data(nlh);
+ rep->ndiag_family = AF_NETLINK;
+ rep->ndiag_type = sk->sk_type;
+ rep->ndiag_protocol = sk->sk_protocol;
+ rep->ndiag_state = sk->sk_state;
+
+ rep->ndiag_ino = sk_ino;
+ rep->ndiag_portid = nlk->portid;
+ rep->ndiag_dst_portid = nlk->dst_portid;
+ rep->ndiag_dst_group = nlk->dst_group;
+ sock_diag_save_cookie(sk, rep->ndiag_cookie);
+
+ if ((req->ndiag_show & NDIAG_SHOW_GROUPS) &&
+ sk_diag_dump_groups(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) &&
+ sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
+ goto out_nlmsg_trim;
+
+ return nlmsg_end(skb, nlh);
+
+out_nlmsg_trim:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ int protocol, int s_num)
+{
+ struct netlink_table *tbl = &nl_table[protocol];
+ struct nl_portid_hash *hash = &tbl->hash;
+ struct net *net = sock_net(skb->sk);
+ struct netlink_diag_req *req;
+ struct sock *sk;
+ int ret = 0, num = 0, i;
+
+ req = nlmsg_data(cb->nlh);
+
+ for (i = 0; i <= hash->mask; i++) {
+ sk_for_each(sk, &hash->table[i]) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (sk_diag_fill(sk, skb, req,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ sock_i_ino(sk)) < 0) {
+ ret = 1;
+ goto done;
+ }
+
+ num++;
+ }
+ }
+
+ sk_for_each_bound(sk, &tbl->mc_list) {
+ if (sk_hashed(sk))
+ continue;
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (sk_diag_fill(sk, skb, req,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ sock_i_ino(sk)) < 0) {
+ ret = 1;
+ goto done;
+ }
+ num++;
+ }
+done:
+ cb->args[0] = num;
+ cb->args[1] = protocol;
+
+ return ret;
+}
+
+static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netlink_diag_req *req;
+ int s_num = cb->args[0];
+
+ req = nlmsg_data(cb->nlh);
+
+ read_lock(&nl_table_lock);
+
+ if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
+ int i;
+
+ for (i = cb->args[1]; i < MAX_LINKS; i++) {
+ if (__netlink_diag_dump(skb, cb, i, s_num))
+ break;
+ s_num = 0;
+ }
+ } else {
+ if (req->sdiag_protocol >= MAX_LINKS) {
+ read_unlock(&nl_table_lock);
+ return -ENOENT;
+ }
+
+ __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
+ }
+
+ read_unlock(&nl_table_lock);
+
+ return skb->len;
+}
+
+static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct netlink_diag_req);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = netlink_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ } else
+ return -EOPNOTSUPP;
+}
+
+static const struct sock_diag_handler netlink_diag_handler = {
+ .family = AF_NETLINK,
+ .dump = netlink_diag_handler_dump,
+};
+
+static int __init netlink_diag_init(void)
+{
+ return sock_diag_register(&netlink_diag_handler);
+}
+
+static void __exit netlink_diag_exit(void)
+{
+ sock_diag_unregister(&netlink_diag_handler);
+}
+
+module_init(netlink_diag_init);
+module_exit(netlink_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f2aabb6f4105..5a55be3f17a5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -142,6 +142,7 @@ int genl_register_mc_group(struct genl_family *family,
int err = 0;
BUG_ON(grp->name[0] == '\0');
+ BUG_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL);
genl_lock();
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index d1fa1d9ffd2e..7fcb307dea47 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1173,6 +1173,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
}
if (sax != NULL) {
+ memset(sax, 0, sizeof(sax));
sax->sax25_family = AF_NETROM;
skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call,
AX25_ADDR_LEN);
diff --git a/net/nfc/llcp/sock.c b/net/nfc/llcp/sock.c
index 6fa76704cb13..c1101e6de170 100644
--- a/net/nfc/llcp/sock.c
+++ b/net/nfc/llcp/sock.c
@@ -523,7 +523,8 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock,
return llcp_accept_poll(sk);
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
@@ -764,6 +765,8 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
pr_debug("%p %zu\n", sk, len);
+ msg->msg_namelen = 0;
+
lock_sock(sk);
if (sk->sk_state == LLCP_CLOSED &&
@@ -809,6 +812,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap);
+ memset(sockaddr, 0, sizeof(*sockaddr));
sockaddr->sa_family = AF_NFC;
sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP;
sockaddr->dsap = ui_cb->dsap;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ac2defeeba83..d4d5363c7ba7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -58,7 +58,7 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_sub(skb->csum, csum_partial(skb->data
- + ETH_HLEN, VLAN_HLEN, 0));
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
*current_tci = vhdr->h_vlan_TCI;
@@ -115,7 +115,7 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_add(skb->csum, csum_partial(skb->data
- + ETH_HLEN, VLAN_HLEN, 0));
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
}
__vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e87a26506dba..8759265a3e46 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -369,8 +369,8 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
len = sizeof(struct ovs_header);
len += nla_total_size(skb->len);
len += nla_total_size(FLOW_BUFSIZE);
- if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
- len += nla_total_size(8);
+ if (upcall_info->userdata)
+ len += NLA_ALIGN(upcall_info->userdata->nla_len);
user_skb = genlmsg_new(len, GFP_ATOMIC);
if (!user_skb) {
@@ -387,13 +387,15 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
- nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
- nla_get_u64(upcall_info->userdata));
+ __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
+ nla_len(upcall_info->userdata),
+ nla_data(upcall_info->userdata));
nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
skb_copy_and_csum_dev(skb, nla_data(nla));
+ genlmsg_end(user_skb, upcall);
err = genlmsg_unicast(net, user_skb, upcall_info->portid);
out:
@@ -543,7 +545,7 @@ static int validate_userspace(const struct nlattr *attr)
{
static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
- [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
+ [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
};
struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
int error;
@@ -679,7 +681,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have. */
- if (ntohs(eth->h_proto) >= 1536)
+ if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
@@ -1627,7 +1629,7 @@ static struct vport *lookup_vport(struct net *net,
vport = ovs_vport_rtnl_rcu(dp, port_no);
if (!vport)
- return ERR_PTR(-ENOENT);
+ return ERR_PTR(-ENODEV);
return vport;
} else
return ERR_PTR(-EINVAL);
@@ -1690,6 +1692,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(vport))
goto exit_unlock;
+ err = 0;
reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq,
OVS_VPORT_CMD_NEW);
if (IS_ERR(reply)) {
@@ -1771,6 +1774,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(reply))
goto exit_unlock;
+ err = 0;
ovs_dp_detach_port(vport);
genl_notify(reply, genl_info_net(info), info->snd_portid,
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 031dfbf37c93..9125ad5c5aeb 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -119,7 +119,7 @@ struct ovs_skb_cb {
* struct dp_upcall - metadata to include with a packet to send to userspace
* @cmd: One of %OVS_PACKET_CMD_*.
* @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull.
- * @userdata: If nonnull, its u64 value is extracted and passed to userspace as
+ * @userdata: If nonnull, its variable-length value is passed to userspace as
* %OVS_PACKET_ATTR_USERDATA.
* @pid: Netlink PID to which packet should be sent. If @pid is 0 then no
* packet is sent and the packet is accounted in the datapath's @n_lost
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 20605ecf100b..332486839347 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -466,7 +466,7 @@ static __be16 parse_ethertype(struct sk_buff *skb)
proto = *(__be16 *) skb->data;
__skb_pull(skb, sizeof(__be16));
- if (ntohs(proto) >= 1536)
+ if (ntohs(proto) >= ETH_P_802_3_MIN)
return proto;
if (skb->len < sizeof(struct llc_snap_hdr))
@@ -482,7 +482,11 @@ static __be16 parse_ethertype(struct sk_buff *skb)
return htons(ETH_P_802_2);
__skb_pull(skb, sizeof(struct llc_snap_hdr));
- return llc->ethertype;
+
+ if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
+ return llc->ethertype;
+
+ return htons(ETH_P_802_2);
}
static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
@@ -1034,7 +1038,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
- if (ntohs(swkey->eth.type) < 1536)
+ if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN)
return -EINVAL;
attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
} else {
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 0531de6c7a4a..40f8a2489c90 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -63,16 +63,6 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde
return stats;
}
-static int internal_dev_mac_addr(struct net_device *dev, void *p)
-{
- struct sockaddr *addr = p;
-
- if (!is_valid_ether_addr(addr->sa_data))
- return -EADDRNOTAVAIL;
- memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
- return 0;
-}
-
/* Called with rcu_read_lock_bh. */
static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
@@ -126,7 +116,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
- .ndo_set_mac_address = internal_dev_mac_addr,
+ .ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_dev_get_stats,
};
@@ -138,6 +128,7 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
netdev->destructor = internal_dev_destructor;
SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops);
netdev->tx_queue_len = 0;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 670cbc3518de..2130d61c384a 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -43,8 +43,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
/* Make our own copy of the packet. Otherwise we will mangle the
* packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
- * (No one comes after us, since we tell handle_bridge() that we took
- * the packet.) */
+ */
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
return;
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index ba717cc038b3..f6b8132ce4cb 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -325,8 +325,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
* @skb: skb that was received
*
* Must be called with rcu_read_lock. The packet cannot be shared and
- * skb->data should point to the Ethernet header. The caller must have already
- * called compute_ip_summed() to initialize the checksumming fields.
+ * skb->data should point to the Ethernet header.
*/
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
{
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 3f7961ea3c56..aee7d43114c9 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -68,10 +68,10 @@ struct vport_err_stats {
/**
* struct vport - one port within a datapath
* @rcu: RCU callback head for deferred destruction.
- * @port_no: Index into @dp's @ports array.
* @dp: Datapath to which this port belongs.
* @upcall_portid: The Netlink port to use for packets received on this port that
* miss the flow table.
+ * @port_no: Index into @dp's @ports array.
* @hash_node: Element in @dev_table hash table in vport.c.
* @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure.
@@ -81,9 +81,9 @@ struct vport_err_stats {
*/
struct vport {
struct rcu_head rcu;
- u16 port_no;
struct datapath *dp;
u32 upcall_portid;
+ u16 port_no;
struct hlist_node hash_node;
struct hlist_node dp_hash_node;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1d6793dbfbae..8e4644ff8d34 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -181,6 +181,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
static void *packet_previous_frame(struct packet_sock *po,
struct packet_ring_buffer *rb,
@@ -973,11 +975,11 @@ static void *packet_current_rx_frame(struct packet_sock *po,
static void *prb_lookup_block(struct packet_sock *po,
struct packet_ring_buffer *rb,
- unsigned int previous,
+ unsigned int idx,
int status)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
- struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
+ struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
if (status != BLOCK_STATUS(pbd))
return NULL;
@@ -1041,6 +1043,29 @@ static void packet_increment_head(struct packet_ring_buffer *buff)
buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
+static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+ struct sock *sk = &po->sk;
+ bool has_room;
+
+ if (po->prot_hook.func != tpacket_rcv)
+ return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
+ <= sk->sk_rcvbuf;
+
+ spin_lock(&sk->sk_receive_queue.lock);
+ if (po->tp_version == TPACKET_V3)
+ has_room = prb_lookup_block(po, &po->rx_ring,
+ po->rx_ring.prb_bdqc.kactive_blk_num,
+ TP_STATUS_KERNEL);
+ else
+ has_room = packet_lookup_frame(po, &po->rx_ring,
+ po->rx_ring.head,
+ TP_STATUS_KERNEL);
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ return has_room;
+}
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1066,16 +1091,16 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
return x;
}
-static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_hash(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
{
- u32 idx, hash = skb->rxhash;
-
- idx = ((u64)hash * num) >> 32;
-
- return f->arr[idx];
+ return (((u64)skb->rxhash) * num) >> 32;
}
-static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_lb(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
{
int cur, old;
@@ -1083,14 +1108,40 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb
while ((old = atomic_cmpxchg(&f->rr_cur, cur,
fanout_rr_next(f, num))) != cur)
cur = old;
- return f->arr[cur];
+ return cur;
+}
+
+static unsigned int fanout_demux_cpu(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return smp_processor_id() % num;
}
-static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_rollover(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int idx, unsigned int skip,
+ unsigned int num)
{
- unsigned int cpu = smp_processor_id();
+ unsigned int i, j;
+
+ i = j = min_t(int, f->next[idx], num - 1);
+ do {
+ if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+ if (i != j)
+ f->next[idx] = i;
+ return i;
+ }
+ if (++i == num)
+ i = 0;
+ } while (i != j);
- return f->arr[cpu % num];
+ return idx;
+}
+
+static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
+{
+ return f->flags & (flag >> 8);
}
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
@@ -1099,7 +1150,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
struct packet_fanout *f = pt->af_packet_priv;
unsigned int num = f->num_members;
struct packet_sock *po;
- struct sock *sk;
+ unsigned int idx;
if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
!num) {
@@ -1110,23 +1161,31 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
switch (f->type) {
case PACKET_FANOUT_HASH:
default:
- if (f->defrag) {
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
if (!skb)
return 0;
}
skb_get_rxhash(skb);
- sk = fanout_demux_hash(f, skb, num);
+ idx = fanout_demux_hash(f, skb, num);
break;
case PACKET_FANOUT_LB:
- sk = fanout_demux_lb(f, skb, num);
+ idx = fanout_demux_lb(f, skb, num);
break;
case PACKET_FANOUT_CPU:
- sk = fanout_demux_cpu(f, skb, num);
+ idx = fanout_demux_cpu(f, skb, num);
+ break;
+ case PACKET_FANOUT_ROLLOVER:
+ idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
break;
}
- po = pkt_sk(sk);
+ po = pkt_sk(f->arr[idx]);
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
+ unlikely(!packet_rcv_has_room(po, skb))) {
+ idx = fanout_demux_rollover(f, skb, idx, idx, num);
+ po = pkt_sk(f->arr[idx]);
+ }
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
@@ -1175,10 +1234,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f, *match;
u8 type = type_flags & 0xff;
- u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
+ u8 flags = type_flags >> 8;
int err;
switch (type) {
+ case PACKET_FANOUT_ROLLOVER:
+ if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
+ return -EINVAL;
case PACKET_FANOUT_HASH:
case PACKET_FANOUT_LB:
case PACKET_FANOUT_CPU:
@@ -1203,7 +1265,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
}
err = -EINVAL;
- if (match && match->defrag != defrag)
+ if (match && match->flags != flags)
goto out;
if (!match) {
err = -ENOMEM;
@@ -1213,7 +1275,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
write_pnet(&match->net, sock_net(sk));
match->id = id;
match->type = type;
- match->defrag = defrag;
+ match->flags = flags;
atomic_set(&match->rr_cur, 0);
INIT_LIST_HEAD(&match->list);
spin_lock_init(&match->lock);
@@ -1450,6 +1512,8 @@ retry:
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
+ skb_probe_transport_header(skb, 0);
+
dev_queue_xmit(skb);
rcu_read_unlock();
return len;
@@ -1880,6 +1944,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
+ skb_probe_transport_header(skb, 0);
if (po->tp_tx_has_off) {
int off_min, off_max, off;
@@ -2289,6 +2354,8 @@ static int packet_snd(struct socket *sock,
len += vnet_hdr_len;
}
+ skb_probe_transport_header(skb, reserve);
+
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
@@ -3240,7 +3307,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
case PACKET_FANOUT:
val = (po->fanout ?
((u32)po->fanout->id |
- ((u32)po->fanout->type << 16)) :
+ ((u32)po->fanout->type << 16) |
+ ((u32)po->fanout->flags << 24)) :
0);
break;
case PACKET_TX_HAS_OFF:
diff --git a/net/packet/internal.h b/net/packet/internal.h
index e84cab8cb7a9..e891f025a1b9 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -77,10 +77,11 @@ struct packet_fanout {
unsigned int num_members;
u16 id;
u8 type;
- u8 defrag;
+ u8 flags;
atomic_t rr_cur;
struct list_head list;
struct sock *arr[PACKET_FANOUT_MAX];
+ int next[PACKET_FANOUT_MAX];
spinlock_t lock;
atomic_t sk_ref;
struct packet_type prot_hook ____cacheline_aligned_in_smp;
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index 0193630d3061..dc15f4300808 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -61,7 +61,7 @@ static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .type = NLA_U8 },
};
-static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr)
+static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
@@ -224,7 +224,7 @@ static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = {
[RTA_OIF] = { .type = NLA_U32 },
};
-static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr)
+static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[RTA_MAX+1];
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 7be790d60b90..73be187d389e 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -87,6 +87,7 @@ void rds_stats_info_copy(struct rds_info_iterator *iter,
for (i = 0; i < nr; i++) {
BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
+ ctr.name[sizeof(ctr.name) - 1] = '\0';
ctr.value = values[i];
rds_info_copy(iter, &ctr, sizeof(ctr));
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index cf68e6e4054a..9c8347451597 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1253,6 +1253,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock,
skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
if (srose != NULL) {
+ memset(srose, 0, msg->msg_namelen);
srose->srose_family = AF_ROSE;
srose->srose_addr = rose->dest_addr;
srose->srose_call = rose->dest_call;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 8579c4bb20c9..fd7072827a40 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -982,7 +982,7 @@ done:
return ret;
}
-static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_ACT_MAX + 1];
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 964f5e4f4b8a..8e118af90973 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -22,7 +22,6 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/kmod.h>
-#include <linux/netlink.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
@@ -118,7 +117,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
/* Add/change/delete/get a filter node */
-static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
@@ -141,7 +140,12 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN))
return -EPERM;
+
replay:
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
t = nlmsg_data(n);
protocol = TC_H_MIN(t->tcm_info);
prio = TC_H_MAJ(t->tcm_info);
@@ -164,10 +168,6 @@ replay:
if (dev == NULL)
return -ENODEV;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
- if (err < 0)
- return err;
-
/* Find qdisc */
if (!parent) {
q = dev->qdisc;
@@ -427,7 +427,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
const struct Qdisc_class_ops *cops;
struct tcf_dump_args arg;
- if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
dev = __dev_get_by_index(net, tcm->tcm_ifindex);
if (!dev)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c297e2a8e2a1..2b935e7cfe7b 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -971,13 +971,13 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
* Delete/get qdisc.
*/
-static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct tcmsg *tcm = nlmsg_data(n);
struct nlattr *tca[TCA_MAX + 1];
struct net_device *dev;
- u32 clid = tcm->tcm_parent;
+ u32 clid;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
int err;
@@ -985,14 +985,15 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
return -EPERM;
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
if (err < 0)
return err;
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ clid = tcm->tcm_parent;
if (clid) {
if (clid != TC_H_ROOT) {
if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
@@ -1038,7 +1039,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
* Create/change qdisc.
*/
-static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct tcmsg *tcm;
@@ -1053,6 +1054,10 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
replay:
/* Reinit, just in case something touches this. */
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
tcm = nlmsg_data(n);
clid = tcm->tcm_parent;
q = p = NULL;
@@ -1061,9 +1066,6 @@ replay:
if (!dev)
return -ENODEV;
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
- if (err < 0)
- return err;
if (clid) {
if (clid != TC_H_ROOT) {
@@ -1372,7 +1374,7 @@ done:
-static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct tcmsg *tcm = nlmsg_data(n);
@@ -1382,22 +1384,22 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
const struct Qdisc_class_ops *cops;
unsigned long cl = 0;
unsigned long new_cl;
- u32 portid = tcm->tcm_parent;
- u32 clid = tcm->tcm_handle;
- u32 qid = TC_H_MAJ(clid);
+ u32 portid;
+ u32 clid;
+ u32 qid;
int err;
if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
return -EPERM;
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
if (err < 0)
return err;
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
/*
parent == TC_H_UNSPEC - unspecified parent.
parent == TC_H_ROOT - class is root, which has no parent.
@@ -1413,6 +1415,10 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
/* Step 1. Determine qdisc handle X:0 */
+ portid = tcm->tcm_parent;
+ clid = tcm->tcm_handle;
+ qid = TC_H_MAJ(clid);
+
if (portid != TC_H_ROOT) {
u32 qid1 = TC_H_MAJ(portid);
@@ -1636,7 +1642,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
struct net_device *dev;
int t, s_t;
- if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return 0;
dev = dev_get_by_index(net, tcm->tcm_ifindex);
if (!dev)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 13aa47aa2ffb..1bc210ffcba2 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -962,8 +962,11 @@ cbq_dequeue(struct Qdisc *sch)
cbq_update(q);
if ((incr -= incr2) < 0)
incr = 0;
+ q->now += incr;
+ } else {
+ if (now > q->now)
+ q->now = now;
}
- q->now += incr;
q->now_rt = now;
for (;;) {
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 4e606fcb2534..55786283a3df 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -195,7 +195,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
flow->deficit = q->quantum;
flow->dropped = 0;
}
- if (++sch->q.qlen < sch->limit)
+ if (++sch->q.qlen <= sch->limit)
return NET_XMIT_SUCCESS;
q->drop_overlimit++;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ffad48109a22..eac7e0ee23c1 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -904,7 +904,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, u32 rate)
u64 mult;
int shift;
- r->rate_bps = rate << 3;
+ r->rate_bps = (u64)rate << 3;
r->shift = 0;
r->mult = 1;
/*
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 571f1d211f4d..79b1876b6cd2 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -981,6 +981,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
[TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) },
[TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
};
static void htb_work_func(struct work_struct *work)
@@ -994,7 +995,7 @@ static void htb_work_func(struct work_struct *work)
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
{
struct htb_sched *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_HTB_INIT + 1];
+ struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_glob *gopt;
int err;
int i;
@@ -1002,20 +1003,16 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
if (err < 0)
return err;
- if (tb[TCA_HTB_INIT] == NULL) {
- pr_err("HTB: hey probably you have bad tc tool ?\n");
+ if (!tb[TCA_HTB_INIT])
return -EINVAL;
- }
+
gopt = nla_data(tb[TCA_HTB_INIT]);
- if (gopt->version != HTB_VER >> 16) {
- pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
- HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
+ if (gopt->version != HTB_VER >> 16)
return -EINVAL;
- }
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
@@ -1027,10 +1024,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
INIT_WORK(&q->work, htb_work_func);
skb_queue_head_init(&q->direct_queue);
- q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
- if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
- q->direct_qlen = 2;
-
+ if (tb[TCA_HTB_DIRECT_QLEN])
+ q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
+ else {
+ q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
+ if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
+ q->direct_qlen = 2;
+ }
if ((q->rate2quantum = gopt->rate2quantum) < 1)
q->rate2quantum = 1;
q->defcls = gopt->defcls;
@@ -1056,7 +1056,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt))
+ if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
+ nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -1311,7 +1312,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[__TCA_HTB_MAX];
+ struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_opt *hopt;
/* extract all subattrs from opt attr */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 43cd0dd9149d..d2709e2b7be6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1079,7 +1079,7 @@ struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc,
transports) {
if (transport == active)
- break;
+ continue;
list_for_each_entry(chunk, &transport->transmitted,
transmitted_list) {
if (key == chunk->subh.data_hdr->tsn) {
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 5131fcfedb03..de1a0138317f 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2082,7 +2082,7 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net,
}
/* Delete the tempory new association. */
- sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, SCTP_ASOC(new_asoc));
sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
/* Restore association pointer to provide SCTP command interpeter
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b9070736b8d9..f631c5ff4dbf 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1119,9 +1119,10 @@ static int __sctp_connect(struct sock* sk,
/* Make sure the destination port is correctly set
* in all addresses.
*/
- if (asoc && asoc->peer.port && asoc->peer.port != port)
+ if (asoc && asoc->peer.port && asoc->peer.port != port) {
+ err = -EINVAL;
goto out_free;
-
+ }
/* Check if there already is a matching association on the
* endpoint (other than the one created here).
@@ -6185,7 +6186,8 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Is there any exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index f7d34e7b6f81..5ead60550895 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -447,17 +447,21 @@ static int rsc_parse(struct cache_detail *cd,
else {
int N, i;
+ /*
+ * NOTE: we skip uid_valid()/gid_valid() checks here:
+ * instead, * -1 id's are later mapped to the
+ * (export-specific) anonymous id by nfsd_setuser.
+ *
+ * (But supplementary gid's get no such special
+ * treatment so are checked for validity here.)
+ */
/* uid */
rsci.cred.cr_uid = make_kuid(&init_user_ns, id);
- if (!uid_valid(rsci.cred.cr_uid))
- goto out;
/* gid */
if (get_int(&mesg, &id))
goto out;
rsci.cred.cr_gid = make_kgid(&init_user_ns, id);
- if (!gid_valid(rsci.cred.cr_gid))
- goto out;
/* number of additional gid's */
if (get_int(&mesg, &N))
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 7b9b40224a27..a9129f8d7070 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1174,6 +1174,8 @@ static struct file_system_type rpc_pipe_fs_type = {
.mount = rpc_mount,
.kill_sb = rpc_kill_sb,
};
+MODULE_ALIAS_FS("rpc_pipefs");
+MODULE_ALIAS("rpc_pipefs");
static void
init_once(void *foo)
@@ -1218,6 +1220,3 @@ void unregister_rpc_pipefs(void)
kmem_cache_destroy(rpc_inode_cachep);
unregister_filesystem(&rpc_pipe_fs_type);
}
-
-/* Make 'mount -t rpc_pipefs ...' autoload this module. */
-MODULE_ALIAS("rpc_pipefs");
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fb20f25ddec9..f8529fc8e542 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -180,6 +180,8 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
task->tk_waitqueue = queue;
queue->qlen++;
+ /* barrier matches the read in rpc_wake_up_task_queue_locked() */
+ smp_wmb();
rpc_set_queued(task);
dprintk("RPC: %5u added to queue %p \"%s\"\n",
@@ -430,8 +432,11 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
*/
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
{
- if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue)
- __rpc_do_wake_up_task(queue, task);
+ if (RPC_IS_QUEUED(task)) {
+ smp_rmb();
+ if (task->tk_waitqueue == queue)
+ __rpc_do_wake_up_task(queue, task);
+ }
}
/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c1d8476b7692..3d02130828da 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -849,6 +849,14 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
xs_tcp_shutdown(xprt);
}
+static void xs_local_destroy(struct rpc_xprt *xprt)
+{
+ xs_close(xprt);
+ xs_free_peer_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+}
+
/**
* xs_destroy - prepare to shutdown a transport
* @xprt: doomed transport
@@ -862,10 +870,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&transport->connect_worker);
- xs_close(xprt);
- xs_free_peer_addresses(xprt);
- xprt_free(xprt);
- module_put(THIS_MODULE);
+ xs_local_destroy(xprt);
}
static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
@@ -2482,7 +2487,7 @@ static struct rpc_xprt_ops xs_local_ops = {
.send_request = xs_local_send_request,
.set_retrans_timeout = xprt_set_retrans_timeout_def,
.close = xs_close,
- .destroy = xs_destroy,
+ .destroy = xs_local_destroy,
.print_stats = xs_local_print_stats,
};
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 6675914dc592..8bcd4985d0fb 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -44,7 +44,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
struct nlmsghdr *rep_nlh;
struct nlmsghdr *req_nlh = info->nlhdr;
struct tipc_genlmsghdr *req_userhdr = info->userhdr;
- int hdr_space = NLMSG_SPACE(GENL_HDRLEN + TIPC_GENL_HDRLEN);
+ int hdr_space = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN);
u16 cmd;
if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN)))
@@ -53,8 +53,8 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
cmd = req_userhdr->cmd;
rep_buf = tipc_cfg_do_cmd(req_userhdr->dest, cmd,
- NLMSG_DATA(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN,
- NLMSG_PAYLOAD(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN),
+ nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN,
+ nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN),
hdr_space);
if (rep_buf) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index a9622b6cd916..515ce38e4f4c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -790,6 +790,7 @@ static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg)
if (addr) {
addr->family = AF_TIPC;
addr->addrtype = TIPC_ADDR_ID;
+ memset(&addr->addr, 0, sizeof(addr->addr));
addr->addr.id.ref = msg_origport(msg);
addr->addr.id.node = msg_orignode(msg);
addr->addr.name.domain = 0; /* could leave uninitialized */
@@ -904,6 +905,9 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock,
goto exit;
}
+ /* will be updated in set_orig_addr() if needed */
+ m->msg_namelen = 0;
+
timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
restart:
@@ -1013,6 +1017,9 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
goto exit;
}
+ /* will be updated in set_orig_addr() if needed */
+ m->msg_namelen = 0;
+
target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 51be64f163ec..5ca1631de7ef 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -382,7 +382,7 @@ static void unix_sock_destructor(struct sock *sk)
#endif
}
-static int unix_release_sock(struct sock *sk, int embrion)
+static void unix_release_sock(struct sock *sk, int embrion)
{
struct unix_sock *u = unix_sk(sk);
struct path path;
@@ -451,8 +451,6 @@ static int unix_release_sock(struct sock *sk, int embrion)
if (unix_tot_inflight)
unix_gc(); /* Garbage collect fds */
-
- return 0;
}
static void init_peercred(struct sock *sk)
@@ -699,9 +697,10 @@ static int unix_release(struct socket *sock)
if (!sk)
return 0;
+ unix_release_sock(sk, 0);
sock->sk = NULL;
- return unix_release_sock(sk, 0);
+ return 0;
}
static int unix_autobind(struct socket *sock)
@@ -1341,7 +1340,6 @@ static void unix_destruct_scm(struct sk_buff *skb)
struct scm_cookie scm;
memset(&scm, 0, sizeof(scm));
scm.pid = UNIXCB(skb).pid;
- scm.cred = UNIXCB(skb).cred;
if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);
@@ -1392,8 +1390,8 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
int err = 0;
UNIXCB(skb).pid = get_pid(scm->pid);
- if (scm->cred)
- UNIXCB(skb).cred = get_cred(scm->cred);
+ UNIXCB(skb).uid = scm->creds.uid;
+ UNIXCB(skb).gid = scm->creds.gid;
UNIXCB(skb).fp = NULL;
if (scm->fp && send_fds)
err = unix_attach_fds(scm, skb);
@@ -1410,13 +1408,13 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
const struct sock *other)
{
- if (UNIXCB(skb).cred)
+ if (UNIXCB(skb).pid)
return;
if (test_bit(SOCK_PASSCRED, &sock->flags) ||
!other->sk_socket ||
test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
UNIXCB(skb).pid = get_pid(task_tgid(current));
- UNIXCB(skb).cred = get_current_cred();
+ current_euid_egid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
}
}
@@ -1820,7 +1818,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
siocb->scm = &tmp_scm;
memset(&tmp_scm, 0, sizeof(tmp_scm));
}
- scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
+ scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
unix_set_secdata(siocb->scm, skb);
if (!(flags & MSG_PEEK)) {
@@ -1992,11 +1990,12 @@ again:
if (check_creds) {
/* Never glue messages from different writers */
if ((UNIXCB(skb).pid != siocb->scm->pid) ||
- (UNIXCB(skb).cred != siocb->scm->cred))
+ !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
+ !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
break;
- } else {
+ } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
/* Copy credentials */
- scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
+ scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
check_creds = 1;
}
@@ -2197,7 +2196,9 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ca511c4f388a..7f93e2a42d7a 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -207,7 +207,7 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
struct vsock_sock *vsk;
list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
- if (vsock_addr_equals_addr_any(addr, &vsk->local_addr))
+ if (addr->svm_port == vsk->local_addr.svm_port)
return sk_vsock(vsk);
return NULL;
@@ -220,8 +220,8 @@ static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
connected_table) {
- if (vsock_addr_equals_addr(src, &vsk->remote_addr)
- && vsock_addr_equals_addr(dst, &vsk->local_addr)) {
+ if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
+ dst->svm_port == vsk->local_addr.svm_port) {
return sk_vsock(vsk);
}
}
@@ -1670,6 +1670,8 @@ vsock_stream_recvmsg(struct kiocb *kiocb,
vsk = vsock_sk(sk);
err = 0;
+ msg->msg_namelen = 0;
+
lock_sock(sk);
if (sk->sk_state != SS_CONNECTED) {
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index a70ace83a153..daff75200e25 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -123,6 +123,14 @@ static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
return err > 0 ? -err : err;
}
+static u32 vmci_transport_peer_rid(u32 peer_cid)
+{
+ if (VMADDR_CID_HYPERVISOR == peer_cid)
+ return VMCI_TRANSPORT_HYPERVISOR_PACKET_RID;
+
+ return VMCI_TRANSPORT_PACKET_RID;
+}
+
static inline void
vmci_transport_packet_init(struct vmci_transport_packet *pkt,
struct sockaddr_vm *src,
@@ -140,7 +148,7 @@ vmci_transport_packet_init(struct vmci_transport_packet *pkt,
pkt->dg.src = vmci_make_handle(VMADDR_CID_ANY,
VMCI_TRANSPORT_PACKET_RID);
pkt->dg.dst = vmci_make_handle(dst->svm_cid,
- VMCI_TRANSPORT_PACKET_RID);
+ vmci_transport_peer_rid(dst->svm_cid));
pkt->dg.payload_size = sizeof(*pkt) - sizeof(pkt->dg);
pkt->version = VMCI_TRANSPORT_PACKET_VERSION;
pkt->type = type;
@@ -464,19 +472,16 @@ static struct sock *vmci_transport_get_pending(
struct vsock_sock *vlistener;
struct vsock_sock *vpending;
struct sock *pending;
+ struct sockaddr_vm src;
+
+ vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
vlistener = vsock_sk(listener);
list_for_each_entry(vpending, &vlistener->pending_links,
pending_links) {
- struct sockaddr_vm src;
- struct sockaddr_vm dst;
-
- vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
- vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port);
-
if (vsock_addr_equals_addr(&src, &vpending->remote_addr) &&
- vsock_addr_equals_addr(&dst, &vpending->local_addr)) {
+ pkt->dst_port == vpending->local_addr.svm_port) {
pending = sk_vsock(vpending);
sock_hold(pending);
goto found;
@@ -511,6 +516,9 @@ static bool vmci_transport_is_trusted(struct vsock_sock *vsock, u32 peer_cid)
static bool vmci_transport_allow_dgram(struct vsock_sock *vsock, u32 peer_cid)
{
+ if (VMADDR_CID_HYPERVISOR == peer_cid)
+ return true;
+
if (vsock->cached_peer != peer_cid) {
vsock->cached_peer = peer_cid;
if (!vmci_transport_is_trusted(vsock, peer_cid) &&
@@ -631,7 +639,6 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
static bool vmci_transport_stream_allow(u32 cid, u32 port)
{
static const u32 non_socket_contexts[] = {
- VMADDR_CID_HYPERVISOR,
VMADDR_CID_RESERVED,
};
int i;
@@ -670,7 +677,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
*/
if (!vmci_transport_stream_allow(dg->src.context, -1)
- || VMCI_TRANSPORT_PACKET_RID != dg->src.resource)
+ || vmci_transport_peer_rid(dg->src.context) != dg->src.resource)
return VMCI_ERROR_NO_ACCESS;
if (VMCI_DG_SIZE(dg) < sizeof(*pkt))
@@ -739,10 +746,15 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
*/
bh_lock_sock(sk);
- if (!sock_owned_by_user(sk) && sk->sk_state == SS_CONNECTED)
- vmci_trans(vsk)->notify_ops->handle_notify_pkt(
- sk, pkt, true, &dst, &src,
- &bh_process_pkt);
+ if (!sock_owned_by_user(sk)) {
+ /* The local context ID may be out of date, update it. */
+ vsk->local_addr.svm_cid = dst.svm_cid;
+
+ if (sk->sk_state == SS_CONNECTED)
+ vmci_trans(vsk)->notify_ops->handle_notify_pkt(
+ sk, pkt, true, &dst, &src,
+ &bh_process_pkt);
+ }
bh_unlock_sock(sk);
@@ -902,6 +914,9 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work)
lock_sock(sk);
+ /* The local context ID may be out of date. */
+ vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context;
+
switch (sk->sk_state) {
case SS_LISTEN:
vmci_transport_recv_listen(sk, pkt);
@@ -958,6 +973,10 @@ static int vmci_transport_recv_listen(struct sock *sk,
pending = vmci_transport_get_pending(sk, pkt);
if (pending) {
lock_sock(pending);
+
+ /* The local context ID may be out of date. */
+ vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context;
+
switch (pending->sk_state) {
case SS_CONNECTING:
err = vmci_transport_recv_connecting_server(sk,
@@ -1727,6 +1746,8 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb,
if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
return -EOPNOTSUPP;
+ msg->msg_namelen = 0;
+
/* Retrieve the head sk_buff from the socket's receive queue. */
err = 0;
skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err);
@@ -1759,7 +1780,6 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb,
if (err)
goto out;
- msg->msg_namelen = 0;
if (msg->msg_name) {
struct sockaddr_vm *vm_addr;
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index 1bf991803ec0..fd88ea8924e4 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -28,6 +28,9 @@
/* The resource ID on which control packets are sent. */
#define VMCI_TRANSPORT_PACKET_RID 1
+/* The resource ID on which control packets are sent to the hypervisor. */
+#define VMCI_TRANSPORT_HYPERVISOR_PACKET_RID 15
+
#define VSOCK_PROTO_INVALID 0
#define VSOCK_PROTO_PKT_ON_NOTIFY (1 << 0)
#define VSOCK_PROTO_ALL_SUPPORTED (VSOCK_PROTO_PKT_ON_NOTIFY)
diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
index b7df1aea7c59..ec2611b4ea0e 100644
--- a/net/vmw_vsock/vsock_addr.c
+++ b/net/vmw_vsock/vsock_addr.c
@@ -64,16 +64,6 @@ bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
}
EXPORT_SYMBOL_GPL(vsock_addr_equals_addr);
-bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
- const struct sockaddr_vm *other)
-{
- return (addr->svm_cid == VMADDR_CID_ANY ||
- other->svm_cid == VMADDR_CID_ANY ||
- addr->svm_cid == other->svm_cid) &&
- addr->svm_port == other->svm_port;
-}
-EXPORT_SYMBOL_GPL(vsock_addr_equals_addr_any);
-
int vsock_addr_cast(const struct sockaddr *addr,
size_t len, struct sockaddr_vm **out_addr)
{
diff --git a/net/vmw_vsock/vsock_addr.h b/net/vmw_vsock/vsock_addr.h
index cdfbcefdf843..9ccd5316eac0 100644
--- a/net/vmw_vsock/vsock_addr.h
+++ b/net/vmw_vsock/vsock_addr.h
@@ -24,8 +24,6 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr);
void vsock_addr_unbind(struct sockaddr_vm *addr);
bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
const struct sockaddr_vm *other);
-bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr,
- const struct sockaddr_vm *other);
int vsock_addr_cast(const struct sockaddr *addr, size_t len,
struct sockaddr_vm **out_addr);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 37a56ee1e1ed..6cbac99ae03d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -511,7 +511,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
encaps_data = bridge_tunnel_header;
encaps_len = sizeof(bridge_tunnel_header);
skip_header_bytes -= 2;
- } else if (ethertype > 0x600) {
+ } else if (ethertype >= ETH_P_802_3_MIN) {
encaps_data = rfc1042_header;
encaps_len = sizeof(rfc1042_header);
skip_header_bytes -= 2;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 167c67d46c6a..23cea0f74336 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1037,6 +1037,24 @@ __xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir
return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
}
+static int flow_to_policy_dir(int dir)
+{
+ if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ return dir;
+
+ switch (dir) {
+ default:
+ case FLOW_DIR_IN:
+ return XFRM_POLICY_IN;
+ case FLOW_DIR_OUT:
+ return XFRM_POLICY_OUT;
+ case FLOW_DIR_FWD:
+ return XFRM_POLICY_FWD;
+ }
+}
+
static struct flow_cache_object *
xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
u8 dir, struct flow_cache_object *old_obj, void *ctx)
@@ -1046,7 +1064,7 @@ xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
if (old_obj)
xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
- pol = __xfrm_policy_lookup(net, fl, family, dir);
+ pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
if (IS_ERR_OR_NULL(pol))
return ERR_CAST(pol);
@@ -1932,7 +1950,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
* previous cache entry */
if (xdst == NULL) {
num_pols = 1;
- pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+ pols[0] = __xfrm_policy_lookup(net, fl, family,
+ flow_to_policy_dir(dir));
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 35754cc8a9e5..8dafe6d3c6e4 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -334,6 +334,70 @@ static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event)
x->xflags &= ~XFRM_TIME_DEFER;
}
+static void xfrm_replay_notify_esn(struct xfrm_state *x, int event)
+{
+ u32 seq_diff, oseq_diff;
+ struct km_event c;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn;
+
+ /* we send notify messages in case
+ * 1. we updated on of the sequence numbers, and the seqno difference
+ * is at least x->replay_maxdiff, in this case we also update the
+ * timeout of our timer function
+ * 2. if x->replay_maxage has elapsed since last update,
+ * and there were changes
+ *
+ * The state structure must be locked!
+ */
+
+ switch (event) {
+ case XFRM_REPLAY_UPDATE:
+ if (!x->replay_maxdiff)
+ break;
+
+ if (replay_esn->seq_hi == preplay_esn->seq_hi)
+ seq_diff = replay_esn->seq - preplay_esn->seq;
+ else
+ seq_diff = ~preplay_esn->seq + replay_esn->seq + 1;
+
+ if (replay_esn->oseq_hi == preplay_esn->oseq_hi)
+ oseq_diff = replay_esn->oseq - preplay_esn->oseq;
+ else
+ oseq_diff = ~preplay_esn->oseq + replay_esn->oseq + 1;
+
+ if (seq_diff < x->replay_maxdiff &&
+ oseq_diff < x->replay_maxdiff) {
+
+ if (x->xflags & XFRM_TIME_DEFER)
+ event = XFRM_REPLAY_TIMEOUT;
+ else
+ return;
+ }
+
+ break;
+
+ case XFRM_REPLAY_TIMEOUT:
+ if (memcmp(x->replay_esn, x->preplay_esn,
+ xfrm_replay_state_esn_len(replay_esn)) == 0) {
+ x->xflags |= XFRM_TIME_DEFER;
+ return;
+ }
+
+ break;
+ }
+
+ memcpy(x->preplay_esn, x->replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ c.event = XFRM_MSG_NEWAE;
+ c.data.aevent = event;
+ km_state_notify(x, &c);
+
+ if (x->replay_maxage &&
+ !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+ x->xflags &= ~XFRM_TIME_DEFER;
+}
+
static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb)
{
int err = 0;
@@ -510,7 +574,7 @@ static struct xfrm_replay xfrm_replay_esn = {
.advance = xfrm_replay_advance_esn,
.check = xfrm_replay_check_esn,
.recheck = xfrm_replay_recheck_esn,
- .notify = xfrm_replay_notify_bmp,
+ .notify = xfrm_replay_notify_esn,
.overflow = xfrm_replay_overflow_esn,
};